In [38]:
import itertools
import random
import shutil

import pandas as pd
import mlflow
import mlflow.sklearn
from mlflow import MlflowClient
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Use a local, file-based tracking store located relative to the working directory
mlflow.set_tracking_uri("file:./mlruns")

# Create / select the MLflow experiment that the runs of this notebook are logged to
experiment_name = "income_prediction"
experiment = mlflow.set_experiment(experiment_name)

# Print the experiment details
print(f"Experiment Name: {experiment_name}")
print(f"Experiment ID: {experiment.experiment_id}")

Experiment Name: income_prediction
Experiment ID: 939509627903992053


In [3]:
# Load the preprocessed dataset (path is relative to the notebook's working directory)
data = pd.read_csv('../data/processed/processed_data.csv')

In [4]:
# Display the loaded frame as the cell output
data

Unnamed: 0,age,workclass,educational-num,marital-status,occupation,relationship,hours-per-week,is_Male,income >50K,is_White,from_USA,gained-capital
0,25,Private,7,Never-married,Simple Services,Child,40,1,0,0,1,0
1,38,Private,9,Married,Professional,Husband,50,1,0,1,1,0
2,28,Government,12,Married,Public Safety,Husband,40,1,1,1,1,0
3,44,Private,10,Married,Simple Services,Husband,40,1,1,0,1,1
4,34,Private,6,Never-married,Simple Services,Shared Housing,30,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
45217,27,Private,12,Married,Specialized Services,Wife,38,0,0,1,1,0
45218,40,Private,9,Married,Simple Services,Husband,40,1,1,1,1,0
45219,58,Private,9,Widowed/Separated,Administrative,Single,40,0,0,1,1,0
45220,22,Private,9,Never-married,Administrative,Child,20,1,0,1,1,0


In [5]:
# Separate the label column from the remaining columns
target_column = 'income >50K'
y = data[target_column]
X = data.drop(columns=[target_column])

In [6]:
# Identify the numeric and categorical columns
num_features = ['age', 'educational-num', 'hours-per-week']
cat_features = ['workclass', 'marital-status', 'occupation', 'relationship']

# Transformation for the numeric columns (scaling)
num_transformer = StandardScaler()

# Transformation for the categorical columns (one-hot encoding);
# handle_unknown='ignore' is set so categories unseen during fit do not raise at predict time
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Preprocessing pipeline
# NOTE(review): only the columns listed in num_features / cat_features are routed here.
# With scikit-learn's default remainder='drop', the other columns shown in the data
# preview (is_Male, is_White, from_USA, gained-capital) would not reach the model —
# confirm that excluding them is intended.
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features)
])


In [25]:
# **Define the hyperparameter sets**
# Each grid maps a constructor argument name to the candidate values to try.
param_grid_rf = {
    "n_estimators": [100, 200],
    "max_depth": [5, 10],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 3]
}

# Cartesian product of all value lists; every tuple follows the key order of the grid dict
param_combinations_rf = list(itertools.product(*param_grid_rf.values()))

# Logistic regression grid; it also contains solver/penalty pairs that are
# filtered out afterwards by is_valid_lr_params
param_grid_lr = {
    "penalty": ["l1", "l2", "elasticnet", None],
    "C": [0.01, 0.1, 1, 10, 100],
    "solver": ["liblinear", "lbfgs", "saga"],
    "max_iter": [100, 200, 500]
}

param_combinations_lr = list(itertools.product(*param_grid_lr.values()))

# Gradient boosting grid
param_grid_gb = {
    "n_estimators": [100, 200],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth": [3, 5, 10],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 3]
}

param_combinations_gb = list(itertools.product(*param_grid_gb.values()))


def is_valid_lr_params(params):
    """Tell whether a LogisticRegression hyperparameter tuple is usable.

    Args:
        params: 4-tuple ``(penalty, C, solver, max_iter)`` in the key order of
            ``param_grid_lr``. Only ``penalty`` and ``solver`` are inspected.

    Returns:
        ``True`` if the solver is known, supports the penalty, and (for
        ``"elasticnet"``) an ``l1_ratio`` entry exists in ``param_grid_lr``;
        ``False`` otherwise.
    """
    penalty, _c, solver, _max_iter = params

    # Penalties each solver accepts; unknown solvers yield None
    supported_penalties = {
        "lbfgs": ("l2", None),
        "liblinear": ("l1", "l2"),
        "saga": ("l1", "l2", "elasticnet", None),
    }.get(solver)

    # Reject unknown solvers and solver/penalty pairs that do not go together
    if supported_penalties is None or penalty not in supported_penalties:
        return False

    # Everything except elasticnet is fine at this point
    if penalty != "elasticnet":
        return True

    # elasticnet additionally needs an explicit `l1_ratio` in the grid
    return "l1_ratio" in param_grid_lr



# **Keep only the solver/penalty-compatible combinations**
param_combinations_lr = list(filter(is_valid_lr_params, param_combinations_lr))


def select_diverse_combinations(param_combinations, max_combinations=20, seed=None):
    """Pick at most ``max_combinations`` distinct hyperparameter combinations.

    Args:
        param_combinations: List of hashable combinations (e.g. tuples).
        max_combinations: Upper bound on the number of returned combinations.
        seed: Optional seed for a private random generator. With ``None`` the
            module-level ``random`` state is used, so a preceding
            ``random.seed(...)`` call still controls the outcome.

    Returns:
        ``param_combinations`` itself when it already fits the limit;
        otherwise a new list of unique combinations drawn uniformly without
        replacement.
    """
    if len(param_combinations) <= max_combinations:
        return param_combinations  # already small enough, return unchanged

    if max_combinations <= 0:
        return []

    # Drop duplicates while keeping first-seen order. The former
    # "draw until the set is full" loop never terminated when fewer distinct
    # combinations than max_combinations existed.
    unique_combinations = list(dict.fromkeys(param_combinations))
    if len(unique_combinations) <= max_combinations:
        return unique_combinations

    # Sampling without replacement guarantees uniqueness and, for a given RNG
    # state, a deterministic result order (unlike iterating over a set).
    rng = random if seed is None else random.Random(seed)
    return rng.sample(unique_combinations, max_combinations)


# **At most 20 combinations per model**
# Seed the module-level RNG first so the same combinations are sampled on every
# "Restart & Run All"; without it each execution trains a different subset.
random.seed(42)
param_combinations_rf = select_diverse_combinations(param_combinations_rf, 20)
param_combinations_lr = select_diverse_combinations(param_combinations_lr, 20)
param_combinations_gb = select_diverse_combinations(param_combinations_gb, 20)

In [22]:
# Number of logistic-regression combinations currently selected
len(param_combinations_lr)

20

In [26]:
# Hold out 20 % of the rows as a test set; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models to iterate over, each as a tuple of
# (name used for logging, estimator class, parameter combinations, grid supplying the parameter names)
model_configs = [
    ("random_forest", RandomForestClassifier, param_combinations_rf, param_grid_rf),
    ("logistic_regression", LogisticRegression, param_combinations_lr, param_grid_lr),
    ("gradient_boosting", GradientBoostingClassifier, param_combinations_gb, param_grid_gb)
]

In [None]:
for model_type, model_class, param_combinations, param_grid in model_configs:
    # Probe once per estimator class (instead of once per run) whether it
    # accepts a random_state, and resolve the parameter names once as well.
    supports_random_state = "random_state" in model_class().get_params()
    param_names = list(param_grid.keys())

    for params in param_combinations:
        # Map the positional combination back to named constructor arguments
        current_params = dict(zip(param_names, params))

        # One MLflow run per (model, parameter combination)
        with mlflow.start_run():
            print(f"Training {model_type} mit Parametern: {current_params}")

            # Instantiate the model, pinning the seed where the estimator supports it
            if supports_random_state:
                model = model_class(**current_params, random_state=42)
            else:
                model = model_class(**current_params)

            # Pipeline consisting of preprocessing and the model
            pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('classifier', model)
            ])

            # Train the model
            pipeline.fit(X_train, y_train)

            # Predict and compute the accuracy.
            # NOTE(review): this score is computed on X_test and is later used to
            # pick the best run, so the reported accuracy of the winner is likely
            # optimistic — consider selecting on a separate validation split.
            y_pred = pipeline.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)

            # Log the parameters and the model type
            mlflow.log_params(current_params)
            mlflow.log_param("model_type", model_type)

            # Log the metric
            mlflow.log_metric("accuracy", accuracy)

            # Store the fitted pipeline as a run artifact
            mlflow.sklearn.log_model(pipeline, f"{model_type}_pipeline")

In [None]:
# Fetch every run of the experiment; the file-based store is persistent, so
# this also includes runs logged by earlier executions of the notebook.
runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

# Without any logged accuracy the metric column is absent; fail with a clear message
if "metrics.accuracy" not in runs.columns:
    raise ValueError("No run with a logged 'accuracy' metric found - run the training cell first.")

# Only completed runs are candidates: a run that failed after logging its
# accuracy (e.g. while storing the model) has no loadable model artifact.
finished_runs = runs[(runs["status"] == "FINISHED") & runs["metrics.accuracy"].notna()]
if finished_runs.empty:
    raise ValueError("No finished run with a logged 'accuracy' metric found.")

# Row (as a Series) of the run with the highest accuracy
best_run = finished_runs.loc[finished_runs["metrics.accuracy"].idxmax()]

model_type = best_run["params.model_type"]

In [None]:
# Summarize the selected run: its id, the model type and the achieved accuracy
print(f"Bester Run: {best_run['run_id']}")
print(f"Model Type: {best_run['params.model_type']}")
print(f"Accuracy: {best_run['metrics.accuracy']}")

In [53]:
# URI of the pipeline artifact of the best run; the "<model_type>_pipeline" suffix
# mirrors the artifact name passed to mlflow.sklearn.log_model in the training loop
model_uri = f"runs:/{best_run['run_id']}/{best_run['params.model_type']}_pipeline"

In [54]:
# Export the best pipeline to a local directory that can be served afterwards.
# save_model refuses to write into an existing, non-empty directory, so a
# previous export is removed first to keep this cell re-runnable. If the
# removal fails, save_model still reports the conflict.
shutil.rmtree("best_model", ignore_errors=True)
mlflow.sklearn.save_model(
    sk_model=mlflow.sklearn.load_model(model_uri),
    path="best_model"
)

### Model serven mit:

```bash
mlflow models serve -m best_model --port 5000
```