In [14]:
import itertools
import random
import shutil

import pandas as pd
import mlflow
import mlflow.sklearn
from mlflow import MlflowClient
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [15]:
# Name of the MLflow experiment that collects every run of this notebook
experiment_name = "income_prediction"

# Track runs in a local file store (./mlruns, relative to the working directory)
mlflow.set_tracking_uri("file:./mlruns")

# Activate the experiment and keep the returned handle for later run queries
experiment = mlflow.set_experiment(experiment_name)

# Print the experiment details
print(f"Experiment Name: {experiment_name}")
print(f"Experiment ID: {experiment.experiment_id}")

2025/03/06 10:00:11 INFO mlflow.tracking.fluent: Experiment with name 'income_prediction' does not exist. Creating a new experiment.


Experiment Name: income_prediction
Experiment ID: 277431885630747115


In [16]:
# TODO (original author's note): needs to be moved to /mlflow/data/processed/
# Load the preprocessed dataset; the relative path is resolved against the
# notebook's current working directory.
data = pd.read_csv('../mlflow/data/processed/processed_data.csv')

In [18]:
# Target is the 'income >50K' column; the features are every remaining column
y = data['income >50K']
X = data.drop(labels=['income >50K'], axis=1)

In [19]:
# Columns used by the model, grouped by the preprocessing they receive
num_features = ['age', 'educational-num', 'hours-per-week']
cat_features = ['workclass', 'marital-status', 'occupation', 'relationship']

# Numeric columns are standardised; categorical columns are one-hot encoded,
# with categories not seen during fit ignored at transform time
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Combined preprocessing step applied in front of every classifier.
# NOTE(review): columns of X that appear in neither list are dropped
# (ColumnTransformer's default is remainder='drop') — confirm this is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features),
    ]
)


In [20]:
# **Hyperparameter search spaces, one grid per model family**
# Key order matters: the tuples below follow the insertion order of each grid.
param_grid_rf = {
    "n_estimators": [100, 200],
    "max_depth": [5, 10],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 3],
}

param_grid_lr = {
    "penalty": ["l1", "l2", "elasticnet", None],
    "C": [0.01, 0.1, 1, 10, 100],
    "solver": ["liblinear", "lbfgs", "saga"],
    "max_iter": [100, 200, 500],
}

param_grid_gb = {
    "n_estimators": [100, 200],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth": [3, 5, 10],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 3],
}

# Full Cartesian product of each grid, as a list of value tuples
param_combinations_rf = [combo for combo in itertools.product(*param_grid_rf.values())]
param_combinations_lr = [combo for combo in itertools.product(*param_grid_lr.values())]
param_combinations_gb = [combo for combo in itertools.product(*param_grid_gb.values())]


def is_valid_lr_params(params):
    """Return True if a LogisticRegression parameter tuple is a usable combination.

    Args:
        params: 4-tuple ``(penalty, C, solver, max_iter)``, in the key order of
            ``param_grid_lr``.

    Returns:
        False when the solver is unknown, when the solver does not support the
        penalty, or when the penalty is "elasticnet" while ``param_grid_lr``
        defines no ``l1_ratio``; True otherwise.
    """
    penalty, _c, solver, _max_iter = params

    # Penalties each solver accepts
    penalties_by_solver = {
        "lbfgs": ("l2", None),
        "liblinear": ("l1", "l2"),
        "saga": ("l1", "l2", "elasticnet", None),
    }

    supported = penalties_by_solver.get(solver)
    if supported is None or penalty not in supported:
        return False

    # elasticnet is only meaningful when the grid also supplies an l1_ratio
    missing_l1_ratio = penalty == "elasticnet" and "l1_ratio" not in param_grid_lr
    return not missing_l1_ratio



# **Keep only the compatible solver/penalty combinations**
param_combinations_lr = list(filter(is_valid_lr_params, param_combinations_lr))


def select_diverse_combinations(param_combinations, max_combinations=20, seed=None):
    """Pick at most ``max_combinations`` distinct hyperparameter combinations.

    The selection is a uniform random sample without replacement; no further
    diversity criterion is applied.

    Args:
        param_combinations: List of hashable parameter tuples.
        max_combinations: Upper bound on the number of combinations returned.
        seed: Optional seed for a private random generator, which makes the
            selection reproducible. With ``None`` the module-level ``random``
            state is used, so ``random.seed(...)`` still controls the result.

    Returns:
        ``param_combinations`` itself when it already has at most
        ``max_combinations`` entries; otherwise a new list of distinct
        combinations with at most ``max_combinations`` entries.
    """
    if len(param_combinations) <= max_combinations:
        return param_combinations  # already small enough, return unchanged

    if max_combinations <= 0:
        return []

    # Drop duplicates (keeping first-seen order) so that the sample is distinct
    # and so that an input with too few distinct entries cannot stall the
    # selection, as the previous rejection-sampling loop did.
    unique_combinations = list(dict.fromkeys(param_combinations))
    if len(unique_combinations) <= max_combinations:
        return unique_combinations

    rng = random if seed is None else random.Random(seed)
    return rng.sample(unique_combinations, max_combinations)


# **At most 20 combinations per model**
# Seed the global RNG right before sampling so that a fresh kernel (or a
# re-run of this cell) selects the same hyperparameter subsets.
random.seed(42)
param_combinations_rf = select_diverse_combinations(param_combinations_rf, 20)
param_combinations_lr = select_diverse_combinations(param_combinations_lr, 20)
param_combinations_gb = select_diverse_combinations(param_combinations_gb, 20)

In [21]:
# Sanity check: number of LogisticRegression combinations left after filtering and sampling
len(param_combinations_lr)

20

In [22]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# One entry per model family:
# (run label, estimator class, parameter tuples to try, grid supplying the parameter names)
model_configs = [
    (
        "random_forest",
        RandomForestClassifier,
        param_combinations_rf,
        param_grid_rf,
    ),
    (
        "logistic_regression",
        LogisticRegression,
        param_combinations_lr,
        param_grid_lr,
    ),
    (
        "gradient_boosting",
        GradientBoostingClassifier,
        param_combinations_gb,
        param_grid_gb,
    ),
]

In [23]:
# Train one pipeline per (model family, parameter combination) and record each
# as its own MLflow run: parameters, test accuracy and the fitted pipeline.
for model_type, model_class, param_combinations, param_grid in model_configs:
    # Whether the estimator accepts a seed depends only on its class, so probe
    # it once per model family instead of building a throw-away estimator for
    # every run.
    accepts_random_state = "random_state" in model_class().get_params()
    param_names = list(param_grid.keys())

    for params in param_combinations:
        # Map the value tuple back onto the parameter names of the grid
        current_params = dict(zip(param_names, params))

        # Constructor arguments: the searched parameters plus a fixed seed
        # where the estimator supports one
        model_kwargs = dict(current_params)
        if accepts_random_state:
            model_kwargs["random_state"] = 42

        with mlflow.start_run():
            print(f"Training {model_type} mit Parametern: {current_params}")

            # Log the configuration before training so that a run whose fit
            # fails still shows which parameters were attempted
            mlflow.log_params(current_params)
            mlflow.log_param("model_type", model_type)

            # Initialise the model
            model = model_class(**model_kwargs)

            # Pipeline: shared preprocessing followed by the classifier
            pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('classifier', model)
            ])

            # Train the model
            pipeline.fit(X_train, y_train)

            # Predict on the hold-out set and compute accuracy
            y_pred = pipeline.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)

            # Log the metric
            mlflow.log_metric("accuracy", accuracy)

            # Store the fitted pipeline under "<model_type>_pipeline"
            mlflow.sklearn.log_model(pipeline, f"{model_type}_pipeline")

Training random_forest mit Parametern: {'n_estimators': 100, 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 1}




Training random_forest mit Parametern: {'n_estimators': 100, 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 3}




Training random_forest mit Parametern: {'n_estimators': 100, 'max_depth': 5, 'min_samples_split': 5, 'min_samples_leaf': 1}




Training random_forest mit Parametern: {'n_estimators': 100, 'max_depth': 5, 'min_samples_split': 5, 'min_samples_leaf': 3}




Training random_forest mit Parametern: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1}




Training random_forest mit Parametern: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 3}




Training random_forest mit Parametern: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 1}




Training random_forest mit Parametern: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}




Training random_forest mit Parametern: {'n_estimators': 200, 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 1}




Training random_forest mit Parametern: {'n_estimators': 200, 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 3}




Training random_forest mit Parametern: {'n_estimators': 200, 'max_depth': 5, 'min_samples_split': 5, 'min_samples_leaf': 1}




Training random_forest mit Parametern: {'n_estimators': 200, 'max_depth': 5, 'min_samples_split': 5, 'min_samples_leaf': 3}




Training random_forest mit Parametern: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1}




Training random_forest mit Parametern: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 3}




Training random_forest mit Parametern: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 1}




Training random_forest mit Parametern: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}




Training logistic_regression mit Parametern: {'penalty': 'l2', 'C': 1, 'solver': 'liblinear', 'max_iter': 200}




Training logistic_regression mit Parametern: {'penalty': None, 'C': 0.1, 'solver': 'lbfgs', 'max_iter': 100}




Training logistic_regression mit Parametern: {'penalty': None, 'C': 100, 'solver': 'saga', 'max_iter': 200}




Training logistic_regression mit Parametern: {'penalty': 'l2', 'C': 100, 'solver': 'liblinear', 'max_iter': 200}




Training logistic_regression mit Parametern: {'penalty': None, 'C': 0.1, 'solver': 'saga', 'max_iter': 200}




Training logistic_regression mit Parametern: {'penalty': None, 'C': 0.01, 'solver': 'lbfgs', 'max_iter': 100}




Training logistic_regression mit Parametern: {'penalty': 'l1', 'C': 0.01, 'solver': 'saga', 'max_iter': 200}




Training logistic_regression mit Parametern: {'penalty': None, 'C': 100, 'solver': 'lbfgs', 'max_iter': 500}




Training logistic_regression mit Parametern: {'penalty': 'l2', 'C': 1, 'solver': 'lbfgs', 'max_iter': 500}




Training logistic_regression mit Parametern: {'penalty': None, 'C': 0.01, 'solver': 'saga', 'max_iter': 200}




Training logistic_regression mit Parametern: {'penalty': 'l2', 'C': 0.01, 'solver': 'saga', 'max_iter': 200}




Training logistic_regression mit Parametern: {'penalty': 'l2', 'C': 1, 'solver': 'liblinear', 'max_iter': 500}




Training logistic_regression mit Parametern: {'penalty': 'l2', 'C': 0.1, 'solver': 'lbfgs', 'max_iter': 100}




Training logistic_regression mit Parametern: {'penalty': 'l1', 'C': 0.1, 'solver': 'liblinear', 'max_iter': 200}




Training logistic_regression mit Parametern: {'penalty': 'l2', 'C': 0.1, 'solver': 'liblinear', 'max_iter': 500}




Training logistic_regression mit Parametern: {'penalty': None, 'C': 100, 'solver': 'saga', 'max_iter': 100}




Training logistic_regression mit Parametern: {'penalty': 'l2', 'C': 100, 'solver': 'liblinear', 'max_iter': 100}




Training logistic_regression mit Parametern: {'penalty': 'l2', 'C': 0.1, 'solver': 'liblinear', 'max_iter': 100}




Training logistic_regression mit Parametern: {'penalty': None, 'C': 1, 'solver': 'saga', 'max_iter': 100}




Training logistic_regression mit Parametern: {'penalty': 'l1', 'C': 0.01, 'solver': 'liblinear', 'max_iter': 500}




Training gradient_boosting mit Parametern: {'n_estimators': 200, 'learning_rate': 0.2, 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 3}




Training gradient_boosting mit Parametern: {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 3}




Training gradient_boosting mit Parametern: {'n_estimators': 100, 'learning_rate': 0.2, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 1}




Training gradient_boosting mit Parametern: {'n_estimators': 200, 'learning_rate': 0.01, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 3}




Training gradient_boosting mit Parametern: {'n_estimators': 100, 'learning_rate': 0.01, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 3}




Training gradient_boosting mit Parametern: {'n_estimators': 100, 'learning_rate': 0.2, 'max_depth': 3, 'min_samples_split': 5, 'min_samples_leaf': 3}




Training gradient_boosting mit Parametern: {'n_estimators': 200, 'learning_rate': 0.2, 'max_depth': 5, 'min_samples_split': 5, 'min_samples_leaf': 1}




Training gradient_boosting mit Parametern: {'n_estimators': 200, 'learning_rate': 0.1, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1}




Training gradient_boosting mit Parametern: {'n_estimators': 200, 'learning_rate': 0.01, 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 3}




Training gradient_boosting mit Parametern: {'n_estimators': 100, 'learning_rate': 0.2, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1}




Training gradient_boosting mit Parametern: {'n_estimators': 200, 'learning_rate': 0.2, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1}




Training gradient_boosting mit Parametern: {'n_estimators': 100, 'learning_rate': 0.2, 'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 3}




Training gradient_boosting mit Parametern: {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}




Training gradient_boosting mit Parametern: {'n_estimators': 200, 'learning_rate': 0.1, 'max_depth': 3, 'min_samples_split': 5, 'min_samples_leaf': 3}




Training gradient_boosting mit Parametern: {'n_estimators': 200, 'learning_rate': 0.1, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 3}




Training gradient_boosting mit Parametern: {'n_estimators': 100, 'learning_rate': 0.01, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}




Training gradient_boosting mit Parametern: {'n_estimators': 200, 'learning_rate': 0.01, 'max_depth': 3, 'min_samples_split': 5, 'min_samples_leaf': 3}




Training gradient_boosting mit Parametern: {'n_estimators': 100, 'learning_rate': 0.01, 'max_depth': 3, 'min_samples_split': 5, 'min_samples_leaf': 3}




Training gradient_boosting mit Parametern: {'n_estimators': 200, 'learning_rate': 0.01, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1}




Training gradient_boosting mit Parametern: {'n_estimators': 100, 'learning_rate': 0.01, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1}




In [24]:
# All runs recorded for this experiment, one DataFrame row per run
runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

# Row label of the highest logged accuracy, then the matching run record
best_index = runs["metrics.accuracy"].idxmax()
best_run = runs.loc[best_index]

# Model family of the winning run (note: reuses the training loop's variable name)
model_type = best_run["params.model_type"]

In [25]:
# Summarise the winning run: run id, model family and test accuracy
print(
    f"Bester Run: {best_run['run_id']}",
    f"Model Type: {best_run['params.model_type']}",
    f"Accuracy: {best_run['metrics.accuracy']}",
    sep="\n",
)

Bester Run: 404a9b1ebfd445e89390327f16266858
Model Type: gradient_boosting
Accuracy: 0.8391376451077943


In [26]:
# MLflow "runs:/<run_id>/<artifact_path>" URI of the best run's logged pipeline;
# the artifact path mirrors the "<model_type>_pipeline" name used when logging.
model_uri = f"runs:/{best_run['run_id']}/{best_run['params.model_type']}_pipeline"

In [None]:
# Export the best pipeline to the local "best_model" directory for serving.
# Load first, so that a failed load never destroys a previous export.
best_pipeline = mlflow.sklearn.load_model(model_uri)

# mlflow.sklearn.save_model refuses to write into an existing, non-empty
# directory; remove a stale export so this cell can be re-run.
shutil.rmtree("best_model", ignore_errors=True)

mlflow.sklearn.save_model(
    sk_model=best_pipeline,
    path="best_model",
)

### Model serven mit:

mlflow models serve -m best_model --port 5000 --env-manager local

In [39]:
# Two hand-written records using the column names of the training data
test_cases = [
    {
        "age": 28,
        "workclass": "Government",
        "educational-num": 12,
        "marital-status": "Married",
        "occupation": "Public Safety",
        "relationship": "Husband",
        "hours-per-week": 40,
        "is_Male": 1,
        "is_White": 1,
        "from_USA": 1,
        "gained-capital": 0,
    },
    {
        "age": 45,
        "workclass": "Private",
        "educational-num": 16,
        "marital-status": "Single",
        "occupation": "Professional",
        "relationship": "Not-in-family",
        "hours-per-week": 50,
        "is_Male": 0,
        "is_White": 1,
        "from_USA": 1,
        "gained-capital": 1,
    },
]
test_df = pd.DataFrame(data=test_cases)

# Load the best run's pipeline through the generic pyfunc interface and score the records
model = mlflow.pyfunc.load_model(model_uri)
predictions = model.predict(test_df)


In [40]:
# Display the predictions returned for the two hand-written test cases
predictions

array([0, 1], dtype=int64)