In [12]:
import pandas as pd
import numpy as np

# Display options: lift every pandas truncation limit so frames are shown in full
for _display_option in (
    'display.max_rows',      # show all rows
    'display.max_columns',   # show all columns
    'display.width',         # no limit on the line width
    'display.max_colwidth',  # no limit on the column width
):
    pd.set_option(_display_option, None)


## Fitting Test XGB RF LM

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
# Train/test split
# Load the preprocessed data set (df_preprocessed.csv)
df_extended = pd.read_csv('data/df_preprocessed.csv')

# 1) Separate the target from the features
# NOTE(review): only 'match' is dropped, so 'pair_index' (the grouping key used
# below) remains in X as a regular feature -- confirm this is intended.
y = df_extended['match']
X = df_extended.drop(columns=['match'])

# 2) Group-aware split: all rows sharing a pair_index land on the same side
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=df_extended['pair_index']))

# 3) Materialise the train and test partitions
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

- pf_o: Preference of Partner
- attr1_1: Preference of Subject

- attr_o: Rating of Partner
- attr: Rating Partner

- attr2_1: Perceived Preference Partner
- attr3_1: Perceived Ratings



In [17]:
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline       import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from category_encoders import CountEncoder
from sklearn.svm import SVC
from xgboost                import XGBClassifier
from sklearn.linear_model   import LogisticRegression
from sklearn.ensemble       import RandomForestClassifier
import joblib
from sklearn.metrics import (
    accuracy_score, confusion_matrix, precision_score, recall_score,
    f1_score, roc_auc_score, classification_report
)



# Factory for the model-specific preprocessing step
def make_preprocessor(model_name, numeric_features, categorical_features):
    """Build the ColumnTransformer used in front of the given model.

    Numeric columns are routed through the module-level ``numeric_transformer``.
    Categorical columns are imputed with the most frequent value and then
    encoded with the encoder registered for ``model_name`` in the module-level
    ``encoding_strategies`` dict.

    Parameters
    ----------
    model_name : str
        Key into ``encoding_strategies``.
    numeric_features : list
        Columns handed to the numeric branch.
    categorical_features : list
        Columns handed to the categorical branch.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer with a "num" and a "cat" branch.

    Raises
    ------
    KeyError
        If ``model_name`` has no entry in ``encoding_strategies``.
    """
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", encoding_strategies[model_name]),
    ]
    column_branches = [
        ("num", numeric_transformer, numeric_features),
        ("cat", Pipeline(categorical_steps), categorical_features),
    ]
    return ColumnTransformer(column_branches)

# Determine numeric and categorical feature columns from the full feature frame X
numeric_features = X.select_dtypes(include=["number"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

# Transformer applied to the numeric columns
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),      # replace missing values with the median
    ("scaler", StandardScaler())                        # standardisation
])

# 2) Categorical encoding strategy per model.
# category_encoders documents 'error', 'return_nan' and 'value' (plus an int for
# CountEncoder) as handle_unknown options; the previously used "ignore" is not
# one of them. "value" is used instead so that categories unseen during fit get
# a defined numeric fallback (for TargetEncoder: the target mean) rather than
# an undefined result reaching estimators that cannot handle NaN.
encoding_strategies = {
    "LogisticRegression":TargetEncoder(handle_unknown="value"),
    "RandomForest":      OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    "XGBoost":           OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    "KNN":               CountEncoder(handle_unknown="value"),
    "SVM":               TargetEncoder(handle_unknown="value")
}

# Candidate classifiers; the names double as keys into encoding_strategies
models = [
    ("LogisticRegression", LogisticRegression(random_state=42)),
    ("RandomForest",       RandomForestClassifier(max_features= 3,random_state=42)),
    ("XGBoost",            XGBClassifier(random_state=42)),
    ("KNN",                KNeighborsClassifier()),
    ("SVM",                SVC(probability=True, random_state=42))
]

# Scorers used for cross-validation.
# NOTE(review): "F1" is macro-averaged here, so it is not directly comparable
# to a binary (positive-class) F1 score -- confirm which one is wanted.
scoring = {
    "Accuracy":       "accuracy",
    "ROC_AUC":        "roc_auc_ovo_weighted",
    "F1":             "f1_macro"
}

for name, clf in models:
    # 1) Assemble preprocessing + estimator into one pipeline and train it
    preprocessor = make_preprocessor(name, numeric_features, categorical_features)
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", clf)])
    pipe.fit(X_train, y_train)

    # 2) Hold-out predictions: hard labels and positive-class probabilities
    y_pred = pipe.predict(X_test)
    y_proba = pipe.predict_proba(X_test)[:, 1]

    # 3) Hold-out metrics (ROC AUC is computed from the positive-class probability)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_proba)

    # 4) Report
    print(f"\n=== {name} ===")
    print(f"Accuracy   : {acc:.3f}")
    print(f"F1-Score   : {f1:.3f}")
    print(f"ROC AUC    : {roc:.3f}")

    # 5) Persist the fitted pipeline, one file per model
    filename = f"model_{name}.pkl"
    joblib.dump(pipe, filename)


=== LogisticRegression ===
Accuracy   : 0.872
F1-Score   : 0.461
ROC AUC    : 0.850

=== RandomForest ===
Accuracy   : 0.875
F1-Score   : 0.361
ROC AUC    : 0.847

=== XGBoost ===
Accuracy   : 0.884
F1-Score   : 0.548
ROC AUC    : 0.867

=== KNN ===
Accuracy   : 0.850
F1-Score   : 0.326
ROC AUC    : 0.718

=== SVM ===
Accuracy   : 0.878
F1-Score   : 0.433
ROC AUC    : 0.866


CV für Hyperparameter und robuste unbiased Schätzung

In [None]:
    # Grouped cross-validation of every candidate model on the training split.
    # The cell builds its own pipelines instead of reusing `pipe`, which would
    # only hold whichever pipeline an earlier cell assigned last.
    from sklearn.model_selection import StratifiedGroupKFold

    # Group label per training row: rows sharing a pair_index must stay in the
    # same fold, mirroring the grouped hold-out split. A plain integer `cv`
    # would not take the groups into account.
    cv_groups = df_extended['pair_index'].iloc[train_idx]
    group_cv = StratifiedGroupKFold(n_splits=5)

    for name, clf in models:
        # Separate name so the fitted `pipe` from the training loop is not overwritten
        cv_pipe = Pipeline([
            ("preprocessor", make_preprocessor(name, numeric_features, categorical_features)),
            ("classifier",   clf)
        ])
        cv_results = cross_validate(
            cv_pipe,
            X_train, y_train,
            groups=cv_groups,
            cv=group_cv,
            scoring=scoring,
            return_train_score=False
        )

        # Per-metric mean and standard deviation across the folds
        print(f"\n=== {name} ===")
        mean_scores = {}
        for metric in scoring:
            m = cv_results[f"test_{metric}"]
            mean_scores[metric] = m.mean()
            print(f"  {metric:15s}: {m.mean():.3f} ± {m.std():.3f}")

        # Unweighted average over the metric means
        overall_avg = np.mean(list(mean_scores.values()))
        print(f"  {'Overall average':15s}: {overall_avg:.3f}")

## RF Model/Feature Importance

Sparse Group Lasso