## Fitting Test XGB RF LM

In [None]:
import pandas as pd
import numpy as np

# Set the four pandas display limits (rows, columns, line width, cell width)
# to None so that frames shown in this notebook are not truncated.
for display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(display_option, None)


- `pf_o_*`: preferences of the partner
- `attr1_1`: preferences of the subject

- `attr_o`: rating received from the partner
- `attr`: rating given to the partner

- `attr2_1`: perceived preferences of the partner
- `attr3_1`: perceived (self-)ratings



In [40]:
from category_encoders import TargetEncoder 
from sklearn.preprocessing import StandardScaler
# Load the preprocessed data set; df_extended stays untouched, all
# transformations below are applied to the copy df_pre.
df_extended = pd.read_csv('data/df_preprocessed.csv')
df_pre = df_extended.copy()

# Target-encode every object-typed column against the `match` label.
# NOTE(review): the encoder is fitted on all rows, i.e. before any
# train/test split, so label information of later test rows is used here.
categorical_cols = list(df_pre.select_dtypes(include=['object']).columns)
target_encoder = TargetEncoder(
    cols=categorical_cols,
    handle_unknown='value',
    handle_missing='value',
)
encoded_categoricals = target_encoder.fit_transform(df_pre[categorical_cols], df_pre['match'])
df_pre[categorical_cols] = encoded_categoricals

# Fill missing values of all numeric columns (this now includes the freshly
# encoded ones) with the respective column median.
numerical_cols = list(df_pre.select_dtypes(include=[np.number]).columns)
column_medians = df_pre[numerical_cols].median()
df_pre[numerical_cols] = df_pre[numerical_cols].fillna(column_medians)

# Standardise the whole frame. `match` is part of df_pre, so it is scaled too.
scaler = StandardScaler()
X_scaled_arr = scaler.fit_transform(df_pre)
X_scaled = pd.DataFrame(X_scaled_arr, columns=df_pre.columns, index=df_pre.index)

# Separate features and target (no train/test split happens in this cell).
# y_pca is the standardised `match` column, not the raw label.
X_pca = X_scaled.drop(columns=["match"])
y_pca = X_scaled["match"]


In [None]:
# Thematic feature groups; the position of a group in this dict (starting at 1)
# becomes its numeric group id.
groups_map = {
    "gerneral": [
        "gender", "condtn", "wave", "round", "position", "order", "samerace",
        "age_o", "race_o", "age", "field", "field_cd", "race", "from", "goal",
        "date", "go_out", "career", "career_c", "met", "met_o", "exphappy",
    ],
    "interests": [
        "int_corr", "sports", "tvsports", "exercise", "dining", "museums",
        "art", "hiking", "gaming", "clubbing", "reading", "tv", "theater",
        "movies", "concerts", "music", "shopping", "yoga",
    ],
    "preferences": [
        "pf_o_att", "pf_o_sin", "pf_o_int", "pf_o_fun", "pf_o_amb", "pf_o_sha",
        "imprace", "imprelig", "attr1_1", "sinc1_1", "intel1_1", "fun1_1",
        "amb1_1", "shar1_1",
    ],
    "ratings": [
        "attr_o", "sinc_o", "intel_o", "fun_o", "amb_o", "shar_o", "like_o",
        "prob_o", "attr", "sinc", "intel", "fun", "like", "prob",
    ],
    "selfassesment": [
        "attr2_1", "sinc2_1", "intel2_1", "fun2_1", "amb2_1", "shar2_1",
        "attr3_1", "sinc3_1", "intel3_1", "fun3_1", "amb3_1", "shar3_1",
    ],
}

# Reverse lookup: column name -> id of the first group that lists it.
group_of_column = {}
for group_number, members in enumerate(groups_map.values(), start=1):
    for member in members:
        group_of_column.setdefault(member, group_number)

# One id per feature column, in column order; columns listed in no group get -1.
# NOTE(review): the group_lasso docs describe negative ids as "not
# regularised" - confirm that this is what is wanted for unmapped columns.
group_ids = np.array([group_of_column.get(col, -1) for col in X_pca.columns])
print("Gruppen IDs:", group_ids)


Gruppen IDs: [1 1 1 1 1 1 2 1 1 1 3 3 3 3 3 3 4 4 4 4 4 4 1 1 1 1 1 3 3 1 1 1 1 1 1 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 3 3 3 3 3 3 5 5 5 5 5 5 5 5 5 5 5 4 4 4 4
 4 4 1]


In [42]:
from group_lasso import GroupLasso
from sklearn.model_selection import train_test_split
# Hold out the test rows *before* the supervised feature selection so that the
# selection never sees the labels it is evaluated on afterwards. test_size and
# random_state are unchanged, so the rows end up in the same partitions as when
# splitting the already-reduced frame.
y = df_extended["match"]
X_pca_train, X_pca_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.2, random_state=42
)

gl = GroupLasso(
    groups=group_ids,
    group_reg=0.1,    # λ₂: group-level penalty
    l1_reg=0.001,     # λ₁: element-wise L1 penalty
    n_iter=5000,
    tol=1e-03,
    # NOTE(review): per the group_lasso docs this flag (the spelling is the
    # library's own) only mutes the notice about the changed regularisation
    # scaling; re-enabling the old eigenvalue-based scaling would be
    # `old_regularisation=True` - confirm which of the two is intended.
    supress_warning=True,
)
# Fit on the training rows only. y_pca is the scaled `match` column from the
# preprocessing cell; it is aligned to the training rows via their index labels.
gl.fit(X_pca_train, y_pca.loc[X_pca_train.index])
β = gl.coef_

# Boolean mask of the features the fitted model kept, and their column positions.
mask = gl.sparsity_mask_
selected_idx = np.where(mask)[0]
print("Indizes ausgewählter Features:", selected_idx)

# Reduce the feature matrix (all rows) and both partitions to the selected columns.
X = X_pca.iloc[:, selected_idx]
X_train = X_pca_train.iloc[:, selected_idx]
X_test = X_pca_test.iloc[:, selected_idx]

Indizes ausgewählter Features: [16 17 18 19 20 21 70 71 72 73 74 75]


In [43]:
# Peek at the first rows of the training features (selected columns only).
X_train.head()

Unnamed: 0,attr_o,sinc_o,intel_o,fun_o,like_o,prob_o,attr,sinc,intel,fun,like,prob
5677,0.42294,1.654736,-0.889989,-0.222596,-0.623334,0.382829,-2.172415,-2.436115,-0.889343,-1.265928,-1.173853,-0.574056
664,0.942262,-0.098938,0.42286,0.821586,1.030825,-0.095914,-0.096238,0.485326,1.078848,-0.744033,-0.622812,0.861241
4366,-1.135028,-0.098938,-0.233564,0.299495,0.479439,0.861573,-0.615282,-1.267538,-2.201469,-1.787822,-2.275935,-2.009352
5688,-0.615705,0.48562,1.079285,0.821586,-0.071947,-2.010888,-0.096238,-0.098962,-0.233279,0.299756,-0.071771,-0.095624
208,-1.65435,0.48562,-1.546414,-0.744687,-1.17472,-1.532145,0.422806,-0.098962,-0.233279,-0.744033,-0.071771,-2.009352


In [44]:
import joblib
from category_encoders import CountEncoder, TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, confusion_matrix, precision_score, recall_score,
    f1_score, roc_auc_score, classification_report
)
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier



# Factory for the model-specific preprocessing step
def make_preprocessor(model_name, numeric_features, categorical_features):
    """Build the ColumnTransformer used in front of the classifier `model_name`.

    Numeric columns go through the module-level `numeric_transformer`;
    categorical columns are imputed with their most frequent value and then
    encoded with the encoder registered for `model_name` in the module-level
    `encoding_strategies` dict. Both globals must exist when this is called.

    Args:
        model_name: Key into `encoding_strategies`.
        numeric_features: Column names routed to the numeric branch.
        categorical_features: Column names routed to the categorical branch.

    Returns:
        An unfitted ColumnTransformer with the branches "num" and "cat".

    Raises:
        KeyError: If `model_name` has no entry in `encoding_strategies`.
    """
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", encoding_strategies[model_name]),
    ]
    branches = [
        ("num", numeric_transformer, numeric_features),
        ("cat", Pipeline(categorical_steps), categorical_features),
    ]
    return ColumnTransformer(branches)

# Split the selected feature columns of X by dtype.
numeric_features = X.select_dtypes(include=["number"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

# Numeric branch: median imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical encoder per model name (looked up by make_preprocessor).
# category_encoders documents 'error', 'return_nan' and 'value' for
# handle_unknown; "ignore" is not one of them and would let unseen categories
# reach the classifier as NaN, so 'value' is used here.
encoding_strategies = {
    "LogisticRegression": TargetEncoder(handle_unknown="value"),
    "RandomForest":       OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    "XGBoost":            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    "KNN":                CountEncoder(handle_unknown="value"),
    "SVM":                TargetEncoder(handle_unknown="value"),
}

# Candidate classifiers as (name, estimator) pairs; the name doubles as the key
# into encoding_strategies.
models = [
    ("LogisticRegression", LogisticRegression(random_state=42)),
    ("RandomForest",       RandomForestClassifier(max_features=3, random_state=42)),
    ("XGBoost",            XGBClassifier(random_state=42)),
    ("KNN",                KNeighborsClassifier()),
    # probability=True is required for predict_proba in the evaluation loop.
    ("SVM",                SVC(probability=True, random_state=42)),
]

# Scorers for cross-validation, keyed by the label used when printing.
# NOTE(review): the hold-out loop reports the binary f1_score while "f1_macro"
# is used here, so the two F1 figures are not directly comparable - confirm.
scoring = {
    "Accuracy": "accuracy",
    "ROC_AUC":  "roc_auc_ovo_weighted",
    "F1":       "f1_macro",
}

# Fit every candidate model on the training split, report hold-out metrics and
# persist the fitted pipeline.
for name, clf in models:
    # 1) Preprocessing + classifier in one pipeline, fitted on the training rows
    pipe = Pipeline(steps=[
        ("preprocessor", make_preprocessor(name, numeric_features, categorical_features)),
        ("classifier", clf),
    ])
    pipe.fit(X_train, y_train)

    # 2) Hard predictions and the probability of the positive class
    y_pred = pipe.predict(X_test)
    y_proba = pipe.predict_proba(X_test)[:, 1]

    # 3) Hold-out metrics (ROC AUC is computed from the positive-class probability)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_proba)

    # 4) Report
    print(
        f"\n=== {name} ===",
        f"Accuracy   : {acc:.3f}",
        f"F1-Score   : {f1:.3f}",
        f"ROC AUC    : {roc:.3f}",
        sep="\n",
    )

    # 5) Save the fitted pipeline to the working directory
    joblib.dump(pipe, f"model_{name}.pkl")


=== LogisticRegression ===
Accuracy   : 0.847
F1-Score   : 0.388
ROC AUC    : 0.846

=== RandomForest ===
Accuracy   : 0.856
F1-Score   : 0.447
ROC AUC    : 0.836

=== XGBoost ===
Accuracy   : 0.844
F1-Score   : 0.436
ROC AUC    : 0.831

=== KNN ===
Accuracy   : 0.839
F1-Score   : 0.431
ROC AUC    : 0.767

=== SVM ===
Accuracy   : 0.853
F1-Score   : 0.375
ROC AUC    : 0.773


Kreuzvalidierung (CV) für die Hyperparameter-Wahl und eine robuste, unverzerrte Schätzung der Modellgüte

In [None]:
    # Cross-validate every candidate model on the training split. A fresh
    # pipeline is built per model instead of reusing whichever `pipe` the
    # hold-out loop happened to leave behind (that would only cover the last
    # model in `models`).
    for model_name, estimator in models:
        cv_pipe = Pipeline([
            ("preprocessor", make_preprocessor(model_name, numeric_features, categorical_features)),
            ("classifier", estimator),
        ])
        cv_results = cross_validate(
            cv_pipe,
            X_train, y_train,
            cv=5,
            scoring=scoring,
            return_train_score=False
        )

        print(f"\n=== {model_name} ===")

        # Mean and standard deviation over the 5 folds, per metric
        mean_scores = {}
        for metric in scoring:
            m = cv_results[f"test_{metric}"]
            mean_scores[metric] = m.mean()
            print(f"  {metric:15s}: {m.mean():.3f} ± {m.std():.3f}")

        # Unweighted mean of the per-metric means (mixes different metrics,
        # so it is only a rough single-number summary)
        overall_avg = np.mean(list(mean_scores.values()))
        print(f"  {'Overall average':15s}: {overall_avg:.3f}")

## RF Model/Feature Importance

Sparse Group Lasso