# Hypothese 4

## Hypothese 4.1 -- pre post Oversampling

### 5 Fold

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt
from itertools import product
import warnings
import time
warnings.filterwarnings('ignore')

# Load the combined dataset (the path suggests it was already preprocessed
# upstream -- NOTE(review): confirm what that preprocessing covers).
df = pd.read_csv("Data/preprocessed/combined_preprocessed.csv")

# Labels von 5 → 3 Klassen mappen
def map_labels(x):
    """Collapse a 5-level AdoptionSpeed value into one of 3 classes.

    Returns 0 for an input equal to 0 (adopted very quickly), 2 for an input
    equal to 4 (not adopted at all) and 1 for every other input (the
    intermediate speeds 1, 2 and 3).
    """
    if x == 0:
        return 0
    return 2 if x == 4 else 1

# Derive the 3-class target from the original AdoptionSpeed column.
df['target'] = df['AdoptionSpeed'].map(map_labels)

# Separate features and labels (both label columns are dropped from X)
X = df.drop(columns=['AdoptionSpeed', 'target'])
y = df['target'].astype(int)

# Stratified split: 70% train / 30% temporary hold-out
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Stratified split: hold-out is halved into validation and test
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print("Train classes:", np.unique(y_train))
print("Valid classes:", np.unique(y_valid))
print("Test classes:", np.unique(y_test))

# Convert object-typed (string) columns to the pandas category dtype in each split
for df_ in [X_train, X_valid, X_test]:
    for col in df_.select_dtypes(include=["object"]).columns:
        df_[col] = df_[col].astype("category")

# Oversample the training split only; validation and test keep their distribution
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

print(f"Original Training Set: {X_train.shape}")
print(f"Oversampled Training Set: {X_train_res.shape}")
print(f"Class distribution after oversampling: {np.bincount(y_train_res)}")

# Ordinal-encode the categorical columns; the encoder is fitted on the
# (oversampled) training data and categories unseen there are encoded as -1.
cat_cols = X_train_res.select_dtypes(include=["category"]).columns
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Skip encoding entirely when the data has no categorical columns
if len(cat_cols) > 0:
    X_train_res[cat_cols] = oe.fit_transform(X_train_res[cat_cols])
    X_valid[cat_cols] = oe.transform(X_valid[cat_cols])
    X_test[cat_cols] = oe.transform(X_test[cat_cols])

print(f"Final Training Set: {X_train_res.shape}, Validation Set: {X_valid.shape}, Test Set: {X_test.shape}")

# HYPOTHESIS 4: full grid over max_depth x learning_rate x subsample
param_grid = {
    'max_depth': [3, 5, 7, 9, 11, 15],
    'learning_rate': [0.1, 0.07, 0.05, 0.03, 0.01], 
    'subsample': [1.0, 0.9, 0.8, 0.7, 0.6, 0.5]
}

# Number of grid points, used for the progress output of the search loop
total_combinations = len(param_grid['max_depth']) * len(param_grid['learning_rate']) * len(param_grid['subsample'])
print(f"Teste {total_combinations} Parameterkombinationen")

def evaluate_params_native(max_depth, learning_rate, subsample, X_train, y_train):
    """Score one hyper-parameter combination with manual 5-fold CV.

    Uses the native XGBoost training API (``xgb.train`` on ``DMatrix``
    objects) instead of the scikit-learn wrapper.

    Args:
        max_depth: Value passed as the booster's ``max_depth``.
        learning_rate: Value passed as the booster's ``learning_rate``.
        subsample: Value passed as the booster's ``subsample``.
        X_train: Feature table; must support ``.iloc`` row selection.
        y_train: Labels aligned with ``X_train``; must support ``.iloc``.

    Returns:
        dict with the three parameter values plus ``f1_mean`` and ``f1_std``,
        the mean and standard deviation of the weighted F1 over the folds.
    """
    # The booster configuration is the same for every fold, so build it once.
    booster_params = {
        'objective': 'multi:softmax',
        'num_class': 3,
        'eval_metric': 'mlogloss',
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'subsample': subsample,
        'random_state': 42,
        'verbosity': 0
    }
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    fold_scores = []
    for fit_rows, holdout_rows in splitter.split(X_train, y_train):
        holdout_labels = y_train.iloc[holdout_rows]
        fit_matrix = xgb.DMatrix(X_train.iloc[fit_rows], label=y_train.iloc[fit_rows])
        holdout_matrix = xgb.DMatrix(X_train.iloc[holdout_rows], label=holdout_labels)

        # Fixed budget of 100 boosting rounds per fold
        booster = xgb.train(booster_params, fit_matrix, num_boost_round=100, verbose_eval=False)

        fold_scores.append(
            f1_score(holdout_labels, booster.predict(holdout_matrix), average='weighted')
        )

    return {
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'subsample': subsample,
        'f1_mean': np.mean(fold_scores),
        'f1_std': np.std(fold_scores)
    }

# Pool the oversampled training rows with the validation split; the grid
# search runs its own stratified K-fold on this pooled set.
# NOTE(review): X_train_res was oversampled before the folds are drawn, so
# copies of the same row can presumably land on both sides of a fold and make
# the CV F1 optimistic -- confirm this is the intended setup for this hypothesis.
X_train_val = pd.concat([X_train_res, X_valid])
y_train_val = pd.concat([y_train_res, y_valid])

print("Starte Native XGBoost Grid Search...")
start_time = time.time()

# Evaluate every grid point; enumerate starts at 1 for the progress counter
results = []
for i, (max_depth, learning_rate, subsample) in enumerate(
    product(param_grid['max_depth'], 
            param_grid['learning_rate'], 
            param_grid['subsample']), 1):
    
    # Progress prefix stays on the same line; the score is appended below
    print(f"[{i:3d}/{total_combinations}] max_depth={max_depth:2d}, lr={learning_rate:.3f}, subsample={subsample:.1f}", end=" ")
    
    result = evaluate_params_native(max_depth, learning_rate, subsample, X_train_val, y_train_val)
    results.append(result)
    
    print(f"→ F1: {result['f1_mean']:.4f}±{result['f1_std']:.3f}")

total_time = time.time() - start_time

# Collect the per-combination results into a table
results_df = pd.DataFrame(results)

print(f"\nNative XGBoost Grid Search abgeschlossen in {total_time/60:.1f} Minuten")

# Pick the configuration with the highest mean CV F1.
# The selected row is a float-typed Series, so max_depth is cast back to int
# for display (the same cast is used for the final model parameters below).
best_result = results_df.loc[results_df['f1_mean'].idxmax()]
print(f"Best parameters: {{'max_depth': {int(best_result['max_depth'])}, 'learning_rate': {best_result['learning_rate']:.3f}, 'subsample': {best_result['subsample']:.1f}}}")
print(f"Best CV score: {best_result['f1_mean']:.4f}")

# Refit the best configuration on train+valid and evaluate on the held-out test set
dtrain_final = xgb.DMatrix(X_train_val, label=y_train_val)
dtest = xgb.DMatrix(X_test, label=y_test)

best_params = {
    'objective': 'multi:softmax',
    'num_class': 3,
    'eval_metric': 'mlogloss',
    'max_depth': int(best_result['max_depth']),
    'learning_rate': best_result['learning_rate'],
    'subsample': best_result['subsample'],
    'random_state': 42,
    'verbosity': 0
}

# Same number of boosting rounds as used inside the cross-validation
best_model = xgb.train(best_params, dtrain_final, num_boost_round=100, verbose_eval=False)
y_test_pred = best_model.predict(dtest)

# Test metrics
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")

print("Classification Report:")
print(classification_report(y_test, y_test_pred))

# Confusion matrix for the 3 mapped classes
labels_all = [0, 1, 2]
cm = confusion_matrix(y_test, y_test_pred, labels=labels_all)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Sehr schnell (0)", "Mittel (1-3)", "Gar nicht (4)"])
disp.plot(cmap="Blues")
plt.title("Confusion Matrix - Best Model (3 Klassen)")
plt.show()

# Top 10 configurations by mean CV F1
print(f"\nTop 10 Konfigurationen:")
print("="*90)
top_10 = results_df.nlargest(10, 'f1_mean')

for idx, row in top_10.iterrows():
    print(f"F1: {row['f1_mean']:.4f}±{row['f1_std']:.3f} | "
          f"max_depth={int(row['max_depth']):2d}, lr={row['learning_rate']:.3f}, subsample={row['subsample']:.1f}")

# Class distribution of the test set
print(f"\nKlassen-Verteilung im Test Set:")
unique, counts = np.unique(y_test, return_counts=True)
for cls, count in zip(unique, counts):
    print(f"Klasse {cls}: {count} Samples")

Train classes: [0 1 2]
Valid classes: [0 1 2]
Test classes: [0 1 2]
Original Training Set: (13275, 22)
Oversampled Training Set: (21810, 22)
Class distribution after oversampling: [7270 7270 7270]
Final Training Set: (21810, 22), Validation Set: (2845, 22), Test Set: (2845, 22)
Teste 180 Parameterkombinationen
Starte Native XGBoost Grid Search...
[  1/180] max_depth= 3, lr=0.100, subsample=1.0 → F1: 0.5517±0.012
[  2/180] max_depth= 3, lr=0.100, subsample=0.9 → F1: 0.5539±0.010
[  3/180] max_depth= 3, lr=0.100, subsample=0.8 → F1: 0.5555±0.010
[  4/180] max_depth= 3, lr=0.100, subsample=0.7 → F1: 0.5573±0.010
[  5/180] max_depth= 3, lr=0.100, subsample=0.6 → F1: 0.5556±0.010
[  6/180] max_depth= 3, lr=0.100, subsample=0.5 → F1: 0.5577±0.012
[  7/180] max_depth= 3, lr=0.070, subsample=1.0 → F1: 0.5396±0.013
[  8/180] max_depth= 3, lr=0.070, subsample=0.9 → F1: 0.5421±0.012
[  9/180] max_depth= 3, lr=0.070, subsample=0.8 → F1: 0.5404±0.012
[ 10/180] max_depth= 3, lr=0.070, subsample=0.7 

## Hypothese 4.2 -- post Oversampling

### 3 Fold

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from itertools import product
import warnings
import time
warnings.filterwarnings('ignore')

# Load the splits from disk (already preprocessed by an earlier step)
print("Lade vorverarbeitete Daten...")

# Each label file is read as a table; its first column is taken as the label Series
X_train_res = pd.read_csv("Data/preprocessed/X_train_res.csv")
y_train_res = pd.read_csv("Data/preprocessed/y_train_res.csv").iloc[:, 0]

X_valid = pd.read_csv("Data/preprocessed/X_valid_enc.csv")
y_valid = pd.read_csv("Data/preprocessed/y_valid.csv").iloc[:, 0]

X_test = pd.read_csv("Data/preprocessed/X_test_enc.csv")
y_test = pd.read_csv("Data/preprocessed/y_test.csv").iloc[:, 0]

print(f"Training Set (oversampled): {X_train_res.shape}")
print(f"Validation Set: {X_valid.shape}")
print(f"Test Set: {X_test.shape}")

# Sanity check: which classes occur in each split, and the training class counts
print("Train classes:", np.unique(y_train_res))
print("Valid classes:", np.unique(y_valid))
print("Test classes:", np.unique(y_test))
print(f"Class distribution in training: {np.bincount(y_train_res)}")

# HYPOTHESIS 4: full grid over max_depth x learning_rate x subsample
param_grid = {
    'max_depth': [3, 5, 7, 9, 11, 15],
    'learning_rate': [0.1, 0.07, 0.05, 0.03, 0.01], 
    'subsample': [1.0, 0.9, 0.8, 0.7, 0.6, 0.5]
}

# Number of grid points, used for the progress output of the search loop
total_combinations = len(param_grid['max_depth']) * len(param_grid['learning_rate']) * len(param_grid['subsample'])
print(f"Teste {total_combinations} Parameterkombinationen")

def evaluate_params_native(max_depth, learning_rate, subsample, X_train, y_train):
    """Score one hyper-parameter combination with manual 3-fold CV.

    Uses the native XGBoost training API (``xgb.train`` on ``DMatrix``
    objects) instead of the scikit-learn wrapper.

    Args:
        max_depth: Value passed as the booster's ``max_depth``.
        learning_rate: Value passed as the booster's ``learning_rate``.
        subsample: Value passed as the booster's ``subsample``.
        X_train: Feature table; must support ``.iloc`` row selection.
        y_train: Labels aligned with ``X_train``; must support ``.iloc``.

    Returns:
        dict with the three parameter values plus ``f1_mean`` and ``f1_std``,
        the mean and standard deviation of the weighted F1 over the folds.
    """
    # The booster configuration is the same for every fold, so build it once.
    booster_params = {
        'objective': 'multi:softmax',
        'num_class': 3,
        'eval_metric': 'mlogloss',
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'subsample': subsample,
        'random_state': 42,
        'verbosity': 0
    }
    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    fold_scores = []
    for fit_rows, holdout_rows in splitter.split(X_train, y_train):
        holdout_labels = y_train.iloc[holdout_rows]
        fit_matrix = xgb.DMatrix(X_train.iloc[fit_rows], label=y_train.iloc[fit_rows])
        holdout_matrix = xgb.DMatrix(X_train.iloc[holdout_rows], label=holdout_labels)

        # Fixed budget of 100 boosting rounds per fold
        booster = xgb.train(booster_params, fit_matrix, num_boost_round=100, verbose_eval=False)

        fold_scores.append(
            f1_score(holdout_labels, booster.predict(holdout_matrix), average='weighted')
        )

    return {
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'subsample': subsample,
        'f1_mean': np.mean(fold_scores),
        'f1_std': np.std(fold_scores)
    }

# Pool the training rows with the validation split; the grid search runs its
# own stratified K-fold on this pooled set.
# NOTE(review): the training file is reported as oversampled above; if rows were
# duplicated before these folds are drawn, the CV F1 is presumably optimistic --
# confirm this is the intended setup for this hypothesis.
X_train_val = pd.concat([X_train_res, X_valid])
y_train_val = pd.concat([y_train_res, y_valid])

print("Starte Native XGBoost Grid Search...")
start_time = time.time()

# Evaluate every grid point; enumerate starts at 1 for the progress counter
results = []
for i, (max_depth, learning_rate, subsample) in enumerate(
    product(param_grid['max_depth'], 
            param_grid['learning_rate'], 
            param_grid['subsample']), 1):
    
    # Progress prefix stays on the same line; the score is appended below
    print(f"[{i:3d}/{total_combinations}] max_depth={max_depth:2d}, lr={learning_rate:.3f}, subsample={subsample:.1f}", end=" ")
    
    result = evaluate_params_native(max_depth, learning_rate, subsample, X_train_val, y_train_val)
    results.append(result)
    
    print(f"→ F1: {result['f1_mean']:.4f}±{result['f1_std']:.3f}")

total_time = time.time() - start_time

# Collect the per-combination results into a table
results_df = pd.DataFrame(results)

print(f"\nNative XGBoost Grid Search abgeschlossen in {total_time/60:.1f} Minuten")

# Pick the configuration with the highest mean CV F1.
# The selected row is a float-typed Series, so max_depth is cast back to int
# for display (the same cast is used for the final model parameters below).
best_result = results_df.loc[results_df['f1_mean'].idxmax()]
print(f"Best parameters: {{'max_depth': {int(best_result['max_depth'])}, 'learning_rate': {best_result['learning_rate']:.3f}, 'subsample': {best_result['subsample']:.1f}}}")
print(f"Best CV score: {best_result['f1_mean']:.4f}")

# Refit the best configuration on train+valid and evaluate on the held-out test set
dtrain_final = xgb.DMatrix(X_train_val, label=y_train_val)
dtest = xgb.DMatrix(X_test, label=y_test)

best_params = {
    'objective': 'multi:softmax',
    'num_class': 3,
    'eval_metric': 'mlogloss',
    'max_depth': int(best_result['max_depth']),
    'learning_rate': best_result['learning_rate'],
    'subsample': best_result['subsample'],
    'random_state': 42,
    'verbosity': 0
}

# Same number of boosting rounds as used inside the cross-validation
best_model = xgb.train(best_params, dtrain_final, num_boost_round=100, verbose_eval=False)
y_test_pred = best_model.predict(dtest)

# Test metrics
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")

print("Classification Report:")
print(classification_report(y_test, y_test_pred))

# Confusion matrix for the 3 classes
labels_all = [0, 1, 2]
cm = confusion_matrix(y_test, y_test_pred, labels=labels_all)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Sehr schnell (0)", "Mittel (1-3)", "Gar nicht (4)"])
disp.plot(cmap="Blues")
plt.title("Confusion Matrix - Best Model (3 Klassen)")
plt.show()

# Top 10 configurations by mean CV F1
print(f"\nTop 10 Konfigurationen:")
print("="*90)
top_10 = results_df.nlargest(10, 'f1_mean')

for idx, row in top_10.iterrows():
    print(f"F1: {row['f1_mean']:.4f}±{row['f1_std']:.3f} | "
          f"max_depth={int(row['max_depth']):2d}, lr={row['learning_rate']:.3f}, subsample={row['subsample']:.1f}")

# Class distribution of the test set; the name table is built once, outside the loop
print(f"\nKlassen-Verteilung im Test Set:")
unique, counts = np.unique(y_test, return_counts=True)
class_names = ["Sehr schnell (0)", "Mittel (1-3)", "Gar nicht (4)"]
for cls, count in zip(unique, counts):
    print(f"Klasse {cls} ({class_names[cls]}): {count} Samples")

### 5 Fold

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from itertools import product
import warnings
import time
warnings.filterwarnings('ignore')

# Load the splits from disk (already preprocessed by an earlier step)
print("Lade vorverarbeitete Daten...")

# Each label file is read as a table; its first column is taken as the label Series
X_train_res = pd.read_csv("Data/preprocessed/X_train_res.csv")
y_train_res = pd.read_csv("Data/preprocessed/y_train_res.csv").iloc[:, 0]

X_valid = pd.read_csv("Data/preprocessed/X_valid_enc.csv")
y_valid = pd.read_csv("Data/preprocessed/y_valid.csv").iloc[:, 0]

X_test = pd.read_csv("Data/preprocessed/X_test_enc.csv")
y_test = pd.read_csv("Data/preprocessed/y_test.csv").iloc[:, 0]

print(f"Training Set (oversampled): {X_train_res.shape}")
print(f"Validation Set: {X_valid.shape}")
print(f"Test Set: {X_test.shape}")

# Sanity check: which classes occur in each split, and the training class counts
print("Train classes:", np.unique(y_train_res))
print("Valid classes:", np.unique(y_valid))
print("Test classes:", np.unique(y_test))
print(f"Class distribution in training: {np.bincount(y_train_res)}")

# HYPOTHESIS 4: full grid over max_depth x learning_rate x subsample
param_grid = {
    'max_depth': [3, 5, 7, 9, 11, 15],
    'learning_rate': [0.1, 0.07, 0.05, 0.03, 0.01], 
    'subsample': [1.0, 0.9, 0.8, 0.7, 0.6, 0.5]
}

# Number of grid points, used for the progress output of the search loop
total_combinations = len(param_grid['max_depth']) * len(param_grid['learning_rate']) * len(param_grid['subsample'])
print(f"Teste {total_combinations} Parameterkombinationen")

def evaluate_params_native(max_depth, learning_rate, subsample, X_train, y_train):
    """Score one hyper-parameter combination with manual 5-fold CV.

    Uses the native XGBoost training API (``xgb.train`` on ``DMatrix``
    objects) instead of the scikit-learn wrapper.

    Args:
        max_depth: Value passed as the booster's ``max_depth``.
        learning_rate: Value passed as the booster's ``learning_rate``.
        subsample: Value passed as the booster's ``subsample``.
        X_train: Feature table; must support ``.iloc`` row selection.
        y_train: Labels aligned with ``X_train``; must support ``.iloc``.

    Returns:
        dict with the three parameter values plus ``f1_mean`` and ``f1_std``,
        the mean and standard deviation of the weighted F1 over the folds.
    """
    # The booster configuration is the same for every fold, so build it once.
    booster_params = {
        'objective': 'multi:softmax',
        'num_class': 3,
        'eval_metric': 'mlogloss',
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'subsample': subsample,
        'random_state': 42,
        'verbosity': 0
    }
    # Five stratified, shuffled folds
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    fold_scores = []
    for fit_rows, holdout_rows in splitter.split(X_train, y_train):
        holdout_labels = y_train.iloc[holdout_rows]
        fit_matrix = xgb.DMatrix(X_train.iloc[fit_rows], label=y_train.iloc[fit_rows])
        holdout_matrix = xgb.DMatrix(X_train.iloc[holdout_rows], label=holdout_labels)

        # Fixed budget of 100 boosting rounds per fold
        booster = xgb.train(booster_params, fit_matrix, num_boost_round=100, verbose_eval=False)

        fold_scores.append(
            f1_score(holdout_labels, booster.predict(holdout_matrix), average='weighted')
        )

    return {
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'subsample': subsample,
        'f1_mean': np.mean(fold_scores),
        'f1_std': np.std(fold_scores)
    }

# Pool the training rows with the validation split; the grid search runs its
# own stratified K-fold on this pooled set.
# NOTE(review): the training file is reported as oversampled above; if rows were
# duplicated before these folds are drawn, the CV F1 is presumably optimistic --
# confirm this is the intended setup for this hypothesis.
X_train_val = pd.concat([X_train_res, X_valid])
y_train_val = pd.concat([y_train_res, y_valid])

print("Starte Native XGBoost Grid Search...")
start_time = time.time()

# Evaluate every grid point; enumerate starts at 1 for the progress counter
results = []
for i, (max_depth, learning_rate, subsample) in enumerate(
    product(param_grid['max_depth'], 
            param_grid['learning_rate'], 
            param_grid['subsample']), 1):
    
    # Progress prefix stays on the same line; the score is appended below
    print(f"[{i:3d}/{total_combinations}] max_depth={max_depth:2d}, lr={learning_rate:.3f}, subsample={subsample:.1f}", end=" ")
    
    result = evaluate_params_native(max_depth, learning_rate, subsample, X_train_val, y_train_val)
    results.append(result)
    
    print(f"→ F1: {result['f1_mean']:.4f}±{result['f1_std']:.3f}")

total_time = time.time() - start_time

# Collect the per-combination results into a table
results_df = pd.DataFrame(results)

print(f"\nNative XGBoost Grid Search abgeschlossen in {total_time/60:.1f} Minuten")

# Pick the configuration with the highest mean CV F1.
# The selected row is a float-typed Series, so max_depth is cast back to int
# for display (the same cast is used for the final model parameters below).
best_result = results_df.loc[results_df['f1_mean'].idxmax()]
print(f"Best parameters: {{'max_depth': {int(best_result['max_depth'])}, 'learning_rate': {best_result['learning_rate']:.3f}, 'subsample': {best_result['subsample']:.1f}}}")
print(f"Best CV score: {best_result['f1_mean']:.4f}")

# Refit the best configuration on train+valid and evaluate on the held-out test set
dtrain_final = xgb.DMatrix(X_train_val, label=y_train_val)
dtest = xgb.DMatrix(X_test, label=y_test)

best_params = {
    'objective': 'multi:softmax',
    'num_class': 3,
    'eval_metric': 'mlogloss',
    'max_depth': int(best_result['max_depth']),
    'learning_rate': best_result['learning_rate'],
    'subsample': best_result['subsample'],
    'random_state': 42,
    'verbosity': 0
}

# Same number of boosting rounds as used inside the cross-validation
best_model = xgb.train(best_params, dtrain_final, num_boost_round=100, verbose_eval=False)
y_test_pred = best_model.predict(dtest)

# Test metrics
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")

print("Classification Report:")
print(classification_report(y_test, y_test_pred))

# Confusion matrix for the 3 classes
labels_all = [0, 1, 2]
cm = confusion_matrix(y_test, y_test_pred, labels=labels_all)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Sehr schnell (0)", "Mittel (1-3)", "Gar nicht (4)"])
disp.plot(cmap="Blues")
plt.title("Confusion Matrix - Best Model (3 Klassen)")
plt.show()

# Top 10 configurations by mean CV F1
print(f"\nTop 10 Konfigurationen:")
print("="*90)
top_10 = results_df.nlargest(10, 'f1_mean')

for idx, row in top_10.iterrows():
    print(f"F1: {row['f1_mean']:.4f}±{row['f1_std']:.3f} | "
          f"max_depth={int(row['max_depth']):2d}, lr={row['learning_rate']:.3f}, subsample={row['subsample']:.1f}")

# Class distribution of the test set; the name table is built once, outside the loop
print(f"\nKlassen-Verteilung im Test Set:")
unique, counts = np.unique(y_test, return_counts=True)
class_names = ["Sehr schnell (0)", "Mittel (1-3)", "Gar nicht (4)"]
for cls, count in zip(unique, counts):
    print(f"Klasse {cls} ({class_names[cls]}): {count} Samples")

### 10 Fold

# Hypothese 5

## Hypothese 5 pre Oversampling

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt
from itertools import product
import warnings
import time
warnings.filterwarnings('ignore')

# Load the combined dataset (the path suggests it was already preprocessed
# upstream -- NOTE(review): confirm what that preprocessing covers).
df = pd.read_csv("Data/preprocessed/combined_preprocessed.csv")

# Labels von 5 → 3 Klassen mappen
def map_labels(x):
    """Collapse a 5-level AdoptionSpeed value into one of 3 classes.

    Returns 0 for an input equal to 0 (adopted very quickly), 2 for an input
    equal to 4 (not adopted at all) and 1 for every other input (the
    intermediate speeds 1, 2 and 3).
    """
    if x == 0:
        return 0
    return 2 if x == 4 else 1

# Derive the 3-class target from the original AdoptionSpeed column.
df['target'] = df['AdoptionSpeed'].map(map_labels)

# Separate features and labels (both label columns are dropped from X)
X = df.drop(columns=['AdoptionSpeed', 'target'])
y = df['target'].astype(int)

# Stratified split: 70% train / 30% temporary hold-out
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Stratified split: hold-out is halved into validation and test
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print("Train classes:", np.unique(y_train))
print("Valid classes:", np.unique(y_valid))
print("Test classes:", np.unique(y_test))

# Convert object-typed (string) columns to the pandas category dtype in each split
for df_ in [X_train, X_valid, X_test]:
    for col in df_.select_dtypes(include=["object"]).columns:
        df_[col] = df_[col].astype("category")

# Oversample the training split only; validation and test keep their distribution
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

print(f"Original Training Set: {X_train.shape}")
print(f"Oversampled Training Set: {X_train_res.shape}")
print(f"Class distribution after oversampling: {np.bincount(y_train_res)}")

# Ordinal-encode the categorical columns; the encoder is fitted on the
# (oversampled) training data and categories unseen there are encoded as -1.
cat_cols = X_train_res.select_dtypes(include=["category"]).columns
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Skip encoding entirely when the data has no categorical columns
if len(cat_cols) > 0:
    X_train_res[cat_cols] = oe.fit_transform(X_train_res[cat_cols])
    X_valid[cat_cols] = oe.transform(X_valid[cat_cols])
    X_test[cat_cols] = oe.transform(X_test[cat_cols])

print(f"Final Training Set: {X_train_res.shape}, Validation Set: {X_valid.shape}, Test Set: {X_test.shape}")

# HYPOTHESIS 5: full grid over n_estimators x max_depth x subsample
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'max_depth': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22],
    'subsample': [1.0, 0.8, 0.6, 0.4]
}

# Number of grid points, used for the progress output of the search loop
total_combinations = len(param_grid['n_estimators']) * len(param_grid['max_depth']) * len(param_grid['subsample'])
print(f"Teste {total_combinations} Parameterkombinationen")

def evaluate_with_native_xgb(n_estimators, max_depth, subsample, X_data, y_data):
    """Score one hyper-parameter combination with manual 3-fold CV.

    Uses the native XGBoost training API (``xgb.train`` on ``DMatrix``
    objects) instead of the scikit-learn wrapper.

    Args:
        n_estimators: Number of boosting rounds passed to ``xgb.train``.
        max_depth: Value passed as the booster's ``max_depth``.
        subsample: Value passed as the booster's ``subsample``.
        X_data: Feature table; must support ``.iloc`` row selection.
        y_data: Labels aligned with ``X_data``; must support ``.iloc``.

    Returns:
        dict with the three parameter values plus ``f1_mean`` and ``f1_std``,
        the mean and standard deviation of the weighted F1 over the folds.
    """
    # The booster configuration is the same for every fold, so build it once.
    booster_params = {
        'objective': 'multi:softmax',
        'num_class': 3,
        'eval_metric': 'mlogloss',
        'max_depth': max_depth,
        'subsample': subsample,
        'random_state': 42,
        'verbosity': 0
    }
    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    fold_scores = []
    for fit_rows, holdout_rows in splitter.split(X_data, y_data):
        holdout_labels = y_data.iloc[holdout_rows]
        fit_matrix = xgb.DMatrix(X_data.iloc[fit_rows], label=y_data.iloc[fit_rows])
        holdout_matrix = xgb.DMatrix(X_data.iloc[holdout_rows], label=holdout_labels)

        booster = xgb.train(booster_params, fit_matrix, num_boost_round=n_estimators, verbose_eval=False)

        fold_scores.append(
            f1_score(holdout_labels, booster.predict(holdout_matrix), average='weighted')
        )

    return {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'subsample': subsample,
        'f1_mean': np.mean(fold_scores),
        'f1_std': np.std(fold_scores)
    }

# Pool the oversampled training rows with the validation split; the grid
# search runs its own stratified K-fold on this pooled set.
# NOTE(review): X_train_res was oversampled before the folds are drawn, so
# copies of the same row can presumably land on both sides of a fold and make
# the CV F1 optimistic -- confirm this is the intended setup for this hypothesis.
X_combined = pd.concat([X_train_res, X_valid])
y_combined = pd.concat([y_train_res, y_valid])

print("Starte Native XGBoost Grid Search...")
start_time = time.time()

# Evaluate every grid point; enumerate starts at 1 for the progress counter
all_results = []
for i, (n_est, max_d, sub) in enumerate(
    product(param_grid['n_estimators'], 
            param_grid['max_depth'], 
            param_grid['subsample']), 1):
    
    # Progress prefix stays on the same line; the score is appended below
    print(f"[{i:4d}/{total_combinations}] n_est={n_est:4d}, max_depth={max_d:2d}, subsample={sub:.1f}", end=" ")
    
    result = evaluate_with_native_xgb(n_est, max_d, sub, X_combined, y_combined)
    all_results.append(result)
    
    print(f"-> F1: {result['f1_mean']:.4f}±{result['f1_std']:.3f}")

total_time = time.time() - start_time
print(f"\nNative Grid Search abgeschlossen in {total_time/60:.1f} Minuten")

# Collect the results and pick the configuration with the highest mean CV F1.
# The selected row is a float-typed Series, so the integer parameters are cast
# back to int for display (the same cast is used for the final model below).
results_df = pd.DataFrame(all_results)
best_config = results_df.loc[results_df['f1_mean'].idxmax()]

print(f"Best parameters: {{'n_estimators': {int(best_config['n_estimators'])}, 'max_depth': {int(best_config['max_depth'])}, 'subsample': {best_config['subsample']:.1f}}}")
print(f"Best CV score: {best_config['f1_mean']:.4f}")

# Refit the best configuration on train+valid and evaluate on the held-out test set
dtrain_final = xgb.DMatrix(X_combined, label=y_combined)
dtest_final = xgb.DMatrix(X_test, label=y_test)

final_params = {
    'objective': 'multi:softmax',
    'num_class': 3,
    'eval_metric': 'mlogloss',
    'max_depth': int(best_config['max_depth']),
    'subsample': best_config['subsample'],
    'random_state': 42,
    'verbosity': 0
}

final_model = xgb.train(final_params, dtrain_final, 
                       num_boost_round=int(best_config['n_estimators']), 
                       verbose_eval=False)

y_final_pred = final_model.predict(dtest_final)

# Test metrics
test_acc = accuracy_score(y_test, y_final_pred)
test_f1 = f1_score(y_test, y_final_pred, average='weighted')

print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")

print("Classification Report:")
print(classification_report(y_test, y_final_pred))

# Confusion matrix over the 3 mapped classes. The target was collapsed to
# {0, 1, 2} and the model is trained with num_class=3, so the matrix and its
# labels must use those three classes, not the original five AdoptionSpeed levels.
cm = confusion_matrix(y_test, y_final_pred, labels=[0, 1, 2])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Sehr schnell (0)", "Mittel (1-3)", "Gar nicht (4)"])
disp.plot(cmap="Blues")
plt.title("Confusion Matrix - Native XGBoost Best Model")
plt.show()

# Top 10 configurations by mean CV F1
print(f"\nTop 10 Konfigurationen:")
print("="*100)
top_10_configs = results_df.nlargest(10, 'f1_mean')

for idx, row in top_10_configs.iterrows():
    print(f"F1: {row['f1_mean']:.4f}±{row['f1_std']:.3f} | "
          f"n_estimators={int(row['n_estimators']):4d}, max_depth={int(row['max_depth']):2d}, subsample={row['subsample']:.1f}")

# Class distribution of the test set
print(f"\nKlassen-Verteilung im Test Set:")
unique, counts = np.unique(y_test, return_counts=True)
class_names = ["Sehr schnell (0)", "Mittel (1-3)", "Gar nicht (4)"]
for cls, count in zip(unique, counts):
    print(f"Klasse {cls} ({class_names[cls]}): {count} Samples")

### 3 Fold

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt
from itertools import product
import warnings
import time
warnings.filterwarnings('ignore')

# Load the combined dataset (the path suggests it was already preprocessed
# upstream -- NOTE(review): confirm what that preprocessing covers).
df = pd.read_csv("Data/preprocessed/combined_preprocessed.csv")

# Labels von 5 → 3 Klassen mappen
def map_labels(x):
    """Collapse a 5-level AdoptionSpeed value into one of 3 classes.

    Returns 0 for an input equal to 0 (adopted very quickly), 2 for an input
    equal to 4 (not adopted at all) and 1 for every other input (the
    intermediate speeds 1, 2 and 3).
    """
    if x == 0:
        return 0
    return 2 if x == 4 else 1

# Derive the 3-class target from the original AdoptionSpeed column.
df['target'] = df['AdoptionSpeed'].map(map_labels)

# Separate features and labels (both label columns are dropped from X)
X = df.drop(columns=['AdoptionSpeed', 'target'])
y = df['target'].astype(int)

# Stratified split: 70% train / 30% temporary hold-out
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Stratified split: hold-out is halved into validation and test
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print("Train classes:", np.unique(y_train))
print("Valid classes:", np.unique(y_valid))
print("Test classes:", np.unique(y_test))

# Convert object-typed (string) columns to the pandas category dtype in each split
for df_ in [X_train, X_valid, X_test]:
    for col in df_.select_dtypes(include=["object"]).columns:
        df_[col] = df_[col].astype("category")

# Oversample the training split only; validation and test keep their distribution
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

print(f"Original Training Set: {X_train.shape}")
print(f"Oversampled Training Set: {X_train_res.shape}")
print(f"Class distribution after oversampling: {np.bincount(y_train_res)}")

# Ordinal-encode the categorical columns; the encoder is fitted on the
# (oversampled) training data and categories unseen there are encoded as -1.
cat_cols = X_train_res.select_dtypes(include=["category"]).columns
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Skip encoding entirely when the data has no categorical columns
if len(cat_cols) > 0:
    X_train_res[cat_cols] = oe.fit_transform(X_train_res[cat_cols])
    X_valid[cat_cols] = oe.transform(X_valid[cat_cols])
    X_test[cat_cols] = oe.transform(X_test[cat_cols])

print(f"Final Training Set: {X_train_res.shape}, Validation Set: {X_valid.shape}, Test Set: {X_test.shape}")

# HYPOTHESIS 5: full grid over n_estimators x max_depth x subsample
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500, 600, 700],
    'max_depth': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
    'subsample': [1.0, 0.8, 0.6, 0.4]
}

# Number of grid points, used for the progress output of the search loop
total_combinations = len(param_grid['n_estimators']) * len(param_grid['max_depth']) * len(param_grid['subsample'])
print(f"Teste {total_combinations} Parameterkombinationen")

def evaluate_with_native_xgb(n_estimators, max_depth, subsample, X_data, y_data):
    """Score one hyper-parameter combination with manual 3-fold CV.

    Uses the native XGBoost training API (``xgb.train`` on ``DMatrix``
    objects) instead of the scikit-learn wrapper.

    Args:
        n_estimators: Number of boosting rounds passed to ``xgb.train``.
        max_depth: Value passed as the booster's ``max_depth``.
        subsample: Value passed as the booster's ``subsample``.
        X_data: Feature table; must support ``.iloc`` row selection.
        y_data: Labels aligned with ``X_data``; must support ``.iloc``.

    Returns:
        dict with the three parameter values plus ``f1_mean`` and ``f1_std``,
        the mean and standard deviation of the weighted F1 over the folds.
    """
    # The booster configuration is the same for every fold, so build it once.
    booster_params = {
        'objective': 'multi:softmax',
        'num_class': 3,
        'eval_metric': 'mlogloss',
        'max_depth': max_depth,
        'subsample': subsample,
        'random_state': 42,
        'verbosity': 0
    }
    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    fold_scores = []
    for fit_rows, holdout_rows in splitter.split(X_data, y_data):
        holdout_labels = y_data.iloc[holdout_rows]
        fit_matrix = xgb.DMatrix(X_data.iloc[fit_rows], label=y_data.iloc[fit_rows])
        holdout_matrix = xgb.DMatrix(X_data.iloc[holdout_rows], label=holdout_labels)

        booster = xgb.train(booster_params, fit_matrix, num_boost_round=n_estimators, verbose_eval=False)

        fold_scores.append(
            f1_score(holdout_labels, booster.predict(holdout_matrix), average='weighted')
        )

    return {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'subsample': subsample,
        'f1_mean': np.mean(fold_scores),
        'f1_std': np.std(fold_scores)
    }

# Pool the oversampled training rows with the validation split; the grid
# search runs its own stratified K-fold on this pooled set.
# NOTE(review): X_train_res was oversampled before the folds are drawn, so
# copies of the same row can presumably land on both sides of a fold and make
# the CV F1 optimistic -- confirm this is the intended setup for this hypothesis.
X_combined = pd.concat([X_train_res, X_valid])
y_combined = pd.concat([y_train_res, y_valid])

print("Starte Native XGBoost Grid Search...")
start_time = time.time()

# Evaluate every grid point; enumerate starts at 1 for the progress counter
all_results = []
for i, (n_est, max_d, sub) in enumerate(
    product(param_grid['n_estimators'], 
            param_grid['max_depth'], 
            param_grid['subsample']), 1):
    
    # Progress prefix stays on the same line; the score is appended below
    print(f"[{i:4d}/{total_combinations}] n_est={n_est:4d}, max_depth={max_d:2d}, subsample={sub:.1f}", end=" ")
    
    result = evaluate_with_native_xgb(n_est, max_d, sub, X_combined, y_combined)
    all_results.append(result)
    
    print(f"-> F1: {result['f1_mean']:.4f}±{result['f1_std']:.3f}")

total_time = time.time() - start_time
print(f"\nNative Grid Search abgeschlossen in {total_time/60:.1f} Minuten")

# Collect the results and pick the configuration with the highest mean CV F1.
# The selected row is a float-typed Series, so the integer parameters are cast
# back to int for display (the same cast is used for the final model below).
results_df = pd.DataFrame(all_results)
best_config = results_df.loc[results_df['f1_mean'].idxmax()]

print(f"Best parameters: {{'n_estimators': {int(best_config['n_estimators'])}, 'max_depth': {int(best_config['max_depth'])}, 'subsample': {best_config['subsample']:.1f}}}")
print(f"Best CV score: {best_config['f1_mean']:.4f}")

# Refit the best configuration on train+valid and evaluate on the held-out test set
dtrain_final = xgb.DMatrix(X_combined, label=y_combined)
dtest_final = xgb.DMatrix(X_test, label=y_test)

final_params = {
    'objective': 'multi:softmax',
    'num_class': 3,
    'eval_metric': 'mlogloss',
    'max_depth': int(best_config['max_depth']),
    'subsample': best_config['subsample'],
    'random_state': 42,
    'verbosity': 0
}

final_model = xgb.train(final_params, dtrain_final, 
                       num_boost_round=int(best_config['n_estimators']), 
                       verbose_eval=False)

y_final_pred = final_model.predict(dtest_final)

# Test metrics
test_acc = accuracy_score(y_test, y_final_pred)
test_f1 = f1_score(y_test, y_final_pred, average='weighted')

print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")

print("Classification Report:")
print(classification_report(y_test, y_final_pred))

# Confusion matrix over the 3 mapped classes
cm = confusion_matrix(y_test, y_final_pred, labels=[0, 1, 2])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Sehr schnell (0)", "Mittel (1-3)", "Gar nicht (4)"])
disp.plot(cmap="Blues")
plt.title("Confusion Matrix - Native XGBoost Best Model")
plt.show()

# Top 10 configurations by mean CV F1
print(f"\nTop 10 Konfigurationen:")
print("="*100)
top_10_configs = results_df.nlargest(10, 'f1_mean')

for idx, row in top_10_configs.iterrows():
    print(f"F1: {row['f1_mean']:.4f}±{row['f1_std']:.3f} | "
          f"n_estimators={int(row['n_estimators']):4d}, max_depth={int(row['max_depth']):2d}, subsample={row['subsample']:.1f}")

# Class distribution of the test set
print(f"\nKlassen-Verteilung im Test Set:")
unique, counts = np.unique(y_test, return_counts=True)
class_names = ["Sehr schnell (0)", "Mittel (1-3)", "Gar nicht (4)"]
for cls, count in zip(unique, counts):
    print(f"Klasse {cls} ({class_names[cls]}): {count} Samples")

## Hypothese 5 post Oversampling

### 3 Fold

In [None]:
#