# Classification

## Method 1: Stratified K-Fold cross-validation

In [None]:
# Option 1: with resampling
#
# Each candidate classifier is scored with repeated stratified k-fold
# cross-validation (3 splits x 3 repeats = 9 folds per model). Inside every
# fold the `over` resampler is applied to the training part only, so the
# validation part is scored as-is.

models = {
    'Logistic Regression': LogisticRegression(penalty='l2', max_iter=10000, random_state=0),
    'Decision Tree': DecisionTreeClassifier(random_state=0),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=0),
    'SVM': SVC(probability=True, random_state=0),
    'SVM2': SVC(kernel='sigmoid', probability=True, random_state=0),
    'LightGBM': LGBMClassifier(random_state=0)
}

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=14)
results = []

for model_name, model in models.items():
    train_aucs = []
    val_aucs = []

    # Fold numbers come from enumerate and restart at 1 for every model, so
    # the log reads "Fold 1" .. "Fold 9" per model instead of a counter that
    # keeps growing across models.
    for fold_number, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train), start=1):
        X_t, y_t = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

        # Resample the training part only; the validation part stays untouched.
        X_t_res, y_t_res = over.fit_resample(X_t, y_t)

        model.fit(X_t_res, y_t_res)

        # Score with positive-class probabilities when the estimator exposes
        # them, otherwise fall back to its raw decision values.
        if hasattr(model, "predict_proba"):
            y_t_pred = model.predict_proba(X_t_res)[:, 1]
            y_val_pred = model.predict_proba(X_val)[:, 1]
        else:
            y_t_pred = model.decision_function(X_t_res)
            y_val_pred = model.decision_function(X_val)

        # Train AUC is measured on the resampled data the model was fitted on.
        train_auc = roc_auc_score(y_t_res, y_t_pred)
        val_auc = roc_auc_score(y_val, y_val_pred)

        train_aucs.append(train_auc)
        val_aucs.append(val_auc)

        print(f"{model_name} - Fold {fold_number}: Train AUC = {train_auc:.4f}, Val AUC = {val_auc:.4f}")

    # One summary row per model, averaged over all of its folds.
    results.append({
        'Model': model_name,
        'Mean Train AUC': np.mean(train_aucs),
        'Mean Validation AUC': np.mean(val_aucs)
    })

# Resample the whole training set once, then refit the selected model on it.
X_train_res, y_train_res = over.fit_resample(X_train, y_train)

# After choosing the best model - SVM (RBF kernel)
final_svm = SVC(probability=True, random_state=0).fit(X_train_res, y_train_res)

# Positive-class probabilities: on the resampled training data and on the
# untouched test set.
y_train_probs = final_svm.predict_proba(X_train_res)[:, 1]
y_test_probs = final_svm.predict_proba(X_test)[:, 1]

train_auc = roc_auc_score(y_train_res, y_train_probs)
test_auc = roc_auc_score(y_test, y_test_probs)

# Option 2: with / without class weights
#
# Exactly one model set is built per run, selected by the flag below:
#   2.1 With class weights -> CV_USE_CLASS_WEIGHTS = True
#   2.2 No class weights   -> CV_USE_CLASS_WEIGHTS = False
# Two consecutive `models = {...}` assignments would silently discard the
# first set, so the choice is made explicit here.
CV_USE_CLASS_WEIGHTS = False

# class_weight=None (scikit-learn) and is_unbalance=False (LightGBM) are the
# libraries' defaults, so the unweighted set matches estimators built without
# these arguments.
_cv_class_weight = 'balanced' if CV_USE_CLASS_WEIGHTS else None

models = {
    'Logistic Regression': LogisticRegression(penalty='l2', class_weight=_cv_class_weight, max_iter=10000, random_state=0),
    'Decision Tree': DecisionTreeClassifier(class_weight=_cv_class_weight, random_state=0),
    'Random Forest': RandomForestClassifier(class_weight=_cv_class_weight, n_estimators=100, random_state=0),
    'SVM': SVC(class_weight=_cv_class_weight, probability=True, random_state=0),
    'SVM2': SVC(class_weight=_cv_class_weight, kernel='sigmoid', probability=True, random_state=0),
    'LightGBM': LGBMClassifier(is_unbalance=CV_USE_CLASS_WEIGHTS, random_state=0)
}

# Manually repeated stratified 3-fold CV: every repeat reshuffles the folds
# using the repeat index as the seed, and every model in `models` is scored
# on each fold. One summary row per (repeat, model) goes into `results`.
results = []

for repeat in range(3):
    print(f"\nRepeat {repeat + 1}/3")
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=repeat)

    for model_name, model in models.items():
        train_auc_scores, val_auc_scores = [], []

        for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
            X_t, y_t = X_train.iloc[train_idx], y_train.iloc[train_idx]
            X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

            model.fit(X_t, y_t)

            # Positive-class probabilities when available, raw decision
            # values otherwise.
            if hasattr(model, "predict_proba"):
                y_t_prob = model.predict_proba(X_t)[:, 1]
                y_val_prob = model.predict_proba(X_val)[:, 1]
            else:
                y_t_prob = model.decision_function(X_t)
                y_val_prob = model.decision_function(X_val)

            train_auc = roc_auc_score(y_t, y_t_prob)
            val_auc = roc_auc_score(y_val, y_val_prob)
            train_auc_scores.append(train_auc)
            val_auc_scores.append(val_auc)

            print(f"  {model_name} - Fold {fold + 1}: Train AUC = {train_auc:.4f}, Val AUC = {val_auc:.4f}")

        # Average the fold scores of this model within the current repeat.
        summary_row = {
            'Repeat': repeat + 1,
            'Model': model_name,
            'Mean Train AUC': np.mean(train_auc_scores),
            'Mean Validation AUC': np.mean(val_auc_scores),
        }
        results.append(summary_row)


# After choosing the best model
#
# Both finalists are refitted on the full (non-resampled) training set and
# scored by ROC AUC on train and test. The second section rebinds
# `y_train_probs`, `train_auc`, `y_test_probs` and `test_auc`, so after this
# block those names hold the Logistic Regression results.

# With class weights - SVM was the best
final_svm = SVC(class_weight='balanced', probability=True, random_state=0).fit(X_train, y_train)

y_train_probs = final_svm.predict_proba(X_train)[:, 1]
y_test_probs = final_svm.predict_proba(X_test)[:, 1]
train_auc = roc_auc_score(y_train, y_train_probs)
test_auc = roc_auc_score(y_test, y_test_probs)

# Without class weights - Logistic Regression was the best
final_lr = LogisticRegression(penalty='l2', max_iter=10000, random_state=0).fit(X_train, y_train)

y_train_probs = final_lr.predict_proba(X_train)[:, 1]
y_test_probs = final_lr.predict_proba(X_test)[:, 1]
train_auc = roc_auc_score(y_train, y_train_probs)
test_auc = roc_auc_score(y_test, y_test_probs)

## Method 2: Train/Dev/Test

In [None]:
# Candidate classifiers for the train/dev/test comparison.
#
# Exactly one configuration is built per run, selected by the flag below:
#   Option 1: the models without class weights -> SPLIT_USE_CLASS_WEIGHTS = False
#   Option 2: the models with class weights    -> SPLIT_USE_CLASS_WEIGHTS = True
# Defining both options back to back would silently discard Option 1, so the
# choice is made explicit here.
# Option 1 also fits the resampling method.
SPLIT_USE_CLASS_WEIGHTS = True

# class_weight=None (scikit-learn) and is_unbalance=False (LightGBM) are the
# libraries' defaults, so Option 1 matches estimators built without them.
_split_class_weight = 'balanced' if SPLIT_USE_CLASS_WEIGHTS else None

lr = LogisticRegression(penalty='l2', class_weight=_split_class_weight, max_iter=10000, random_state=0)
tree = DecisionTreeClassifier(class_weight=_split_class_weight, random_state=0)
rf = RandomForestClassifier(class_weight=_split_class_weight, n_estimators=100, random_state=0)
svm = SVC(class_weight=_split_class_weight, probability=True, random_state=0)
svm2 = SVC(class_weight=_split_class_weight, kernel='sigmoid', probability=True, random_state=0)
lgbm = LGBMClassifier(is_unbalance=SPLIT_USE_CLASS_WEIGHTS, random_state=0)

# Fit all models
for _clf in (lr, tree, rf, svm, svm2, lgbm):
    _clf.fit(x_train, y_train)

# Predict: full predict_proba output (one column per class) for the train and
# dev sets of every model.
y_proba_lr_train = lr.predict_proba(x_train)
y_proba_lr_dev = lr.predict_proba(x_dev)

y_proba_tree_train = tree.predict_proba(x_train)
y_proba_tree_dev = tree.predict_proba(x_dev)

y_proba_rf_train = rf.predict_proba(x_train)
y_proba_rf_dev = rf.predict_proba(x_dev)

y_proba_svm_train = svm.predict_proba(x_train)
y_proba_svm_dev = svm.predict_proba(x_dev)

y_proba_svm2_train = svm2.predict_proba(x_train)
y_proba_svm2_dev = svm2.predict_proba(x_dev)

y_proba_lgbm_train = lgbm.predict_proba(x_train)
y_proba_lgbm_dev = lgbm.predict_proba(x_dev)

In [None]:
# Display names, in the same order as the prediction lists below.
models = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'SVM', 'SVM2', 'LightGBM']

# Metrics per model on the train and dev splits, computed by the
# `classificationMetrics` helper (defined elsewhere); its result is indexed
# by 'AUC', 'F1', 'Recall' and 'Precision' below.
train_metrics = [
    classificationMetrics(y_train, proba)
    for proba in (
        y_proba_lr_train,
        y_proba_tree_train,
        y_proba_rf_train,
        y_proba_svm_train,
        y_proba_svm2_train,
        y_proba_lgbm_train,
    )
]

dev_metrics = [
    classificationMetrics(y_dev, proba)
    for proba in (
        y_proba_lr_dev,
        y_proba_tree_dev,
        y_proba_rf_dev,
        y_proba_svm_dev,
        y_proba_svm2_dev,
        y_proba_lgbm_dev,
    )
]

# Column-oriented table: for each metric a "Train <metric>" column followed
# by the matching "Dev <metric>" column.
table = {'Model': models}
for metric in ('AUC', 'F1', 'Recall', 'Precision'):
    table[f'Train {metric}'] = [m[metric] for m in train_metrics]
    table[f'Dev {metric}'] = [m[metric] for m in dev_metrics]

In [None]:
# Choosing AUC: keep only the AUC columns and add the train-to-dev ratio
# as an extra column.
auc_columns = ["Model", "Train AUC", "Dev AUC"]
model_table = pd.DataFrame(table, columns=auc_columns)
model_table['Train/Dev Ratio'] = model_table["Train AUC"].div(model_table["Dev AUC"])

In [None]:
# Continuing with the best method and model - SVM2 (SVM with sigmoid kernel) + resampling
y_proba_svm2_test = svm2.predict_proba(x_test)

# Metrics of the already-fitted SVM2 on all three splits.
train_metrics_svm2, dev_metrics_svm2, test_metrics_svm2 = (
    classificationMetrics(y_true, y_proba)
    for y_true, y_proba in (
        (y_train, y_proba_svm2_train),
        (y_dev, y_proba_svm2_dev),
        (y_test, y_proba_svm2_test),
    )
)

# Hyperparameter tuning with GridSearchCV
# Sigmoid-kernel grid: 5 C values x 4 gamma values x 4 coef0 values.
param_grid = [
    dict(
        kernel=['sigmoid'],
        C=[0.01, 0.1, 1, 2, 3],
        gamma=[1e-3, 1e-2, 0.1, 1],
        coef0=[0, 0.5, 1, 2],
    )
]

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# Note: this rebinds `svm` to a fresh, unfitted estimator for the search.
svm = SVC(probability=True, random_state=0)
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(x_train, y_train)
print("Best parameters:", grid_search.best_params_)
best_svm = grid_search.best_estimator_

### Feature Importance

In [None]:
# Permutation importance of the tuned SVM, measured as the change in ROC AUC
# on the training data over 10 shuffles per feature.
result = permutation_importance(
    best_svm, x_train, y_train,
    scoring='roc_auc',
    n_repeats=10,
    random_state=0,
)

# NOTE(review): feature names are taken from `x`, while the importances are
# computed on `x_train` - presumably both share the same columns in the same
# order; confirm.
importances_df = pd.DataFrame({
    'Feature': x.columns,
    'Importance Mean': result.importances_mean,
})
importances_df = importances_df.sort_values(by='Importance Mean', ascending=False)

# Top 15 features, with the mean importance scaled by 100.
top_n = 15
top_features = importances_df.head(top_n)
means = top_features['Importance Mean'].mul(100)

### Confusion Matrix & AUC-ROC Curve

In [None]:
# Test-set probabilities of the final model (`final_y_proba_test` is defined
# elsewhere).
y_proba = final_y_proba_test

# Confusion matrix: a sample is labelled 1 when its probability reaches the
# decision threshold.
decision_threshold = 0.45
class_labels = [0, 1]
y_pred = (y_proba >= decision_threshold).astype(int)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)

# AUC-ROC curve
auc_roc = roc_auc_score(y_test, y_proba)

# Regression

In [None]:
# Baseline regressors. Each one is fitted on the training split with
# `weights_train` as sample weights, then used to predict the train, dev and
# test splits.

# Linear Regression
lr_reg = LinearRegression().fit(X_train_reg, y_train_reg, sample_weight=weights_train)
yhat_train_lr, yhat_dev_lr, yhat_test_lr = (
    lr_reg.predict(split) for split in (X_train_reg, X_dev_reg, X_test_reg)
)

# Ridge Regression
ridge_reg = Ridge().fit(X_train_reg, y_train_reg, sample_weight=weights_train)
yhat_train_ridge, yhat_dev_ridge, yhat_test_ridge = (
    ridge_reg.predict(split) for split in (X_train_reg, X_dev_reg, X_test_reg)
)

# Random Forest
rf_reg = RandomForestRegressor(n_estimators=200, max_depth=6, random_state=42)
rf_reg.fit(X_train_reg, y_train_reg, sample_weight=weights_train)
yhat_train_rf, yhat_dev_rf, yhat_test_rf = (
    rf_reg.predict(split) for split in (X_train_reg, X_dev_reg, X_test_reg)
)

# Gradient Boosting
gb_reg = GradientBoostingRegressor(n_estimators=200, max_depth=4, learning_rate=0.1, random_state=42)
gb_reg.fit(X_train_reg, y_train_reg, sample_weight=weights_train)
yhat_train_gb, yhat_dev_gb, yhat_test_gb = (
    gb_reg.predict(split) for split in (X_train_reg, X_dev_reg, X_test_reg)
)

# The chosen metric results: mean absolute error of every baseline regressor
# on the train, dev and test splits, rounded to 3 decimals, one row per model.
_baseline_predictions = [
    ('Linear Regression', yhat_train_lr, yhat_dev_lr, yhat_test_lr),
    ('Ridge', yhat_train_ridge, yhat_dev_ridge, yhat_test_ridge),
    ('Random Forest', yhat_train_rf, yhat_dev_rf, yhat_test_rf),
    ('Gradient Boosting', yhat_train_gb, yhat_dev_gb, yhat_test_gb),
]

mae_results = pd.DataFrame([
    {
        'Model': name,
        'Train MAE': round(mean_absolute_error(y_train_reg, yhat_train), 3),
        'Dev MAE': round(mean_absolute_error(y_dev_reg, yhat_dev), 3),
        'Test MAE': round(mean_absolute_error(y_test_reg, yhat_test), 3),
    }
    for name, yhat_train, yhat_dev, yhat_test in _baseline_predictions
])

In [None]:
# Hyperparameter Tuning for the best results


# 1. Random Forest

# Search space: 30 random draws are scored with 3-fold CV on negative MAE.
param_dist = dict(
    n_estimators=randint(200, 600),
    max_depth=[10, 12, 14, 16, 18],
    min_samples_split=randint(5, 20),
    min_samples_leaf=randint(3, 10),
    max_features=['sqrt', 'log2', 0.3, 0.5, None],
)

random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=0),
    param_distributions=param_dist,
    scoring='neg_mean_absolute_error',
    n_iter=30,
    cv=3,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
    random_state=42,
)
random_search.fit(X_train_reg, y_train_reg, sample_weight=weights_train)

# Rebuild a forest from the winning parameters (seeded with 42) and refit it
# on the full training split with the same sample weights.
best_params_rf = random_search.best_params_
best_rf = RandomForestRegressor(**best_params_rf, random_state=42)
best_rf.fit(X_train_reg, y_train_reg, sample_weight=weights_train)

# These rebind the yhat_*_rf names to the tuned model's predictions.
yhat_train_rf, yhat_dev_rf, yhat_test_rf = (
    best_rf.predict(split) for split in (X_train_reg, X_dev_reg, X_test_reg)
)

mae_train = mean_absolute_error(y_train_reg, yhat_train_rf)
mae_dev = mean_absolute_error(y_dev_reg, yhat_dev_rf)
mae_test = mean_absolute_error(y_test_reg, yhat_test_rf)

# Single-row summary of the tuned Random Forest.
rf_summary_row = {
    'Model': 'Random Search RF',
    'Best Params': str(best_params_rf),
    'Train MAE': round(mae_train, 3),
    'Dev MAE': round(mae_dev, 3),
    'Test MAE': round(mae_test, 3),
}
rf_random_results = pd.DataFrame([rf_summary_row])


# 2. Gradient Boosting

# Search space: 30 random draws are scored with 3-fold CV on negative MAE.
# NOTE(review): if `uniform` is scipy.stats.uniform, its arguments are
# (loc, scale), so learning_rate is drawn from [0.05, 0.12] rather than
# [0.05, 0.07] - confirm the intended range.
param_dist_gb = dict(
    n_estimators=randint(200, 400),
    learning_rate=uniform(0.05, 0.07),
    max_depth=[3, 4, 5],
    min_samples_split=randint(5, 20),
    min_samples_leaf=randint(5, 15),
    subsample=[0.7, 0.8, 0.9],
    max_features=['sqrt', 'log2', None],
)

random_search_gb = RandomizedSearchCV(
    estimator=GradientBoostingRegressor(random_state=0),
    param_distributions=param_dist_gb,
    scoring='neg_mean_absolute_error',
    n_iter=30,
    cv=3,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
    random_state=42,
)
random_search_gb.fit(X_train_reg, y_train_reg, sample_weight=weights_train)

# Rebuild a booster from the winning parameters (seeded with 42) and refit it
# on the full training split with the same sample weights.
best_params_gb = random_search_gb.best_params_
best_gb = GradientBoostingRegressor(**best_params_gb, random_state=42)
best_gb.fit(X_train_reg, y_train_reg, sample_weight=weights_train)

# These rebind the yhat_*_gb names to the tuned model's predictions.
yhat_train_gb, yhat_dev_gb, yhat_test_gb = (
    best_gb.predict(split) for split in (X_train_reg, X_dev_reg, X_test_reg)
)

mae_train_gb = mean_absolute_error(y_train_reg, yhat_train_gb)
mae_dev_gb = mean_absolute_error(y_dev_reg, yhat_dev_gb)
mae_test_gb = mean_absolute_error(y_test_reg, yhat_test_gb)

# Save to DataFrame
gb_summary_row = {
    'Model': 'Random Search GB',
    'Best Params': str(best_params_gb),
    'Train MAE': round(mae_train_gb, 3),
    'Dev MAE': round(mae_dev_gb, 3),
    'Test MAE': round(mae_test_gb, 3),
}
gb_random_results = pd.DataFrame([gb_summary_row])

In [None]:
# Feature Importance - Random Forest was chosen as the best model
# Importances reported by the fitted forest, paired with the training
# columns and sorted from most to least important.
importances = best_rf.feature_importances_
feature_names = X_train_reg.columns

feat_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

top_n = 15
top_feat = feat_importance_df.head(top_n)

# SHAP Feature Importance
# SHAP values are computed on the test split; the two summary plots are the
# bar view and the default view.
explainer = shap.TreeExplainer(best_rf)
shap_values = explainer(X_test_reg)
shap.summary_plot(shap_values, X_test_reg, plot_type="bar")
shap.summary_plot(shap_values, X_test_reg)

# One row per explained sample, one column per feature.
shap_df = pd.DataFrame(shap_values.values, columns=shap_values.feature_names)

# Mean signed SHAP value per feature. `mean_signed` was used below without
# being defined anywhere in this cell, so a clean run raised NameError.
# NOTE(review): derived from the variable name - confirm this is the intended
# aggregation.
mean_signed = shap_df.mean()

# The top_n features with the largest mean signed SHAP value, then reversed
# so the largest value comes last.
top_combined = mean_signed.sort_values(ascending=False).head(top_n)
features = top_combined.index[::-1]
values = top_combined.values[::-1]