In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import numpy as np 
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Logistic regression
#
# Hyper-parameters are copied from the recorded "Best parameters" result
# (presumably the output of an earlier hyper-parameter search):
# {'C': 0.001, 'class_weight': {0: 2, 1: 1}, 'fit_intercept': False, 'max_iter': 10000, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'newton-cg', 'tol': 1e-07}
lr = LogisticRegression(
    # regularisation
    penalty='l2', C=0.001,
    # model form: no bias term, class 0 weighted twice as heavily as class 1
    fit_intercept=False, class_weight={0: 2, 1: 1}, multi_class='multinomial',
    # optimiser settings
    solver='newton-cg', max_iter=10000, tol=1e-07,
)

# Random forest
#
# Hyper-parameters are copied from the recorded "Best parameters" result:
# {'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 1, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
#
# random_state is not part of the recorded parameters; it is pinned here so
# that bootstrap sampling and feature selection (and therefore every metric
# printed further down) are reproducible on Restart & Run All.
rf = RandomForestClassifier(
    class_weight=None,
    criterion='gini',
    max_depth=None,          # trees grow until the leaf/split limits below stop them
    max_features=1,          # integer: a single candidate feature per split
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=400,
    random_state=42,         # fixed seed for reproducibility
)

# SVM
#
# Hyper-parameters are copied from the recorded "Best parameters" result:
# {'C': 100, 'class_weight': 'balanced', 'coef0': 0, 'degree': 2, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 100000, 'tol': 1e-05}
#
# Two settings are added on top of the recorded parameters:
#   * probability=True  - required because predict_proba is called on this
#                         model later in the notebook.
#   * random_state=42   - probability=True fits an internal cross-validation
#                         that shuffles the data; pinning the seed keeps the
#                         resulting probabilities reproducible.
svc = SVC(
    C=100,
    class_weight='balanced',
    coef0=0,                 # kept from the search; not used by the rbf kernel
    degree=2,                # kept from the search; not used by the rbf kernel
    gamma=0.1,
    kernel='rbf',
    max_iter=100000,
    tol=1e-05,
    probability=True,
    random_state=42,
)

# XGBoost
#
# Hyper-parameters are copied from the recorded "Best parameters" result:
# {'alpha': 1, 'colsample_bytree': 1.0, 'gamma': 0, 'lambda': 1, 'learning_rate': 0.5, 'max_delta_step': 1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 1000, 'subsample': 1.0}
#
# The recorded 'alpha' / 'lambda' entries are passed through the scikit-learn
# wrapper's own argument names, reg_alpha / reg_lambda. The previous spelling
# `lambda_` is not an XGBClassifier parameter, so the L2 term was never
# explicitly set.
xgb = XGBClassifier(
    reg_alpha=1,             # L1 regularisation (recorded as 'alpha')
    reg_lambda=1,            # L2 regularisation (recorded as 'lambda')
    colsample_bytree=1.0,
    gamma=0,
    learning_rate=0.5,
    max_delta_step=1,
    max_depth=5,
    min_child_weight=1,
    n_estimators=1000,
    subsample=1.0
)



In [3]:
# Fetch the train/test partitions from the project-local `one_hot` module.
# presumably the features are one-hot encoded (module name only) — TODO confirm
import one_hot

split_parts = one_hot.splits()
X_train, X_test, y_train, y_test = split_parts

In [4]:
# Train every classifier on the same training split, in the original order.
for estimator in (lr, rf, svc, xgb):
    estimator.fit(X_train, y_train)

# Kept from the original cell: emits a single blank line as the cell output.
print()




In [5]:
# Per-model decision thresholds, obtained from the project-local
# `optimal_threshold.f1score` helper (presumably the cut-off that maximises
# F1 — TODO confirm against that module).
#
# NOTE(review): the thresholds are chosen on X_test / y_test, the same split
# the final report is computed on, so the reported metrics are likely
# optimistic. Consider tuning on a separate validation split.
import optimal_threshold

lr_rate, rf_rate, svc_rate, xgb_rate = (
    optimal_threshold.f1score(model, X_test, y_test)
    for model in (lr, rf, svc, xgb)
)

print(
    f"LogisticRegression: {lr_rate:.4f}\n"
    f"RandomForest:       {rf_rate:.4f}\n"
    f"SVM:                {svc_rate:.4f}\n"
    f"XGBoost:            {xgb_rate:.4f}"
)

# Only the models listed here take part in the ensemble vote; 'svc' and
# 'xgb' are currently left out (add svc_rate / xgb_rate here to re-enable).
thresholds = dict(lr=lr_rate, rf=rf_rate)

LogisticRegression: 0.5241
RandomForest:       0.2950
SVM:                0.3388
XGBoost:            0.5866


In [6]:
# Column 0 of predict_proba: the probability of the first class in each
# model's classes_ (class 0, going by the labels in the report below).
proba_lr, proba_rf, proba_svc, proba_xgb = (
    clf.predict_proba(X_test)[:, 0] for clf in (lr, rf, svc, xgb)
)

# One 0/1 vote per active model: a sample gets 1 when its column-0
# probability falls below that model's threshold, otherwise 0.
# NOTE(review): assumes the thresholds are expressed on the column-0
# probability scale — confirm against optimal_threshold.f1score.
# 'svc' and 'xgb' are computed above but currently excluded from the vote.
active_probas = {'lr': proba_lr, 'rf': proba_rf}
preds_stacked = np.vstack([
    np.where(proba < thresholds[key], 1, 0)
    for key, proba in active_probas.items()
]).T

In [7]:
from sklearn.metrics import classification_report, f1_score, roc_auc_score

# Unanimous vote: a sample is labelled 1 only when every stacked model voted 1.
# The number of voters is read from preds_stacked itself (one column per
# model) rather than from len(thresholds), so the rule stays correct even if
# the thresholds dict and the stacked columns get out of sync.
n_voters = preds_stacked.shape[1]
voting_preds = np.sum(preds_stacked, axis=1) == n_voters
final_preds = voting_preds.astype(int)

print("Classification Report:\n", classification_report(y_test, final_preds))
# NOTE(review): roc_auc_score receives hard 0/1 labels here, not scores, so
# the value equals the balanced accuracy (mean of the per-class recalls)
# rather than a ranking-based AUC.
print("ROC AUC:", roc_auc_score(y_test, final_preds))

Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.70      0.70        10
           1       0.93      0.93      0.93        41

    accuracy                           0.88        51
   macro avg       0.81      0.81      0.81        51
weighted avg       0.88      0.88      0.88        51

ROC AUC: 0.8134146341463414
