In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV,train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report,make_scorer,precision_recall_curve, roc_auc_score,roc_curve
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import xgboost as xgb


In [2]:
from one_hot import splits 
X_train, X_val, X_test, y_train, y_val, y_test = splits()

In [3]:
# logistic regression
# Best parameters: {'C': 10, 'class_weight': 'balanced', 'fit_intercept': True, 'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg', 'tol': 0.01}

logistic = LogisticRegression(
    tol = 0.01,
    C = 10,
    class_weight = 'balanced',
    max_iter = 100,
    solver = 'newton-cg',
    penalty = 'l2',
    fit_intercept = True,
    random_state = 50)

In [4]:
# svm
# Best parameters: {'C': 0.1, 'class_weight': 'balanced', 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}

svm = SVC(
    C = 0.1,
    kernel = 'rbf',
    gamma = 'scale',
    degree = 2,
    class_weight = 'balanced',
    probability = True,
    random_state = 50
)

In [5]:
# random forest
# Best parameters: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}

random_forest = RandomForestClassifier(
    criterion = 'gini',
    n_estimators = 100,
    max_depth = 5,
    min_samples_split = 2,
    min_samples_leaf = 1,
    max_features = 'sqrt',
    class_weight = 'balanced',
    random_state=50)

In [6]:
# xgboost
# Best parameters: {'alpha': 0, 'colsample_bytree': 0.7, 'gamma': 0, 'lambda': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 50, 'subsample': 0.7}

xgb = xgb.XGBClassifier(
    alpha = 0,
    colsample_bytree = 0.7,
    gamma = 0,
    lambda_ = 0,
    learning_rate = 0.1,
    max_delta_step = 0,
    max_depth = 3,
    min_child_weight = 1,
    n_estimators = 50,
    subsample = 0.7,
    scale_pos_weight = 1,
    use_label_encoder = False,
    eval_metric = 'logloss',
    random_state = 50
)

In [7]:
logistic.fit(X_train, y_train)

svm.fit(X_train, y_train)

random_forest.fit(X_train, y_train)

xgb.fit(X_train, y_train)

In [8]:
y_proba_logistic = logistic.predict_proba(X_test)[:, 1]
y_proba_svm = svm.predict_proba(X_test)[:, 1]
y_proba_rf = random_forest.predict_proba(X_test)[:, 1]
y_proba_xgb = xgb.predict_proba(X_test)[:, 1]

In [9]:
import optimal_threshold

optimal_logistic = optimal_threshold.roc_auc(logistic, X_test, y_test)
print(optimal_logistic)

optimal_swm = optimal_threshold.roc_auc(svm, X_test, y_test)
print(optimal_swm)

optimal_rf = optimal_threshold.roc_auc(random_forest, X_test, y_test)
print(optimal_rf)

optimal_xgb = optimal_threshold.roc_auc(xgb, X_test, y_test)
print(optimal_xgb)

0.1991505222578659
0.8711454395748063
0.8060341162932524
0.94771975


In [10]:
y_pred_logistic = (y_proba_logistic >= optimal_logistic).astype(int)
y_pred_svm = (y_proba_svm >= optimal_swm).astype(int)
y_pred_rf = (y_proba_rf >= optimal_rf).astype(int)
y_pred_xgb = (y_proba_xgb >= optimal_xgb).astype(int)

In [11]:
voting_hard = VotingClassifier(
    estimators=[
        ('logistic', logistic),
        ('svm', svm),
        ('random_forest', random_forest),
        ('xgb', xgb)
    ],
    voting='hard'
)


voting_soft = VotingClassifier(
    estimators=[
        ('logistic', logistic),
        ('svm', svm),
        ('random_forest', random_forest),
        ('xgb', xgb)
    ],
    voting='soft'
)

In [12]:
voting_hard.fit(X_train, y_train)
y_pred_hard = voting_hard.predict(X_test)
accuracy_hard = accuracy_score(y_test, y_pred_hard)

In [13]:
conf_matrix = confusion_matrix(y_test, y_pred_hard)
print(f'Accuracy: {accuracy_hard:.2f}')
print('Confusion Matrix:')
print(conf_matrix)
print(classification_report(y_test, y_pred_hard))

Accuracy: 0.61
Confusion Matrix:
[[ 5  7]
 [13 26]]
              precision    recall  f1-score   support

           0       0.28      0.42      0.33        12
           1       0.79      0.67      0.72        39

    accuracy                           0.61        51
   macro avg       0.53      0.54      0.53        51
weighted avg       0.67      0.61      0.63        51



In [14]:
voting_soft.fit(X_train, y_train)
y_pred_soft = voting_soft.predict(X_test)
accuracy_soft = accuracy_score(y_test, y_pred_soft)

In [15]:
conf_matrix = confusion_matrix(y_test, y_pred_soft)
print(f'Accuracy: {accuracy_soft:.2f}')
print('Confusion Matrix:')
print(conf_matrix)
print(classification_report(y_test, y_pred_soft))

Accuracy: 0.73
Confusion Matrix:
[[ 0 12]
 [ 2 37]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.76      0.95      0.84        39

    accuracy                           0.73        51
   macro avg       0.38      0.47      0.42        51
weighted avg       0.58      0.73      0.64        51

