In [11]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV,train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report,make_scorer,precision_recall_curve,average_precision_score
from sklearn.linear_model import LogisticRegression, Lasso
import xgboost as xgb



In [12]:
from one_hot import splits 
X_train, X_val, X_test, y_train, y_val, y_test = splits()

In [13]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

In [14]:
from collections import Counter

class_counts = Counter(y_train) 
negative_class_count = class_counts[0]
positive_class_count = class_counts[1]
class_weight_ratio = negative_class_count / positive_class_count

In [15]:
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',  
    eval_metric='logloss',
    early_stopping_rounds=10,
    scale_pos_weight=class_weight_ratio
)

In [16]:
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.3, 0.5],
    'n_estimators': [50, 100, 200, 500],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.3],
    'min_child_weight': [1, 5, 10],
    'max_delta_step': [0, 1, 5],
    'lambda': [0, 0.1, 1, 10],
    'alpha': [0, 0.1, 1, 10]
    }

In [17]:
scorer = make_scorer(accuracy_score, average='macro')

grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring=scorer,  
    cv=5,
    verbose=1,
    n_jobs=-1
)

In [18]:
grid_search.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

Fitting 5 folds for each of 248832 candidates, totalling 1244160 fits


In [19]:
print(grid_search.best_params_)
best_model = grid_search.best_estimator_


{'alpha': 0, 'colsample_bytree': 0.7, 'gamma': 0, 'lambda': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 50, 'subsample': 0.7}


In [20]:
import optimal_threshold

optimal_test_2 = optimal_threshold.roc_auc(best_model, X_test, y_test)
print(f"Optimal threshold: {optimal_test_2:.2f}")

Optimal threshold: 0.82


In [21]:
y_proba = best_model.predict_proba(X_test)[:, 1]  

y_test_pred = np.where(y_proba > optimal_test_2, 1, 0)
accuracy = accuracy_score(y_test, y_test_pred)

conf_matrix = confusion_matrix(y_test, y_test_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:')
print(conf_matrix)
print(classification_report(y_test, y_test_pred))

Accuracy: 0.29
Confusion Matrix:
[[12  0]
 [36  3]]
              precision    recall  f1-score   support

           0       0.25      1.00      0.40        12
           1       1.00      0.08      0.14        39

    accuracy                           0.29        51
   macro avg       0.62      0.54      0.27        51
weighted avg       0.82      0.29      0.20        51

