<h1><b>Importing libraries</b></h1>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from sklearn.model_selection import GridSearchCV
import gc

<h1><b>Training the Model</b></h1>

In [2]:
# Load the preprocessed feature matrix and its target from disk.
X = pd.read_parquet('X_processed.parquet')

# read_parquet always yields a DataFrame. Squeeze a single-column target down
# to a 1-D Series: scikit-learn estimators expect a 1-D `y`, and a column
# vector triggers a DataConversionWarning on every fit. A frame with more than
# one column is returned unchanged by squeeze.
y = pd.read_parquet('y_processed.parquet').squeeze('columns')

In [3]:
# Reserve 20% of the rows as a held-out test set. The split is stratified on
# the target and seeded so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# The unsplit frames are not used again in this cell; drop the names and run a
# collection pass so their memory can be reclaimed.
del X
del y
gc.collect()

0

In [4]:
# Hyper-parameter search runs on a stratified subsample of the training split
# rather than on all of it.
TUNING_SAMPLE_SIZE = 100_000

if len(X_train) > TUNING_SAMPLE_SIZE:
    X_small, _, y_small, _ = train_test_split(
        X_train, y_train,
        train_size=TUNING_SAMPLE_SIZE,
        random_state=42,
        stratify=y_train
    )
else:
    # train_test_split raises ValueError when train_size is not smaller than
    # the number of available rows, so a small training split is used whole.
    X_small, y_small = X_train, y_train


In [5]:
# Base estimator for the search. `saga` is used because the grid below tries
# both L1 and L2 penalties. The solver shuffles the data, so a fixed
# random_state is required for repeatable results; 42 matches the seed used
# for the data splits.
log_reg = LogisticRegression(
    class_weight='balanced',
    solver='saga',
    max_iter=700,
    n_jobs=-1,
    random_state=42
)

# Candidate settings: penalty type and inverse regularisation strength C.
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 5, 10]
}

In [None]:
# Exhaustive search over `param_grid`, scoring each candidate by ROC-AUC with
# 2-fold cross-validation. n_jobs=-1 requests all cores, and verbose=2 turns on
# progress logging.
grid = GridSearchCV(
    log_reg,
    param_grid,
    cv=2,
    scoring='roc_auc',
    verbose=2,
    n_jobs=-1,
)
grid.fit(X_small, y_small)

print("Best Params:", grid.best_params_)
print("Best CV ROC-AUC:", grid.best_score_)

Fitting 2 folds for each of 8 candidates, totalling 16 fits


In [None]:
# Build the final estimator from the same base configuration that was tuned,
# so its fixed settings (class weighting, solver, iteration cap, ...) cannot
# drift from the ones used during the search. The winning grid values
# override the corresponding base values.
final_params = {**log_reg.get_params(), **grid.best_params_}
if final_params.get('random_state') is None:
    # saga shuffles the data; pin a seed so the final fit is repeatable even
    # when the base estimator was left unseeded.
    final_params['random_state'] = 42

best_model = LogisticRegression(**final_params)

# Refit on the whole training split (the search only saw the subsample).
best_model.fit(X_train, y_train)
print("Model trained on full data ✅")

<h1><b>Testing the Model</b></h1>

In [None]:
# Score the held-out split. Column 1 of predict_proba (the second entry of
# best_model.classes_) feeds ROC-AUC; the hard labels feed the other metrics.
y_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

test_roc_auc = roc_auc_score(y_test, y_proba)
test_f1 = f1_score(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("\n📌 Final Metrics on Test Data")
print("ROC-AUC Score :", test_roc_auc)
print("F1 Score      :", test_f1)
print("Accuracy      :", test_accuracy)
print("\nClassification Report:")
print(test_report)