In [1]:
import os
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from joblib import dump

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Datasets Paths
X_train_resampled_path = os.path.join('..','datasets','prepared_data','X_train_resampled.csv') # SMOTE applied  
X_train_reduced_path = os.path.join('..','datasets','prepared_data','X_train_reduced.csv')     # PCA applied (It was resampled before)
X_test_path = os.path.join('..','datasets','prepared_data','X_test_transformed.csv') 
X_test_reduced_path = os.path.join('..','datasets','prepared_data','X_test_reduced.csv') 
y_train_path = os.path.join('..','datasets','prepared_data','y_train_resampled.csv') # SMOTE applied
y_test_path = os.path.join('..','datasets','prepared_data','y_test.csv')

# Load Datasets
X_train_resampled = pd.read_csv(X_train_resampled_path) # PCA Reduced Dataset
X_train_reduced = pd.read_csv(X_train_reduced_path)     # PCA applied (It was resampled before)
X_test_reduced = pd.read_csv(X_test_reduced_path)               # PCA
X_test = pd.read_csv(X_test_path)             
y_train = pd.read_csv(y_train_path)
y_test = pd.read_csv(y_test_path)

# Ensure y are 1D arrays
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

In [3]:
# Define the base model
log_reg = LogisticRegression(random_state=42, max_iter=500)

# Define the hyperparameters to search
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', None],  # Regularization types
    'C': [0.01, 0.1, 1, 10, 100],                 # Inverse of regularization strength
    'solver': ['liblinear', 'saga', 'lbfgs'],     # Optimization algorithms
    'l1_ratio': [0.1, 0.5, 0.9]                   # Only used with 'elasticnet' penalty
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring='recall',   # Use 'f1' or 'roc_auc' depending on the problem
    cv=5,               # Number of cross-validation folds
    n_jobs=-1,          # Use all available CPU cores
    verbose=1           # Display progress details
)

# Run GridSearchCV
grid_search.fit(X_train_resampled, y_train)

# Display the best hyperparameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Get the best model
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 180 candidates, totalling 900 fits
Best parameters: {'C': 0.1, 'l1_ratio': 0.1, 'penalty': 'l1', 'solver': 'saga'}
Best score: 0.8166709849495142


In [4]:
# Predict
y_pred = best_model.predict(X_test)

In [5]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.72      0.80      1552
           1       0.50      0.77      0.61       561

    accuracy                           0.73      2113
   macro avg       0.70      0.75      0.70      2113
weighted avg       0.79      0.73      0.75      2113



In [6]:
# Save trained model
dump(best_model, os.path.join('..','trained_models','logistic_regression.joblib'))

['..\\trained_models\\logistic_regression.joblib']

## With Reduced Dimensionality

In [7]:
# Define the base model
log_reg = LogisticRegression(random_state=42, max_iter=500)

# Define the hyperparameters to search
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', None],  # Regularization types
    'C': [0.01, 0.1, 1, 10, 100],                 # Inverse of regularization strength
    'solver': ['liblinear', 'saga', 'lbfgs'],     # Optimization algorithms
    'l1_ratio': [0.1, 0.5, 0.9]                   # Only used with 'elasticnet' penalty
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring='recall',   # Use 'f1' or 'roc_auc' depending on the problem
    cv=5,               # Number of cross-validation folds
    n_jobs=-1,          # Use all available CPU cores
    verbose=1           # Display progress details
)

# Run GridSearchCV
grid_search.fit(X_train_reduced, y_train)

# Display the best hyperparameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Get the best model
best_model_w_reduced = grid_search.best_estimator_

Fitting 5 folds for each of 180 candidates, totalling 900 fits
Best parameters: {'C': 0.01, 'l1_ratio': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Best score: 0.8128051057344254


In [8]:
# Predict
y_pred = best_model_w_reduced.predict(X_test_reduced)

In [9]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.72      0.80      1552
           1       0.50      0.77      0.61       561

    accuracy                           0.74      2113
   macro avg       0.70      0.75      0.70      2113
weighted avg       0.79      0.74      0.75      2113

