In [10]:
import os
import joblib
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, recall_score, precision_score, f1_score
from sklearn.preprocessing import MinMaxScaler

# Define function to compute the performance metrics for logistic regression
def compute_metrics(X, y, smote=False, model_name='model'):
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler=MinMaxScaler()
    X_train=scaler.fit_transform(X_train)
    X_test=scaler.transform(X_test)

    # Apply SMOTE if specified
    if smote:
        sm = SMOTE(random_state=27)
        X_train, y_train = sm.fit_resample(X_train, y_train)

    # Train logistic regression model
    model = LogisticRegression(solver='liblinear', class_weight='balanced')
    model.fit(X_train, y_train)
    
    # Save model with best performance
    joblib.dump(model, f'{model_name}.pkl')
    
    # Predict on test set
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Compute metrics
    auc = roc_auc_score(y_test, y_prob)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    return auc, recall, precision, f1

def load_data(file_name):
    curr_file = os.path.join(os.getcwd().replace('models', 'data'), file_name)
    return pd.read_csv(curr_file, delimiter=",")

# Load data
data = load_data('data.csv')

# Extract features and labels for each dataset
X_data, y_data = data.drop('Outcome', axis=1), data['Outcome']

# Compute metrics for each dataset
metrics_data = compute_metrics(X_data, y_data, model_name='logistic_regression_data')
metrics_data_smote = compute_metrics(X_data, y_data, smote=True, model_name='logistic_regression_smote')

# Combine metrics into a single table
metrics_table = pd.DataFrame({
    'Dataset': ['data', 'data_smote'],
    'AUC': [metrics_data[0], metrics_data_smote[0]],
    'Recall': [metrics_data[1], metrics_data_smote[1]],
    'Precision': [metrics_data[2], metrics_data_smote[2]],
    'F1': [metrics_data[3], metrics_data_smote[3]]
})

In [11]:
# Vizuallize metrics table
display(metrics_table)

Unnamed: 0,Dataset,AUC,Recall,Precision,F1
0,data,0.850498,0.837209,0.654545,0.734694
1,data_smote,0.848686,0.837209,0.666667,0.742268


In [12]:
import numpy as np

# Number of trees in random forest
C = np.arange(0.1, 1, 0.01)

warm_start = [True, False]
max_iter = range(1, 1000)
solver = ['lbfgs', 'newton-cg', 'liblinear']

random_grid ={
    'max_iter' : max_iter,
    'warm_start' : warm_start,
    'solver' : solver,
    'C' : C,
}

In [13]:
# Load data
data = load_data('data.csv')

# Extract features and labels for each dataset
X_data, y_data = data.drop('Outcome', axis=1), data['Outcome']

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=42)

In [15]:
# Random search of parameters, using 3 fold cross validation, 
from sklearn.model_selection import RandomizedSearchCV

model = LogisticRegression(solver='liblinear', class_weight='balanced')
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = model, param_distributions=random_grid, scoring = 'f1_macro', n_iter = 500, cv = 2, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 2 folds for each of 500 candidates, totalling 1000 fits


In [16]:
import warnings
warnings.filterwarnings("ignore")

model = rf_random.best_estimator_
# Predict on test set
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Compute metrics
auc = roc_auc_score(y_test, y_prob)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred,average='weighted')

In [17]:
# Combine metrics into a single table  
metrics_table = pd.DataFrame({
    'Dataset': ['data'],
    'AUC': [auc],
    'Recall': [recall],
    'Precision': [precision],
    'F1': [f1]
})

In [18]:
metrics_table.head()

Unnamed: 0,Dataset,AUC,Recall,Precision,F1
0,data,0.852613,0.791667,0.8125,0.795467


In [19]:
joblib.dump(model, f'logistic_regression_best.pkl')

['logistic_regression_best.pkl']