- In this notebook all independent variables (features) are scaled with StandardScaler().
- We focus on the area under the precision-recall curve (the higher, the better).
- Because the classes are imbalanced, we try to address the imbalance with hyperparameter tuning, oversampling, and SMOTE.

In [1]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import sklearn.metrics
from sklearn.model_selection import train_test_split, GridSearchCV

import sklearn.metrics

from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import Build_Evaluate_Model as bem

### OBTAINING TRAIN AND TEST SET (STANDARDSCALER)

In [2]:
# Load the train/test split from the project helper, asking it to scale the
# features with StandardScaler.
X_train, y_train, X_test, y_test = bem.get_xy_traintest(
    scale=True,
    scaler='StandardScaler',
)

### BASIC LOGISTIC REGRESSION MODEL

In [3]:
# Fit the baseline logistic regression through the project helper; the returned
# score table is the starting point that later cells extend with more models.
log_score_standard = bem.build_basic_model(
    X_train,
    y_train,
    X_test,
    y_test,
    classifier='Logistic Regression',
)

In [4]:
# Display the score table after the baseline model.
log_score_standard

Unnamed: 0,MODEL,PARAMS,y_train_prob,y_test_prob,TRAIN SCORE,TEST SCORE,DIFFERENCE
0,Basic,"LogisticRegression(max_iter=500, random_state=42)","[0.1815299983877598, 0.15970442609984317, 0.40...","[0.3738901029193777, 0.3417548278432792, 0.257...",0.77731,0.7543,0.02301


### CLASS WEIGHT PARAMETER

In [5]:
# Hand-picked class weights: class 1 is weighted 77 against 22 for class 0.
w = {0: 22, 1: 77}

# L2-penalised logistic regression that applies the class weights above.
log_cw = LogisticRegression(
    penalty='l2',
    class_weight=w,
    max_iter=1000,
    random_state=42,
)

In [6]:
# Fit/evaluate the class-weighted model and collect its row in the score table.
# NOTE(review): the table is passed in and re-assigned, so re-running this cell
# presumably adds a duplicate 'log_cw' row — confirm against bem.build_model.
log_score_standard = bem.build_model(
    X_train,
    y_train,
    X_test,
    y_test,
    classifier=log_cw,
    classifier_name='log_cw',
    score_df=log_score_standard,
)

In [7]:
# Display the score table after adding the class-weighted model.
log_score_standard

Unnamed: 0,MODEL,PARAMS,y_train_prob,y_test_prob,TRAIN SCORE,TEST SCORE,DIFFERENCE
0,Basic,"LogisticRegression(max_iter=500, random_state=42)","[0.1815299983877598, 0.15970442609984317, 0.40...","[0.3738901029193777, 0.3417548278432792, 0.257...",0.77731,0.7543,0.02301
1,log_cw,"LogisticRegression(class_weight={0: 22, 1: 77}...","[0.4366352273221064, 0.39745337648903717, 0.70...","[0.7002594095540251, 0.6557012917693645, 0.544...",0.777623,0.755057,0.022565


### HYPERPARAMETER TUNING

In [8]:
# Candidate class-weight dictionaries for the grid search (order preserved).
w = [
    {0: 1.0, 1: 10},
    {0: 1, 1: 100},
    {0: 1, 1: 150},
    {0: 1, 1: 1},
    {0: 1.0, 1: 200},
    {0: 1.0, 1: 500},
    {0: 100, 1: 1000},
    {0: 22, 1: 77},
    {0: 30, 1: 70},
    {0: 40, 1: 60},
    {0: 20, 1: 80},
    {0: 10, 1: 90},
]

# Candidate inverse-regularisation strengths, from weak to strong regularisation.
c_values = [100, 10, 1.0, 0.1, 0.01]


In [9]:
# Search space: every class-weight candidate crossed with every C value, with
# the solver and penalty fixed to liblinear + L1.
hyperparam_grid = {
    "class_weight": w,
    'solver': ['liblinear'],
    'penalty': ['l1'],
    'C': c_values,
}

In [10]:
# Base estimator whose remaining parameters are filled in from hyperparam_grid.
log_search_wt = LogisticRegression(random_state=42, max_iter=500)

# Exhaustive 3-fold cross-validated search over hyperparam_grid, using all cores.
# NOTE(review): the notebook intro names the precision-recall AUC as the target
# metric, but this search optimises ROC AUC ("roc_auc") — confirm which one is
# intended.
grid = GridSearchCV(
    estimator=log_search_wt,
    param_grid=hyperparam_grid,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

print(f'Best score: {grid.best_score_} with param: {grid.best_params_}')

Best score: 0.7706366788959484 with param: {'C': 0.1, 'class_weight': {0: 1.0, 1: 10}, 'penalty': 'l1', 'solver': 'liblinear'}


- The above  code gives  {'C': 0.1, 'class_weight': {0: 1.0, 1: 10}, 'penalty': 'l1', 'solver': 'liblinear'}


In [11]:
# Best configuration reported by the first grid search.
# max_iter and random_state are set to the same values as the estimator used in
# that search: liblinear uses random_state, so omitting them would evaluate a
# model that differs from the one selected and is not reproducible between runs.
log_best_1 = LogisticRegression(
    C=0.1,
    class_weight={0: 1, 1: 10},
    penalty='l1',
    solver='liblinear',
    max_iter=500,
    random_state=42,
)

In [12]:
# Fit/evaluate the first tuned model and collect its row in the score table.
# NOTE(review): the table is passed in and re-assigned, so re-running this cell
# presumably adds a duplicate 'log_best_1' row — confirm against bem.build_model.
log_score_standard = bem.build_model(
    X_train,
    y_train,
    X_test,
    y_test,
    classifier=log_best_1,
    classifier_name='log_best_1',
    score_df=log_score_standard,
)

In [13]:
# Display the score table after adding the first tuned model.
log_score_standard

Unnamed: 0,MODEL,PARAMS,y_train_prob,y_test_prob,TRAIN SCORE,TEST SCORE,DIFFERENCE
0,Basic,"LogisticRegression(max_iter=500, random_state=42)","[0.1815299983877598, 0.15970442609984317, 0.40...","[0.3738901029193777, 0.3417548278432792, 0.257...",0.77731,0.7543,0.02301
1,log_cw,"LogisticRegression(class_weight={0: 22, 1: 77}...","[0.4366352273221064, 0.39745337648903717, 0.70...","[0.7002594095540251, 0.6557012917693645, 0.544...",0.777623,0.755057,0.022565
2,log_best_1,"LogisticRegression(C=0.1, class_weight={0: 1, ...","[0.6921089194560377, 0.6537096526005286, 0.872...","[0.8731732761987303, 0.8475063212196383, 0.772...",0.7776,0.755401,0.022198


In [14]:
# Second search: keep the class weights at {0: 1, 1: 10} and vary only C, this
# time over larger values.
c_values = [10, 25, 35, 45, 50, 85, 100, 150]

hyperparam_grid = {
    "class_weight": [{0: 1, 1: 10}],
    'solver': ['liblinear'],
    'penalty': ['l1'],
    'C': c_values,
}

log_search_wt = LogisticRegression(random_state=42, max_iter=500)

# Same search set-up as before: 3-fold CV, ROC AUC scoring, all cores.
# NOTE(review): the notebook intro names the precision-recall AUC as the target
# metric, but this search optimises ROC AUC ("roc_auc") — confirm which one is
# intended.
grid = GridSearchCV(
    estimator=log_search_wt,
    param_grid=hyperparam_grid,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

print(f'Best score: {grid.best_score_} with param: {grid.best_params_}')

Best score: 0.7705945810426488 with param: {'C': 10, 'class_weight': {0: 1, 1: 10}, 'penalty': 'l1', 'solver': 'liblinear'}


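- The refined search above gives {'C': 10, 'class_weight': {0: 1, 1: 10}, 'penalty': 'l1', 'solver': 'liblinear'} with a best score of 0.7705945810426488. C=10 is the smallest value in this grid, and the score is marginally below the 0.7706366788959484 obtained with C=0.1 in the first search.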

In [16]:
# Best configuration reported by the second (C-only) grid search.
# max_iter and random_state are set to the same values as the estimator used in
# that search: liblinear uses random_state, so omitting them would evaluate a
# model that differs from the one selected and is not reproducible between runs.
log_best_2 = LogisticRegression(
    C=10,
    class_weight={0: 1, 1: 10},
    penalty='l1',
    solver='liblinear',
    max_iter=500,
    random_state=42,
)

In [17]:
# Fit/evaluate the second tuned model and collect its row in the score table.
# NOTE(review): the table is passed in and re-assigned, so re-running this cell
# presumably adds a duplicate 'log_best_2' row — confirm against bem.build_model.
log_score_standard = bem.build_model(
    X_train,
    y_train,
    X_test,
    y_test,
    classifier=log_best_2,
    classifier_name='log_best_2',
    score_df=log_score_standard,
)

In [18]:
# Display the score table after adding the second tuned model.
log_score_standard

Unnamed: 0,MODEL,PARAMS,y_train_prob,y_test_prob,TRAIN SCORE,TEST SCORE,DIFFERENCE
0,Basic,"LogisticRegression(max_iter=500, random_state=42)","[0.1815299983877598, 0.15970442609984317, 0.40...","[0.3738901029193777, 0.3417548278432792, 0.257...",0.77731,0.7543,0.02301
1,log_cw,"LogisticRegression(class_weight={0: 22, 1: 77}...","[0.4366352273221064, 0.39745337648903717, 0.70...","[0.7002594095540251, 0.6557012917693645, 0.544...",0.777623,0.755057,0.022565
2,log_best_1,"LogisticRegression(C=0.1, class_weight={0: 1, ...","[0.6921089194560377, 0.6537096526005286, 0.872...","[0.8731732761987303, 0.8475063212196383, 0.772...",0.7776,0.755401,0.022198
3,log_best_2,"LogisticRegression(C=10, class_weight={0: 1, 1...","[0.6913220094781583, 0.6537217142891872, 0.874...","[0.877489793337063, 0.8479177492863919, 0.7721...",0.777623,0.755371,0.022252


### LOGISTIC REGRESSION ENSEMBLE

In [19]:
# Build an ensemble entry via the project helper, passing in the score table
# gathered so far.
# NOTE(review): the result is stored as `log_score_minmax` although this notebook
# uses StandardScaler — the name looks like a leftover from a MinMax variant.
# It is kept unchanged here because the following cell reads it.
log_score_minmax=bem.build_ensemble(X_train,y_train,X_test,y_test,classifier_name='log_standard',score_df=log_score_standard)

In [20]:
# Show the score table ordered by the 'DIFFERENCE' column, smallest first
# (sort_values sorts ascending by default).
log_score_minmax.sort_values(by='DIFFERENCE')

Unnamed: 0,MODEL,PARAMS,y_train_prob,y_test_prob,TRAIN SCORE,TEST SCORE,DIFFERENCE
2,log_best_1,"LogisticRegression(C=0.1, class_weight={0: 1, ...","[0.6921089194560377, 0.6537096526005286, 0.872...","[0.8731732761987303, 0.8475063212196383, 0.772...",0.7776,0.755401,0.022198
3,log_best_2,"LogisticRegression(C=10, class_weight={0: 1, 1...","[0.6913220094781583, 0.6537217142891872, 0.874...","[0.877489793337063, 0.8479177492863919, 0.7721...",0.777623,0.755371,0.022252
1,log_cw,"LogisticRegression(class_weight={0: 22, 1: 77}...","[0.4366352273221064, 0.39745337648903717, 0.70...","[0.7002594095540251, 0.6557012917693645, 0.544...",0.777623,0.755057,0.022565
4,log_standard_ensemble,,"[0.5003990386610155, 0.466147292369649, 0.7150...","[0.706203145502299, 0.6732200475296685, 0.5867...",0.777705,0.755093,0.022612
0,Basic,"LogisticRegression(max_iter=500, random_state=42)","[0.1815299983877598, 0.15970442609984317, 0.40...","[0.3738901029193777, 0.3417548278432792, 0.257...",0.77731,0.7543,0.02301


- Our best logistic regression model with StandardScaler is log_best_1: it has the highest test score (0.755401) and the smallest train-test difference (0.022198).