In [36]:
# Clear the interactive namespace before anything else runs; -f skips the confirmation prompt.
%reset -f

In [37]:
import matplotlib.pyplot as plt
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 (renamed to
# 'seaborn-v0_8') and later removed. Use the new name when this install has
# it, otherwise fall back to the legacy name so older installs keep working.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification

from collections import Counter

import psutil

from colorama import Style, Fore, Back

In [38]:
# Read the classification dataset (path is relative to the working directory).
csvFile = 'data_Classification.csv'
df = pd.read_csv(filepath_or_buffer=csvFile)

# Preview the first five rows as the cell's rich output.
df.head(n=5)

Unnamed: 0,Ft_1,Ft_2,Ft_3,Ft_4,Ft_5,Ft_6,Ft_7,Ft_8,Ft_9,Ft_10,y
0,2.770929,-1.177678,0.001778,1.421301,-4.141069,0.001778,0.31498,0.244916,2.110045,0.001778,0
1,-1.962752,-1.474427,-0.738861,0.561202,-0.299556,-0.738861,2.303809,-1.276648,-1.439053,-0.738861,1
2,-3.694985,-0.572655,-2.441135,0.985736,0.187163,-2.441135,4.291006,1.092934,-1.181399,-2.441135,1
3,2.867535,3.118144,0.313463,-0.681174,0.188393,0.313463,-4.167673,3.571354,0.095224,0.313463,0
4,0.389922,3.470132,-2.063836,-0.255547,1.161201,-2.063836,0.050794,1.862306,-0.532356,-2.063836,0


In [39]:
# Tally how many rows carry each class label.
labelSkew = Counter(df['y'])

# Print each class's share of all rows as a percentage (one decimal place).
nRows = len(df)
for label, count in labelSkew.items():
    thisCount = round(100 * count / nRows, 1)
    print('Class ' + str(label) + ':', thisCount, '%')

Class 0: 50.0 %
Class 1: 50.0 %


In [40]:
# Feature matrix: every column except the label. 'Unnamed: 0' (visible in the
# df.head() preview with values 0, 1, 2, ...) appears to be a row index that
# was written into the CSV rather than a real feature, so it is excluded as
# well; errors='ignore' keeps the cell working if that column is absent.
data = df.drop(columns=['y', 'Unnamed: 0'], errors='ignore')

# Expand any non-numeric columns into indicator columns; numeric columns are
# passed through unchanged.
data = pd.get_dummies(data)

# Label vector.
target = df['y']

# Keep the final feature column names for later reference.
theCols = data.columns

In [41]:
# Hold out a test split (library default size). Pinning random_state makes the
# shuffle, and therefore every score reported below, reproducible across runs.
x, xt, y, yt = train_test_split(data, target, random_state=42)

In [42]:
# Learn the scaling statistics from the training split only, scale it, then
# apply that same fitted transformation to the held-out split.
stdScaler = StandardScaler()
x = stdScaler.fit_transform(x)
xt = stdScaler.transform(xt)

## Without Hyperparameter Tuning

In [43]:
# Baseline elastic-net logistic regression with hand-picked (untuned) settings.
# The 'saga' solver shuffles the data while optimising, so random_state is
# pinned to make the fitted model repeatable from run to run.
baseModel = LogisticRegression(
    C=0.001,
    solver='saga',
    penalty='elasticnet',
    l1_ratio=0.5,
    max_iter=2000,
    random_state=42,
)

# Trailing semicolon suppresses the estimator repr in the cell output.
baseModel.fit(x, y);

In [44]:
# Baseline predictions for the training split (yp) and the held-out split (ytp).
yp, ytp = (baseModel.predict(split) for split in (x, xt))

In [45]:
# Held-out accuracy of the baseline model, as a percentage with two decimals.
testAcc = round(accuracy_score(yt, ytp) * 100, 2)
print('\n\nTest Accuracy:', testAcc, '%\n')

# The bare final expression renders the matrix as the cell output.
print('Test Confusion Matrix:')
confusion_matrix(yt, ytp)



Test Accuracy: 76.5 %

Test Confusion Matrix:


array([[370, 146],
       [ 89, 395]], dtype=int64)

## With Hyperparameter Tuning

In [46]:
# Scorer name handed to the grid search. Alternatives kept for reference:
# scrList = ['accuracy', 'neg_log_loss', 'brier_score_loss', 'recall', 'precision', 'f1', 'roc_auc']
scr = 'roc_auc'

# Worker count for the search: number of physical cores (logical=False).
numJobs = psutil.cpu_count(logical=False)

In [47]:
# Candidate C values: five powers of ten from 1e-2 to 1e2.
cList = np.logspace(start=-2, stop=2, num=5)
# Candidate l1_ratio values: 0.0 through 1.0 in steps of 0.1.
ratioList = np.linspace(start=0, stop=1, num=11)

In [48]:
# Search space: every (C, l1_ratio) combination is evaluated.
hyperParams = dict(C=cList, l1_ratio=ratioList)

# 3-fold cross-validated grid search around baseModel, scored with `scr` and
# run on numJobs parallel workers. fit() returns the search object itself.
best_model = GridSearchCV(
    estimator=baseModel,
    param_grid=hyperParams,
    scoring=scr,
    cv=3,
    n_jobs=numJobs,
    verbose=1,
).fit(x, y)

# Winning hyperparameter combination, shown as the cell output.
best_model.best_params_

Fitting 3 folds for each of 55 candidates, totalling 165 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 142 tasks      | elapsed:   11.7s
[Parallel(n_jobs=4)]: Done 165 out of 165 | elapsed:   24.5s finished


{'C': 0.1, 'l1_ratio': 0.7000000000000001}

In [49]:
# Predictions from the tuned model for the training (yp) and held-out (ytp)
# splits; the search object's predict() delegates to its refit best_estimator_.
yp = best_model.predict(x)
ytp = best_model.predict(xt)

In [50]:
# Held-out accuracy of the tuned model, as a percentage with two decimals.
testAcc = round(accuracy_score(yt, ytp) * 100, 2)
print('\n\nTest Accuracy:', testAcc, '%\n')

# The bare final expression renders the matrix as the cell output.
print('Test Confusion Matrix:')
confusion_matrix(yt, ytp)



Test Accuracy: 92.3 %

Test Confusion Matrix:


array([[473,  43],
       [ 34, 450]], dtype=int64)