In [1]:
###-----------------
### Import Libraries
###-----------------

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold,cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss,accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
brupt=pd.read_csv("data.csv")

brupt.head()

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,1,0.370594,0.424389,0.40575,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,...,0.716845,0.009219,0.622879,0.601453,0.82789,0.290202,0.026601,0.56405,1,0.016469
1,1,0.464291,0.538214,0.51673,0.610235,0.610235,0.998946,0.79738,0.809301,0.303556,...,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,1,0.426071,0.499019,0.472295,0.60145,0.601364,0.998857,0.796403,0.808388,0.302035,...,0.77467,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474
3,1,0.399844,0.451265,0.457733,0.583541,0.583541,0.9987,0.796967,0.808966,0.30335,...,0.739555,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1,0.023982
4,1,0.465022,0.538432,0.522298,0.598783,0.598783,0.998973,0.797366,0.809304,0.303475,...,0.795016,0.003878,0.623521,0.598782,0.839973,0.278514,0.024752,0.575617,1,0.03549


In [3]:
X=brupt.drop('Bankrupt?',axis=1)
y=brupt['Bankrupt?']

In [4]:
print(y.value_counts(normalize=True)*100)

Bankrupt?
0    96.77372
1     3.22628
Name: proportion, dtype: float64


In [5]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=23,stratify=y)

print(y_train.value_counts(normalize=True)*100)
print(y_test.value_counts(normalize=True)*100)

Bankrupt?
0    96.773518
1     3.226482
Name: proportion, dtype: float64
Bankrupt?
0    96.774194
1     3.225806
Name: proportion, dtype: float64


In [6]:
lr=LogisticRegression()

lr.fit(X_train,y_train)

# Log loss
y_pred_prob=lr.predict_proba(X_test)[:,1]
print("Log_loss:- ",log_loss(y_test,y_pred_prob))

# Accuracy Score
y_pred=lr.predict(X_test)
print("Accuracy_Score:- ",accuracy_score(y_test,y_pred))

Log_loss:-  0.24074456728610108
Accuracy_Score:-  0.9618768328445748


# Kfold

In [7]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [8]:
kfold=StratifiedKFold(n_splits=5,shuffle=True,random_state=23)

kfoldd=KFold(n_splits=5,shuffle=True,random_state=23)

lr=LogisticRegression()

In [9]:
#Default scoring =Acc scoring

results=cross_val_score(lr,X,y,cv=kfold)

print(results.mean())

0.961871453990357


In [10]:
#log loss 
results=cross_val_score(lr,X,y,cv=kfold,scoring='neg_log_loss')
print(results.mean())

-0.18365244634468467


In [11]:
#GridSearch
from sklearn.model_selection import GridSearchCV

lr=LogisticRegression()
params={'penalty':["l1","l2","elastic",None]}


gcv=GridSearchCV(lr,param_grid=params,cv=kfold)   #Default scoring-acc score
gcv.fit(X,y)

print(gcv.best_params_)
print(gcv.best_score_)

{'penalty': 'l2'}
0.961871453990357


In [12]:
#logloss
from sklearn.model_selection import GridSearchCV
lr=LogisticRegression()

params={'penalty':["l1","l2","elastic",None],
       'solver':['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga']}

gcv=GridSearchCV(lr,param_grid=params,cv=kfold,scoring="neg_log_loss")   #Default scoring-acc score
gcv.fit(X,y)


print(gcv.best_params_)
print(gcv.best_score_)

{'penalty': 'l1', 'solver': 'liblinear'}
-0.10877435989398568


# Breast_cancer model

In [13]:
cancer=pd.read_csv('BreastCancer.csv')
cancer

Unnamed: 0,Code,Clump,UniCell_Size,Uni_CellShape,MargAdh,SEpith,BareN,BChromatin,NoemN,Mitoses,Class
0,61634,5,4,3,1,2,2,2,3,1,Benign
1,63375,9,1,2,6,4,10,7,7,2,Malignant
2,76389,10,4,7,2,2,8,6,1,1,Malignant
3,95719,6,10,10,10,8,10,7,10,7,Malignant
4,128059,1,1,1,1,2,5,5,1,1,Benign
...,...,...,...,...,...,...,...,...,...,...,...
694,1369821,10,10,10,10,5,10,10,10,7,Malignant
695,1371026,5,10,10,10,4,10,5,6,3,Malignant
696,1371920,5,1,1,1,2,1,3,2,1,Benign
697,8233704,4,1,1,1,1,1,2,1,1,Benign


In [14]:
X=cancer.drop('Class',axis=1)
y=cancer['Class']

In [15]:
print(y.value_counts(normalize=True)*100)
print(y.value_counts())

Class
Benign       65.522175
Malignant    34.477825
Name: proportion, dtype: float64
Class
Benign       458
Malignant    241
Name: count, dtype: int64


In [16]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=23,stratify=y)

In [17]:
print(y_train.value_counts(normalize=True)*100)
print(y_train.value_counts())

Class
Benign       65.439673
Malignant    34.560327
Name: proportion, dtype: float64
Class
Benign       320
Malignant    169
Name: count, dtype: int64


In [18]:
print(y_test.value_counts(normalize=True)*100)
print(y_test.value_counts())

Class
Benign       65.714286
Malignant    34.285714
Name: proportion, dtype: float64
Class
Benign       138
Malignant     72
Name: count, dtype: int64


In [19]:
#logloss
from sklearn.model_selection import GridSearchCV
lr=LogisticRegression()

params={'penalty':["l1","l2","elastic",None],
       'solver':['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga']}


gcv=GridSearchCV(lr,param_grid=params,cv=kfold)
gcv.fit(X_train,y_train)

print(gcv.best_params_)
print(gcv.best_score_)

best_model=gcv.best_estimator_
print(best_model.coef_)

{'penalty': 'l2', 'solver': 'newton-cg'}
0.9693035977277509
[[-5.34950681e-07  5.80850721e-01 -2.00884763e-01  4.89847419e-01
   6.73372907e-02  2.26574286e-01  4.95736977e-01  4.82768095e-01
   2.79816844e-01  6.58330964e-01]]


In [20]:
#logloss
from sklearn.model_selection import GridSearchCV
lr=LogisticRegression()

params={'penalty':["l1","l2","elastic",None],
       'solver':['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga']}

gcv=GridSearchCV(lr,param_grid=params,cv=kfold,scoring="neg_log_loss")
gcv.fit(X_train,y_train)

print(gcv.best_params_)
print(gcv.best_score_)

best_model=gcv.best_estimator_
print(best_model.coef_)

{'penalty': 'l2', 'solver': 'newton-cg'}
-0.09255242289692968
[[-5.34950681e-07  5.80850721e-01 -2.00884763e-01  4.89847419e-01
   6.73372907e-02  2.26574286e-01  4.95736977e-01  4.82768095e-01
   2.79816844e-01  6.58330964e-01]]


In [21]:
from sklearn.metrics import confusion_matrix

In [22]:
lr=LogisticRegression()

lr.fit(X_train,y_train)

y_pred=lr.predict(X_test)

print(confusion_matrix(y_test,y_pred))  #Resolve Later

[[138   0]
 [ 72   0]]


# Human Resources 

In [23]:
hr=pd.read_csv('HR_comma_sep.csv')

hr=pd.get_dummies(hr,drop_first=True)

In [24]:
X=hr.drop('left',axis=1)
y=hr['left']

In [25]:
print(y.value_counts(normalize=True)*100)
print(y.value_counts())

left
0    76.212071
1    23.787929
Name: proportion, dtype: float64
left
0    11428
1     3567
Name: count, dtype: int64


In [26]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=23,stratify=y)

In [27]:
print(y_train.value_counts(normalize=True)*100)
print(y_train.value_counts())

left
0    76.209985
1    23.790015
Name: proportion, dtype: float64
left
0    7999
1    2497
Name: count, dtype: int64


In [28]:
print(y_test.value_counts(normalize=True)*100)
print(y_test.value_counts())

left
0    76.216937
1    23.783063
Name: proportion, dtype: float64
left
0    3429
1    1070
Name: count, dtype: int64


In [29]:
#logloss
from sklearn.model_selection import GridSearchCV
lr=LogisticRegression()

params={'penalty':["l1","l2","elastic",None],
       'solver':['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga']}

gcv=GridSearchCV(lr,param_grid=params,cv=kfold)
gcv.fit(X_train,y_train)

print(gcv.best_params_)
print(gcv.best_score_)

best_model=gcv.best_estimator_
print(best_model.coef_)

{'penalty': None, 'solver': 'lbfgs'}
0.8021166995621497
[[-4.19907501e+00  3.78754728e-01 -3.09399703e-01  3.94123590e-03
   2.47091080e-01 -1.51816966e+00 -8.71062734e-01 -4.03988518e-01
   1.79259722e-01  3.82947577e-01 -3.90033629e-01  1.05734573e-02
  -9.09917486e-02 -6.08129091e-02  7.18306675e-02  1.65114212e-01
   1.58758144e+00  9.77932441e-01]]


In [30]:
#logloss
from sklearn.model_selection import GridSearchCV
lr=LogisticRegression()

params={'penalty':["l1","l2","elastic",None],
       'solver':['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga']}

gcv=GridSearchCV(lr,param_grid=params,cv=kfold,scoring="neg_log_loss")
gcv.fit(X_train,y_train)

print(gcv.best_params_)
print(gcv.best_score_)

best_model=gcv.best_estimator_
print(best_model.coef_)

{'penalty': 'l2', 'solver': 'newton-cholesky'}
-0.4292428792432032
[[-4.08258833  0.72958783 -0.3170779   0.00417777  0.26650042 -1.5316755
  -1.44514714 -0.31000287  0.25515388  0.42687326 -0.11542405  0.08892447
  -0.04670069  0.11588066  0.25073255  0.34174763  1.91361464  1.33148555]]
