In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import itertools
import statsmodels.api as sm
import kds
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn import metrics
from matplotlib import rcParams
from termcolor import colored as cl
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from xgboost.sklearn import XGBClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
# Create a dataframe with imported csv file
df = pd.read_csv(r'C:\Users\kirka\OneDrive\Documents\Coursework\Data Prep\week 5\creditcard.csv')

In [2]:
X = df.drop(['Class','Time','Amount','V6','V8','V13','V15', 'V23'], axis=1)
y = df['Class']

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 0)

In [4]:
df_train, df_test = train_test_split(df, test_size = 0.3, stratify=df['Class'], random_state=888)
features = df_train.drop(columns=['Class','Time', 'Amount', 'V6', 'V8', 'V13', 'V15', 'V23']).columns

In [5]:
df_test_y = df_test['Class']
df_test_y

43446     0
170593    0
223374    0
197224    0
41941     0
         ..
198999    0
247149    0
147383    0
175286    0
154478    0
Name: Class, Length: 85443, dtype: int64

In [6]:
smote = SMOTE(random_state=888)
x_resampled, y_resampled = smote.fit_resample(df_train[features], df_train['Class'])

In [7]:
y_resampled.value_counts()

0    199020
1    199020
Name: Class, dtype: int64

In [8]:
clf = LogisticRegression(random_state=888)

In [9]:
clf.fit(x_resampled, y_resampled)
y_pred = clf.predict_proba(df_test[features])[:,1]

In [10]:
roc_auc_score(df_test['Class'], y_pred)

0.9701794883575762

In [11]:
kds.metrics.decile_table(df_test_y, y_pred)

LABELS INFO:

 prob_min         : Minimum probability in a particular decile
 prob_max         : Minimum probability in a particular decile
 prob_avg         : Average probability in a particular decile
 cnt_events       : Count of events in a particular decile
 cnt_resp         : Count of responders in a particular decile
 cnt_non_resp     : Count of non-responders in a particular decile
 cnt_resp_rndm    : Count of responders if events assigned randomly in a particular decile
 cnt_resp_wiz     : Count of best possible responders in a particular decile
 resp_rate        : Response Rate in a particular decile [(cnt_resp/cnt_cust)*100]
 cum_events       : Cumulative sum of events decile-wise 
 cum_resp         : Cumulative sum of responders decile-wise 
 cum_resp_wiz     : Cumulative sum of best possible responders decile-wise 
 cum_non_resp     : Cumulative sum of non-responders decile-wise 
 cum_events_pct   : Cumulative sum of percentages of events decile-wise 
 cum_resp_pct     : Cu

Unnamed: 0,decile,prob_min,prob_max,prob_avg,cnt_cust,cnt_resp,cnt_non_resp,cnt_resp_rndm,cnt_resp_wiz,resp_rate,cum_cust,cum_resp,cum_resp_wiz,cum_non_resp,cum_cust_pct,cum_resp_pct,cum_resp_pct_wiz,cum_non_resp_pct,KS,lift
0,1,0.181,1.0,0.403,8545.0,134.0,8411.0,14.8,148,1.568,8545.0,134.0,148,8411.0,10.001,90.541,100.0,9.861,80.68,9.053
1,2,0.091,0.181,0.126,8544.0,4.0,8540.0,14.8,0,0.047,17089.0,138.0,148,16951.0,20.0,93.243,100.0,19.873,73.37,4.662
2,3,0.059,0.091,0.073,8544.0,4.0,8540.0,14.8,0,0.047,25633.0,142.0,148,25491.0,30.0,95.946,100.0,29.886,66.06,3.198
3,4,0.041,0.059,0.049,8545.0,3.0,8542.0,14.8,0,0.035,34178.0,145.0,148,34033.0,40.001,97.973,100.0,39.9,58.073,2.449
4,5,0.029,0.041,0.035,8544.0,3.0,8541.0,14.8,0,0.035,42722.0,148.0,148,42574.0,50.001,100.0,100.0,49.914,50.086,2.0
5,6,0.019,0.029,0.024,8544.0,0.0,8544.0,14.8,0,0.0,51266.0,148.0,148,51118.0,60.0,100.0,100.0,59.931,40.069,1.667
6,7,0.014,0.019,0.016,8545.0,0.0,8545.0,14.8,0,0.0,59811.0,148.0,148,59663.0,70.001,100.0,100.0,69.949,30.051,1.429
7,8,0.009,0.014,0.011,8544.0,0.0,8544.0,14.8,0,0.0,68355.0,148.0,148,68207.0,80.001,100.0,100.0,79.966,20.034,1.25
8,9,0.005,0.009,0.007,8544.0,0.0,8544.0,14.8,0,0.0,76899.0,148.0,148,76751.0,90.0,100.0,100.0,89.983,10.017,1.111
9,10,0.0,0.005,0.003,8544.0,0.0,8544.0,14.8,0,0.0,85443.0,148.0,148,85295.0,100.0,100.0,100.0,100.0,0.0,1.0


In [12]:
combine_data = pd.DataFrame({"Actual": df_test_y, "Predictions": y_pred})

In [13]:
combine_data['score_rank'] = pd.qcut(combine_data['Predictions'],100,labels=['r1','r2','r3','r4','r5','r6','r7',
    'r8','r9','r10','r11','r12','r13','r14','r15','r16','r17','r18','r19','r20','r21', 'r22', 'r23','r24','r25',
'r26','r27','r28','r29','r30','r31','r32','r33','r34','r35','r36','r37','r38','r39','r40','r41','r42','r43','r44','r45','r46',
'r47','r48','r49','r50','r51','r52','r53','r54','r55','r56','r57','r58','r59','r60','r61','r62','r63','r64','r65','r66','r67',
'r68','r69','r70','r71','r72','r73','r74','r75','r76','r77','r78','r79','r80','r81','r82','r83','r84','r85','r86','r87','r88',
'r89','r90','r91','r92','r93','r94','r95','r96','r97','r98','r99','r100'])

In [14]:
combine_data

Unnamed: 0,Actual,Predictions,score_rank
43446,0,0.012964,r29
170593,0,0.016893,r37
223374,0,0.178298,r90
197224,0,0.093053,r81
41941,0,0.090489,r80
...,...,...,...
198999,0,0.040908,r61
247149,0,0.048909,r66
147383,0,0.028940,r51
175286,0,0.185799,r91


In [15]:
pd.crosstab(combine_data['score_rank'],combine_data['Actual'])

Actual,0,1
score_rank,Unnamed: 1_level_1,Unnamed: 2_level_1
r1,855,0
r2,854,0
r3,855,0
r4,854,0
r5,855,0
...,...,...
r96,858,0
r97,850,1
r98,854,1
r99,852,2


In [16]:
lr_model = sm.Logit(y_resampled, x_resampled)

In [17]:
result = lr_model.fit(method = 'newton')

Optimization terminated successfully.
         Current function value: 0.364017
         Iterations 14


In [18]:
result.summary()

0,1,2,3
Dep. Variable:,Class,No. Observations:,398040.0
Model:,Logit,Df Residuals:,398017.0
Method:,MLE,Df Model:,22.0
Date:,"Sat, 13 Aug 2022",Pseudo R-squ.:,0.4748
Time:,17:51:54,Log-Likelihood:,-144890.0
converged:,True,LL-Null:,-275900.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
V1,-0.6013,0.006,-93.407,0.000,-0.614,-0.589
V2,0.5671,0.006,87.989,0.000,0.554,0.580
V3,-1.3660,0.013,-104.086,0.000,-1.392,-1.340
V4,0.9396,0.008,113.695,0.000,0.923,0.956
V5,-0.8306,0.009,-88.179,0.000,-0.849,-0.812
V7,-1.7031,0.017,-97.768,0.000,-1.737,-1.669
V9,-0.8359,0.009,-90.436,0.000,-0.854,-0.818
V10,-2.0239,0.020,-102.588,0.000,-2.063,-1.985
V11,1.4322,0.014,103.601,0.000,1.405,1.459


In [19]:
result.summary2()

0,1,2,3
Model:,Logit,Pseudo R-squared:,0.475
Dependent Variable:,Class,AIC:,289832.683
Date:,2022-08-13 17:51,BIC:,290083.252
No. Observations:,398040,Log-Likelihood:,-144890.0
Df Model:,22,LL-Null:,-275900.0
Df Residuals:,398017,LLR p-value:,0.0
Converged:,1.0000,Scale:,1.0
No. Iterations:,14.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
V1,-0.6013,0.0064,-93.4068,0.0000,-0.6139,-0.5887
V2,0.5671,0.0064,87.9890,0.0000,0.5544,0.5797
V3,-1.3660,0.0131,-104.0863,0.0000,-1.3917,-1.3403
V4,0.9396,0.0083,113.6955,0.0000,0.9234,0.9558
V5,-0.8306,0.0094,-88.1791,0.0000,-0.8491,-0.8122
V7,-1.7031,0.0174,-97.7683,0.0000,-1.7373,-1.6690
V9,-0.8359,0.0092,-90.4359,0.0000,-0.8540,-0.8178
V10,-2.0239,0.0197,-102.5879,0.0000,-2.0626,-1.9852
V11,1.4322,0.0138,103.6006,0.0000,1.4051,1.4593
