In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import itertools
import statsmodels.api as sm
import kds
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn import metrics
from matplotlib import rcParams
from termcolor import colored as cl
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from xgboost.sklearn import XGBClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
# Create a dataframe from the credit-card transactions CSV file.
# The file location can be overridden with the CREDITCARD_CSV environment variable so the
# notebook can run on another machine without editing this cell; when the variable is
# unset, the original local path is used unchanged.
import os

DATA_PATH = os.environ.get(
    'CREDITCARD_CSV',
    r'C:\Users\kirka\OneDrive\Documents\Coursework\Data Prep\week 5\creditcard.csv',
)
df = pd.read_csv(DATA_PATH)

In [2]:
# Feature matrix: every column of df except the target and the listed excluded columns.
X = df.drop(columns=['Class', 'Time', 'Amount', 'V6', 'V8', 'V13', 'V15', 'V23'])
# Target vector: the Class column.
y = df.loc[:, 'Class']

In [3]:
# 70/30 train/test split of the feature matrix and target with a fixed seed.
# NOTE(review): this split is not stratified on y, unlike the dataframe-level split
# below that passes stratify=df['Class'] — confirm whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [4]:
# 70/30 split of the full dataframe, stratified on the Class column, with a fixed seed.
df_train, df_test = train_test_split(
    df,
    test_size=0.3,
    stratify=df['Class'],
    random_state=888,
)
# Model feature names: the training columns minus the target and the excluded columns,
# kept in their original column order.
features = df_train.columns.drop(
    ['Class', 'Time', 'Amount', 'V6', 'V8', 'V13', 'V15', 'V23']
)

In [5]:
# Pull the test-set labels out as their own Series and display them.
df_test_y = df_test.loc[:, 'Class']
df_test_y

43446     0
170593    0
223374    0
197224    0
41941     0
         ..
198999    0
247149    0
147383    0
175286    0
154478    0
Name: Class, Length: 85443, dtype: int64

In [6]:
# Balance the training set by resampling Class == 1 rows with replacement.
msk = df_train['Class'].eq(1)
# Extra positive rows needed = (rows outside the mask) - (rows inside the mask),
# so that both groups end up the same size after concatenation.
num_to_oversample = (~msk).sum() - msk.sum()
df_positive_oversample = df_train.loc[msk].sample(
    n=num_to_oversample,
    replace=True,
    random_state=888,
)
df_train_oversample = pd.concat([df_train, df_positive_oversample])
# Show the resulting class counts.
df_train_oversample['Class'].value_counts()

0    199020
1    199020
Name: Class, dtype: int64

In [7]:
# Logistic-regression classifier with a fixed seed; all other settings are library defaults.
clf = LogisticRegression(
    random_state=888,
)

In [8]:
# Train on the oversampled training data, using only the selected feature columns.
clf.fit(
    X=df_train_oversample[features],
    y=df_train_oversample['Class'],
)
# Score the untouched test set; keep the second probability column (index 1).
y_pred = clf.predict_proba(df_test[features])[:, 1]

In [9]:
# ROC AUC of the predicted probabilities against the true test labels.
roc_auc_score(y_true=df_test['Class'], y_score=y_pred)

0.9691078498628766

In [10]:
# Build the kds decile table from the true test labels and the predicted probabilities
# (per-decile counts, response rates, cumulative percentages, KS and lift).
kds.metrics.decile_table(df_test_y, y_pred)

LABELS INFO:

 prob_min         : Minimum probability in a particular decile
 prob_max         : Minimum probability in a particular decile
 prob_avg         : Average probability in a particular decile
 cnt_events       : Count of events in a particular decile
 cnt_resp         : Count of responders in a particular decile
 cnt_non_resp     : Count of non-responders in a particular decile
 cnt_resp_rndm    : Count of responders if events assigned randomly in a particular decile
 cnt_resp_wiz     : Count of best possible responders in a particular decile
 resp_rate        : Response Rate in a particular decile [(cnt_resp/cnt_cust)*100]
 cum_events       : Cumulative sum of events decile-wise 
 cum_resp         : Cumulative sum of responders decile-wise 
 cum_resp_wiz     : Cumulative sum of best possible responders decile-wise 
 cum_non_resp     : Cumulative sum of non-responders decile-wise 
 cum_events_pct   : Cumulative sum of percentages of events decile-wise 
 cum_resp_pct     : Cu

Unnamed: 0,decile,prob_min,prob_max,prob_avg,cnt_cust,cnt_resp,cnt_non_resp,cnt_resp_rndm,cnt_resp_wiz,resp_rate,cum_cust,cum_resp,cum_resp_wiz,cum_non_resp,cum_cust_pct,cum_resp_pct,cum_resp_pct_wiz,cum_non_resp_pct,KS,lift
0,1,0.182,1.0,0.397,8545.0,134.0,8411.0,14.8,148,1.568,8545.0,134.0,148,8411.0,10.001,90.541,100.0,9.861,80.68,9.053
1,2,0.095,0.182,0.131,8544.0,6.0,8538.0,14.8,0,0.07,17089.0,140.0,148,16949.0,20.0,94.595,100.0,19.871,74.724,4.73
2,3,0.064,0.095,0.077,8544.0,2.0,8542.0,14.8,0,0.023,25633.0,142.0,148,25491.0,30.0,95.946,100.0,29.886,66.06,3.198
3,4,0.044,0.063,0.053,8545.0,2.0,8543.0,14.8,0,0.023,34178.0,144.0,148,34034.0,40.001,97.297,100.0,39.902,57.395,2.432
4,5,0.031,0.044,0.037,8544.0,2.0,8542.0,14.8,0,0.023,42722.0,146.0,148,42576.0,50.001,98.649,100.0,49.916,48.733,1.973
5,6,0.022,0.031,0.026,8544.0,2.0,8542.0,14.8,0,0.023,51266.0,148.0,148,51118.0,60.0,100.0,100.0,59.931,40.069,1.667
6,7,0.016,0.022,0.019,8545.0,0.0,8545.0,14.8,0,0.0,59811.0,148.0,148,59663.0,70.001,100.0,100.0,69.949,30.051,1.429
7,8,0.011,0.016,0.013,8544.0,0.0,8544.0,14.8,0,0.0,68355.0,148.0,148,68207.0,80.001,100.0,100.0,79.966,20.034,1.25
8,9,0.007,0.011,0.009,8544.0,0.0,8544.0,14.8,0,0.0,76899.0,148.0,148,76751.0,90.0,100.0,100.0,89.983,10.017,1.111
9,10,0.0,0.007,0.004,8544.0,0.0,8544.0,14.8,0,0.0,85443.0,148.0,148,85295.0,100.0,100.0,100.0,100.0,0.0,1.0


In [11]:
# Put the true test labels and the predicted probabilities side by side,
# indexed like the test labels.
combine_data = df_test_y.to_frame(name="Actual").assign(Predictions=y_pred)

In [12]:
# Bucket the predicted probabilities into 100 quantile-based bins and label them
# 'r1' (lowest scores) through 'r100' (highest scores).
# The labels are generated rather than typed out by hand, so the list cannot
# silently contain a missing, duplicated or misordered label.
combine_data['score_rank'] = pd.qcut(
    combine_data['Predictions'],
    100,
    labels=[f'r{rank}' for rank in range(1, 101)],
)

In [13]:
# Display the combined frame (Actual, Predictions, score_rank) to inspect the bin assignment.
combine_data

Unnamed: 0,Actual,Predictions,score_rank
43446,0,0.010498,r19
170593,0,0.019718,r38
223374,0,0.192964,r91
197224,0,0.100861,r82
41941,0,0.075756,r75
...,...,...,...
198999,0,0.067820,r72
247149,0,0.111691,r84
147383,0,0.024813,r44
175286,0,0.231917,r93


In [14]:
# Count actual classes within each score bin (rows: score_rank, columns: Actual).
pd.crosstab(
    index=combine_data['score_rank'],
    columns=combine_data['Actual'],
)

Actual,0,1
score_rank,Unnamed: 1_level_1,Unnamed: 2_level_1
r1,855,0
r2,854,0
r3,855,0
r4,855,0
r5,854,0
...,...,...
r96,854,1
r97,853,1
r98,853,2
r99,852,2


In [15]:
# Persist the labels, predictions and score bins to a CSV in the working directory.
combine_data.to_csv(path_or_buf='combine_data_cc_fraud.csv')

In [19]:
# statsmodels logit specification on the oversampled training data:
# endog is the Class label, exog the selected feature columns.
# NOTE(review): no constant column is added to exog, and statsmodels does not add an
# intercept on its own — confirm a no-intercept model is intended
# (otherwise wrap exog in sm.add_constant).
lr_model = sm.Logit(
    endog=df_train_oversample['Class'],
    exog=df_train_oversample[features],
)

In [20]:
# Estimate the logit model using the Newton solver; prints the optimizer's status.
result = lr_model.fit(
    method='newton',
)

Optimization terminated successfully.
         Current function value: 0.365695
         Iterations 14


In [21]:
# Show the fitted model summary: fit statistics plus the per-feature coefficient table
# (coef, std err, z, p-value, 95% interval).
result.summary()

0,1,2,3
Dep. Variable:,Class,No. Observations:,398040.0
Model:,Logit,Df Residuals:,398017.0
Method:,MLE,Df Model:,22.0
Date:,"Sat, 13 Aug 2022",Pseudo R-squ.:,0.4724
Time:,17:37:18,Log-Likelihood:,-145560.0
converged:,True,LL-Null:,-275900.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
V1,-0.5765,0.006,-92.403,0.000,-0.589,-0.564
V2,0.5178,0.006,84.613,0.000,0.506,0.530
V3,-1.2940,0.013,-102.881,0.000,-1.319,-1.269
V4,0.9018,0.008,113.249,0.000,0.886,0.917
V5,-0.7525,0.009,-86.523,0.000,-0.770,-0.735
V7,-1.6344,0.017,-96.595,0.000,-1.668,-1.601
V9,-0.7862,0.009,-88.807,0.000,-0.804,-0.769
V10,-1.8938,0.019,-100.474,0.000,-1.931,-1.857
V11,1.3558,0.013,102.220,0.000,1.330,1.382


In [22]:
# Show the alternative summary layout, which additionally reports AIC and BIC.
result.summary2()

0,1,2,3
Model:,Logit,Pseudo R-squared:,0.472
Dependent Variable:,Class,AIC:,291168.2152
Date:,2022-08-13 17:37,BIC:,291418.7842
No. Observations:,398040,Log-Likelihood:,-145560.0
Df Model:,22,LL-Null:,-275900.0
Df Residuals:,398017,LLR p-value:,0.0
Converged:,1.0000,Scale:,1.0
No. Iterations:,14.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
V1,-0.5765,0.0062,-92.4035,0.0000,-0.5887,-0.5642
V2,0.5178,0.0061,84.6126,0.0000,0.5058,0.5298
V3,-1.2940,0.0126,-102.8813,0.0000,-1.3186,-1.2693
V4,0.9018,0.0080,113.2487,0.0000,0.8862,0.9174
V5,-0.7525,0.0087,-86.5234,0.0000,-0.7695,-0.7354
V7,-1.6344,0.0169,-96.5951,0.0000,-1.6675,-1.6012
V9,-0.7862,0.0089,-88.8073,0.0000,-0.8036,-0.7689
V10,-1.8938,0.0188,-100.4744,0.0000,-1.9308,-1.8569
V11,1.3558,0.0133,102.2199,0.0000,1.3299,1.3818
