In [94]:
import pandas as pd
import numpy as np
import scipy.stats as st
from sklearn.metrics import accuracy_score

In [95]:
df = pd.read_csv('Creditcard_data.csv')

In [96]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [97]:
d = dict(df['Class'].value_counts())
d

{0: 763, 1: 9}

In [98]:
def under_sampling(df: pd.DataFrame, target: str, random_state=None) -> pd.DataFrame:
    """Balance ``df`` by randomly down-sampling every class to the rarest class size.

    Parameters
    ----------
    df : pd.DataFrame
        Input data containing the label column ``target``.
    target : str
        Name of the class-label column.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for both the per-class draw and the
        final shuffle. The default ``None`` keeps the previous non-seeded
        behaviour; pass an int for a reproducible result.

    Returns
    -------
    pd.DataFrame
        Shuffled frame holding ``min(class counts)`` rows of each class. The
        original index labels and column dtypes are kept. An empty frame is
        returned when ``df`` has no labelled rows.
    """
    class_counts = df[target].value_counts()
    if class_counts.empty:
        # Nothing to balance; return an empty frame with the same columns.
        return df.iloc[0:0].copy()

    smallest = int(class_counts.min())

    # Collect the per-class pieces and concatenate once. Starting the concat
    # from an empty DataFrame(columns=...) turns every column into ``object``.
    pieces = [
        df[df[target] == label].sample(n=smallest, random_state=random_state)
        for label in class_counts.index
    ]
    return pd.concat(pieces).sample(frac=1, random_state=random_state)

In [99]:
und_df = under_sampling(df,'Class')

In [100]:
und_df['Class'].value_counts()

1    9
0    9
Name: Class, dtype: int64

In [101]:
def over_sampling(df: pd.DataFrame, target: str, random_state=None) -> pd.DataFrame:
    """Balance ``df`` by replicating rows until every class matches the largest class.

    Each class is repeated ``largest // size`` times in full, and a further
    ``largest % size`` rows are drawn from it at random without replacement.

    Parameters
    ----------
    df : pd.DataFrame
        Input data containing the label column ``target``.
    target : str
        Name of the class-label column.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for the remainder draw and the final
        shuffle. The default ``None`` keeps the previous non-seeded behaviour.

    Returns
    -------
    pd.DataFrame
        Shuffled frame with ``max(class counts)`` rows per class. Index labels
        of replicated rows repeat, and column dtypes are kept. An empty frame
        is returned when ``df`` has no labelled rows.
    """
    class_counts = df[target].value_counts()
    if class_counts.empty:
        # Nothing to balance; return an empty frame with the same columns.
        return df.iloc[0:0].copy()

    largest = int(class_counts.max())

    # Gather every piece first and concatenate once. Starting the concat from
    # an empty DataFrame(columns=...) turns every column into ``object``.
    pieces = []
    for label in class_counts.index:
        group = df[df[target] == label]
        copies, remainder = divmod(largest, len(group))
        pieces.extend([group] * copies)
        if remainder:
            pieces.append(group.sample(n=remainder, random_state=random_state))

    return pd.concat(pieces).sample(frac=1, random_state=random_state)
     

In [102]:
ovr_df = over_sampling(df,'Class')

In [103]:
ovr_df['Class'].value_counts()

0    763
1    763
Name: Class, dtype: int64

In [104]:
def Simple_Random_Sampling(df, z, e, p=0.5, random_state=None):
    """Draw a simple random sample sized by ``floor(z_score**2 * p * (1 - p) / e**2)``.

    Parameters
    ----------
    df : pd.DataFrame
        Population to sample from.
    z : float
        Cumulative probability handed to ``scipy.stats.norm.ppf`` to obtain the
        z-score (0.95 gives about 1.645).
        NOTE(review): this is the one-sided quantile; a two-sided 95% interval
        would need 0.975 here. Confirm which is intended.
    e : float
        Margin of error.
    p : float, default 0.5
        Assumed proportion.
    random_state : optional
        Forwarded to ``DataFrame.sample``. The default ``None`` keeps the
        previous non-seeded behaviour.

    Returns
    -------
    pd.DataFrame
        Rows drawn without replacement. The size is capped at ``len(df)``.
    """
    z_score = st.norm.ppf(z)
    n = (z_score**2) * (p * (1 - p)) // e**2
    # ``df.sample`` raises if asked for more rows than exist (replace=False),
    # so a small population simply returns all of its rows, shuffled.
    size = min(int(n), len(df))
    return df.sample(n=size, random_state=random_state)
    

In [105]:
Simple_Random_Sampling(df,0.95,0.05)['Class'].value_counts()

0    267
1      3
Name: Class, dtype: int64

In [106]:
def Sratified_Sampling(df, tar, z, e, p=0.5, random_state=None):
    """Sample the same fraction of rows from every class of column ``tar``.

    The target size is ``n = floor(z_score**2 * p * (1 - p) / (e / s)**2)``,
    where ``s`` is the number of distinct classes. Each class then contributes
    the fraction ``n / len(df)`` of its rows.

    Parameters
    ----------
    df : pd.DataFrame
        Population to sample from.
    tar : str
        Name of the stratification (class-label) column.
    z : float
        Cumulative probability handed to ``scipy.stats.norm.ppf``.
    e : float
        Margin of error; it is divided by the number of classes before use.
    p : float, default 0.5
        Assumed proportion.
    random_state : optional
        Forwarded to the per-group sampler. The default ``None`` keeps the
        previous non-seeded behaviour.

    Returns
    -------
    pd.DataFrame
        Sampled rows with all original columns, including ``tar``. The
        fraction is capped at 1.0, so a small population returns every row.
    """
    rows = df.shape[0]
    if rows == 0:
        # No rows to stratify; avoids dividing by zero below.
        return df.copy()

    s = df[tar].nunique()
    z_score = st.norm.ppf(z)
    n = (z_score**2) * (p * (1 - p)) // ((e / s) ** 2)

    # Sampling without replacement cannot take more than the whole group.
    frac = min(1.0, float(n / rows))

    # GroupBy.sample keeps the grouping column in the result; groupby.apply on
    # the full frame is deprecated for frames that include the grouping column.
    return df.groupby(tar).sample(frac=frac, random_state=random_state)

In [107]:
Sratified_Sampling(ovr_df,'Class',0.95,0.05)['Class'].value_counts()

0    541
1    541
Name: Class, dtype: int64

In [108]:
def Cluster_Sampling(df, tar, z, e, c, p=0.5, random_state=None):
    """Draw a random fraction ``(z_score**2 * p * (1 - p) / e**2) / (len(df) - c)`` of ``df``.

    NOTE(review): despite the name, no clusters are formed. Rows are drawn
    uniformly from the whole frame and ``tar`` is never read. Confirm whether
    real cluster sampling was intended.

    Parameters
    ----------
    df : pd.DataFrame
        Population to sample from.
    tar : str
        Unused; kept for interface compatibility.
    z : float
        Cumulative probability handed to ``scipy.stats.norm.ppf``.
    e : float
        Margin of error.
    c : int
        Value subtracted from the row count in the denominator.
    p : float, default 0.5
        Assumed proportion.
    random_state : optional
        Forwarded to ``DataFrame.sample``. The default ``None`` keeps the
        previous non-seeded behaviour.

    Returns
    -------
    pd.DataFrame
        Sampled rows. The fraction is capped at 1.0.

    Raises
    ------
    ValueError
        If ``c`` is not smaller than the number of rows, because the fraction
        would be undefined or negative.
    """
    rows = df.shape[0]
    if rows - c <= 0:
        raise ValueError(
            "c must be smaller than the number of rows (got c=%r, rows=%r)" % (c, rows)
        )

    z_score = st.norm.ppf(z)
    frac = ((z_score**2) * (p * (1 - p)) / (e**2)) / (rows - c)

    # Sampling without replacement cannot take more than the whole frame.
    return df.sample(frac=min(1.0, float(frac)), random_state=random_state)
    

In [109]:
Cluster_Sampling(ovr_df,'Class',0.95,0.05,300)['Class'].value_counts()

0    176
1    161
Name: Class, dtype: int64

In [110]:
def Systematic_sampling(df,k):
    """Return every ``k``-th row of ``df`` by position, starting with the first row."""
    positions = np.arange(0, len(df), k)
    return df.iloc[positions]

In [111]:
Systematic_sampling(ovr_df,5)['Class'].value_counts()

0    164
1    142
Name: Class, dtype: int64

In [112]:
print(max(df['Amount']),min(df['Amount']))

3828.04 0.0


Convenience sampling is a method of collecting data in which the investigator selects the items from the population that are most convenient to obtain.

In [113]:
def Convenience_Sampling(df,par,amt):
    """Keep only the rows of ``df`` whose ``par`` column is less than or equal to ``amt``."""
    within_limit = df[par] <= amt
    return df[within_limit]

Selecting rows where Amount is less than or equal to 5

In [114]:
Convenience_Sampling(ovr_df,'Amount',5)['Class'].value_counts()

1    678
0    177
Name: Class, dtype: int64

### Taking Samples from balanced data (Over Sampling) ###

In [115]:
# Build one training sample per sampling technique, all drawn from the
# oversampled (class-balanced) frame `ovr_df`.
# NOTE(review): the samplers draw at random without a seed, so the sample
# contents (and the accuracies below) change on every run.
sample1 = Simple_Random_Sampling(ovr_df,0.95,0.05)
sample2 = Systematic_sampling(ovr_df,5)
sample3 = Sratified_Sampling(ovr_df,'Class',0.95,0.05)
sample4 = Cluster_Sampling(ovr_df,'Class',0.95,0.05,300)
sample5 = Convenience_Sampling(ovr_df,'Amount',5)

In [116]:
# Order matters: it must match the row order of the comparison table built later.
samples = [sample1, sample2, sample3, sample4, sample5]

### Testing on whole dataset ###

In [117]:
# Evaluation arrays come from the complete original frame: every column except
# the last is a feature, and the last column is the integer class label.
# NOTE(review): the training samples are drawn from an oversampled copy of this
# same frame, so these rows are not held out. Confirm this is intended.
x_test = df.iloc[:, :-1].to_numpy()
y_test = df.iloc[:, -1].to_numpy().astype('int')

### Model 1  - Logistic Regression ###

In [118]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Fit a fresh scaler + logistic-regression pipeline on each sample and record
# its accuracy on the full original dataset (x_test / y_test).
logistic_acc = []
for sample in samples:
    x_train = sample.iloc[:, :-1].to_numpy()
    y_train = sample.iloc[:, -1].to_numpy().astype('int')
    classifier1 = make_pipeline(StandardScaler(), LogisticRegression())
    classifier1.fit(x_train, y_train)
    y_pred = classifier1.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    print(acc)
    logistic_acc.append(acc)
    

0.8290155440414507
0.8471502590673575
0.8652849740932642
0.8251295336787565
0.8095854922279793


### Model 2 - SVM ###

In [119]:
from sklearn.svm import SVC  
# Fit a fresh linear-kernel SVM on each sample and record its accuracy on the
# full original dataset (x_test / y_test).
svm_acc = []
for sample in samples:
    x_train = sample.iloc[:, :-1].to_numpy()
    y_train = sample.iloc[:, -1].to_numpy().astype('int')
    classifier2 = SVC(kernel='linear')
    classifier2.fit(x_train, y_train)
    y_pred = classifier2.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    print(acc)
    svm_acc.append(acc)

0.8264248704663213
0.8238341968911918
0.8691709844559585
0.8432642487046632
0.7448186528497409


### Model 3  - KNN ###

In [120]:
from sklearn.neighbors import KNeighborsClassifier
# Fit a fresh 7-nearest-neighbours classifier on each sample and record its
# accuracy on the full original dataset (x_test / y_test).
knn_acc = []
for sample in samples:
    x_train = sample.iloc[:, :-1].to_numpy()
    y_train = sample.iloc[:, -1].to_numpy().astype('int')
    classifier3 = KNeighborsClassifier(n_neighbors=7)
    classifier3.fit(x_train, y_train)
    y_pred = classifier3.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    print(acc)
    knn_acc.append(acc)

0.8147668393782384
0.8471502590673575
0.9650259067357513
0.8588082901554405
0.9248704663212435


### Model 4 - Decision Tree ###

In [121]:
from sklearn.tree import DecisionTreeClassifier
# Fit a fresh decision tree on each sample and record its accuracy on the full
# original dataset (x_test / y_test).
dtc_acc = []
for sample in samples:
    x_train = sample.iloc[:, :-1].to_numpy()
    y_train = sample.iloc[:, -1].to_numpy().astype('int')
    classifier4 = DecisionTreeClassifier()
    classifier4.fit(x_train, y_train)
    y_pred = classifier4.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    print(acc)
    dtc_acc.append(acc)

0.9624352331606217
0.9559585492227979
0.9948186528497409
0.9676165803108808
0.9987046632124352


### Model 5 - Naive Bayes ###

In [122]:
from sklearn.naive_bayes import GaussianNB
# Fit a fresh Gaussian naive-Bayes model on each sample and record its accuracy
# on the full original dataset (x_test / y_test).
gnb_acc = []
for sample in samples:
    x_train = sample.iloc[:, :-1].to_numpy()
    y_train = sample.iloc[:, -1].to_numpy().astype('int')
    classifier5 = GaussianNB()
    classifier5.fit(x_train, y_train)
    y_pred = classifier5.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    print(acc)
    gnb_acc.append(acc)

0.832901554404145
0.7940414507772021
0.9222797927461139
0.9417098445595855
0.9287564766839378


### COMPARISON ###

In [173]:
# One row per sampling technique (same order as `samples`), one accuracy column
# per model.
metrics = pd.DataFrame({
    'models': [
        'Simple_Random_Sampling',
        'Systematic_sampling',
        'Sratified_Sampling',
        'Cluster_Sampling',
        'Convenience_Sampling',
    ],
    'logistic_regression': logistic_acc,
    'SVM (linear)': svm_acc,
    'KNN (n = 7)': knn_acc,
    'Decision_tree': dtc_acc,
    'Naive_bayes': gnb_acc,
})

In [174]:
metrics

Unnamed: 0,models,logistic_regression,SVM (linear),KNN (n = 7),Decision_tree,Naive_bayes
0,Simple_Random_Sampling,0.829016,0.826425,0.814767,0.962435,0.832902
1,Systematic_sampling,0.84715,0.823834,0.84715,0.955959,0.794041
2,Sratified_Sampling,0.865285,0.869171,0.965026,0.994819,0.92228
3,Cluster_Sampling,0.82513,0.843264,0.858808,0.967617,0.94171
4,Convenience_Sampling,0.809585,0.744819,0.92487,0.998705,0.928756


### Applying TOPSIS to find best sampling technique for given dataset ###

In [168]:
import Topsis_Shubham_102067011 as top

In [175]:
# Rank the five sampling techniques with TOPSIS over the per-model accuracies,
# using equal weights ([1,1,1,1,1]) and a '+' impact for every column.
# NOTE(review): `top` is a project-local module not visible here; what each
# helper does is inferred from its name only. Confirm against its source.
# The helpers are given a copy of `metrics`, presumably so the raw accuracy
# table is not modified by them.
metrics1 = metrics.copy()
topsis_metrics = top.normalize(metrics1)
topsis_metrics = top.weightAssignment(topsis_metrics,[1,1,1,1,1])
ideals = top.ideal(topsis_metrics,['+','+','+','+','+'])
dist = top.euclideanDist(topsis_metrics,ideals[0],ideals[1])
scores = top.performance_score(dist[0],dist[1])
ranks = top.TOPSIS_result(topsis_metrics,scores)['Rank']
# Called a second time as the cell's last expression so the notebook displays
# the full result table.
top.TOPSIS_result(topsis_metrics,scores)

Unnamed: 0,models,logistic_regression,SVM (linear),KNN (n = 7),Decision_tree,Naive_bayes,Performance score,Rank
0,Simple_Random_Sampling,0.443767,0.449317,0.412271,0.44097,0.420452,0.33211,4
1,Systematic_sampling,0.453475,0.447909,0.428657,0.438002,0.400835,0.331952,5
2,Sratified_Sampling,0.463182,0.472558,0.488302,0.455807,0.46557,0.926409,1
3,Cluster_Sampling,0.441687,0.458473,0.434556,0.443344,0.475379,0.607868,2
4,Convenience_Sampling,0.433366,0.404949,0.467983,0.457588,0.46884,0.53939,3


In [177]:
# Attach the TOPSIS score and rank to the raw accuracy table, then display it.
# This mutates `metrics` in place, so the table no longer holds only the
# accuracy columns after this cell runs.
metrics['Performance score'] = scores
metrics['Rank'] = ranks
metrics

Unnamed: 0,models,logistic_regression,SVM (linear),KNN (n = 7),Decision_tree,Naive_bayes,Performance score,Rank
0,Simple_Random_Sampling,0.829016,0.826425,0.814767,0.962435,0.832902,0.33211,4
1,Systematic_sampling,0.84715,0.823834,0.84715,0.955959,0.794041,0.331952,5
2,Sratified_Sampling,0.865285,0.869171,0.965026,0.994819,0.92228,0.926409,1
3,Cluster_Sampling,0.82513,0.843264,0.858808,0.967617,0.94171,0.607868,2
4,Convenience_Sampling,0.809585,0.744819,0.92487,0.998705,0.928756,0.53939,3


Stratified Sampling has the highest performance score by a large margin.

Therefore stratified sampling is the best sampling method for the given dataset.

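Decision Tree gives the highest single accuracy (99.8%, when trained on the Convenience Sampling sample). Note that every model is scored on the full original dataset, which is heavily imbalanced (763 vs. 9 rows) and overlaps the data the training samples were drawn from, so these accuracies are optimistic.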