In [90]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [91]:
data=pd.read_csv(r"C:\Users\HP\Downloads\Creditcard_data.csv")

In [92]:
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [93]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 772 entries, 0 to 771
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    772 non-null    int64  
 1   V1      772 non-null    float64
 2   V2      772 non-null    float64
 3   V3      772 non-null    float64
 4   V4      772 non-null    float64
 5   V5      772 non-null    float64
 6   V6      772 non-null    float64
 7   V7      772 non-null    float64
 8   V8      772 non-null    float64
 9   V9      772 non-null    float64
 10  V10     772 non-null    float64
 11  V11     772 non-null    float64
 12  V12     772 non-null    float64
 13  V13     772 non-null    float64
 14  V14     772 non-null    float64
 15  V15     772 non-null    float64
 16  V16     772 non-null    float64
 17  V17     772 non-null    float64
 18  V18     772 non-null    float64
 19  V19     772 non-null    float64
 20  V20     772 non-null    float64
 21  V21     772 non-null    float64
 22  V2

In [94]:
target_column = 'Class'
data[target_column].value_counts()

Class
0    763
1      9
Name: count, dtype: int64

In [95]:
X= data.drop(target_column, axis=1)  
y= data[target_column]   
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=50, stratify=y)
#splitting dataset to training and testing data
#test size=0.3 refers 30% data is used for testing 
#stratify prevents imbalancing

In [96]:
#applying SMOTE(oversampling) to balance the training data
smote= SMOTE(random_state=50)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

In [97]:
y_train.value_counts() #original dataset

Class
0    534
1      6
Name: count, dtype: int64

In [98]:
pd.Series(y_train_balanced).value_counts() #pd.series() is used because y_train balanced is NumPy array

Class
0    534
1    534
Name: count, dtype: int64

In [99]:
class_counts= y_train_balanced.value_counts()  
P= class_counts.min() / class_counts.sum()  # Proportion of the minority class
E= 0.05  # Margin of error
Z= 1.96  # 95% confidence

In [100]:
print(P)

0.5


In [101]:
n= (Z ** 2 * P * (1 - P)) / (E ** 2)
n= np.ceil(n) 
#n is the sample size

In [102]:
print(n)

385.0


In [103]:
#creating 5 samples
samples= []

for i in range(5):
    sample= pd.concat([X_train_balanced, y_train_balanced], axis=1)
    sampled_data= data.groupby(target_column, group_keys=False).apply(lambda x: x.sample(n=int(min(len(x), n)), random_state=50))
    Xsample= sample.drop(target_column, axis=1)
    ysample= sample[target_column]
    
    samples.append((Xsample, ysample))


In [104]:
print(samples)

[(      Time        V1        V2        V3        V4        V5        V6  \
0       74  1.038370  0.127486  0.184456  1.109950  0.441699  0.945283   
1      551 -0.474661  0.316097  2.446240  0.961007  0.311652  1.857654   
2      559  0.940965 -1.868811  0.426719 -1.717117 -1.861869 -0.357676   
3      206 -0.370563  0.599810  0.216683 -0.513093  2.697143  3.882795   
4      335 -0.779988  1.395775  1.780828 -0.187325  0.953850 -0.511395   
...    ...       ...       ...       ...       ...       ...       ...   
1063   407 -2.329114  1.834005 -1.547540  3.958434 -0.478728 -1.418192   
1064   315 -1.187440  1.446945 -1.006855  2.955843 -0.474664 -1.314201   
1065   412 -2.284523  1.860270 -1.474778  3.773120 -0.452509 -1.371744   
1066   409 -2.347650  1.704508 -1.479150  3.915112 -0.431028 -1.409024   
1067   155  0.987861  0.323743  0.381401  0.572700 -0.256201 -1.000808   

            V7        V8        V9  ...       V20       V21       V22  \
0    -0.036715  0.350995  0.118950  

In [105]:
sampling_techniques= {
    'S1': SMOTE(random_state=50) ,  
    'S2': RandomUnderSampler(random_state=50),  
    'S3': RandomOverSampler(random_state=50), 
    'S4': TomekLinks(),  
    'S5': NearMiss()
}

In [106]:
models= {
    'M1': RandomForestClassifier(random_state=50) ,
    'M2': DecisionTreeClassifier(random_state=50),
    'M3': LogisticRegression(random_state=50),
    'M4': SVC(random_state=50),
    'M5': KNeighborsClassifier()
}

In [107]:
result= {model: {sampling: [] for sampling in sampling_techniques} for model in models}

# Iterating over each sample
for Xsample, ysample in samples:
    X_train,X_test,y_train,y_test= train_test_split(Xsample, ysample, test_size=0.3, random_state=50, stratify=ysample)
    
    # Iterate over each sampling technique
    for sample_name, sampler in sampling_techniques.items():
        # Resampling the training data
        X_train_resampled,y_train_resampled= sampler.fit_resample(X_train, y_train)
        
        # Scaling the data (important for LogisticRegression and SVC)
        scaler= StandardScaler()
        Xtrain_scaled= scaler.fit_transform(X_train_resampled)
        Xtest_scaled= scaler.transform(X_test)
              
        for model_name, model in models.items():
            model.fit(Xtrain_scaled, y_train_resampled)
            
            y_pred= model.predict(Xtest_scaled)
            accuracy= accuracy_score(y_test, y_pred)
            
            result[model_name][sample_name].append(accuracy)

accuracy_matrix= pd.DataFrame(index=models.keys(), columns=sampling_techniques.keys())

for model_name in models:
    for sample_name in sampling_techniques:
        accuracy_matrix.at[model_name, sample_name] = round(sum(result[model_name][sample_name]) / len(result[model_name][sample_name]), 2)

print(accuracy_matrix)

      S1    S2    S3    S4    S5
M1   1.0   1.0   1.0   1.0   1.0
M2  0.98  0.98  0.98  0.98  0.98
M3  0.95  0.95  0.95  0.95  0.95
M4   1.0   1.0   1.0   1.0   1.0
M5  0.96  0.96  0.96  0.96  0.96
