In [1]:
! git clone https://github.com/AnjulaMehto/Sampling_DataSet.git

Cloning into 'Sampling_DataSet'...
remote: Enumerating objects: 13, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 13 (delta 2), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (13/13), 466.53 KiB | 715.00 KiB/s, done.
Resolving deltas: 100% (2/2), done.


In [2]:
import pandas as pd
import numpy as np

np.random.seed(0)

df = pd.read_csv("Sampling_DataSet/Creditcard_data.csv")

sample_size = 5

random_sample = df.sample(n=sample_size, random_state=0)
print(random_sample)

     Time        V1        V2        V3        V4        V5        V6  \
545   409 -0.544922  0.595407  1.813261 -1.344670  0.016864 -0.601398   
679   513  1.255258  0.075190  0.225733  0.881766  0.154508  0.631960   
400   290 -0.695818  0.581773  2.378180  0.063396  0.329119 -0.449865   
14     12 -2.791855 -0.327771  1.641750  1.767473 -0.136588  0.807596   
548   410 -1.086133 -0.704548  2.329021 -0.885715  0.617677  0.478894   

           V7        V8        V9  ...       V21       V22       V23  \
545  0.660876 -0.058978  0.317033  ... -0.123048 -0.148228 -0.076075   
679 -0.385968  0.189493  0.447980  ...  0.088457  0.321206 -0.235167   
400  1.269104 -0.758363  0.381712  ... -0.327948 -0.369683 -0.426987   
14  -0.422911 -1.907107  0.755713  ...  1.151663  0.222182  1.020586   
548 -0.267414  0.354042  0.558999  ...  0.021119  0.213192  0.186858   

          V24       V25       V26       V27       V28  Amount  Class  
545  0.074036 -0.486633  0.724549  0.104294 -0.055110    

In [3]:
import pandas as pd
import math

df = pd.read_csv("Sampling_DataSet/Creditcard_data.csv")

n = len(df)

k = int(math.sqrt(n))

sample = df.iloc[::k]

print(sample.head())

     Time        V1        V2        V3        V4        V5        V6  \
0       0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388   
27     23  1.322707 -0.174041  0.434555  0.576038 -0.836758 -0.831083   
54     37  1.295668  0.341483  0.081505  0.566746 -0.110459 -0.766325   
81     52  1.147369  0.059035  0.263632  1.211023 -0.044096  0.301067   
108    73  1.162281  1.248178 -1.581317  1.475024  1.138357 -1.020373   

           V7        V8        V9  ...       V21       V22       V23  \
0    0.239599  0.098698  0.363787  ... -0.018307  0.277838 -0.110474   
27  -0.264905 -0.220982 -1.071425  ... -0.284376 -0.323357 -0.037710   
54   0.073155 -0.168304  0.071837  ... -0.323607 -0.929781  0.063809   
81  -0.132960  0.227885  0.252191  ... -0.087813 -0.110756 -0.097771   
108  0.638387 -0.136762 -0.805505  ... -0.124012 -0.227150 -0.199185   

          V24       V25       V26       V27       V28  Amount  Class  
0    0.066928  0.128539 -0.189115  0.133558 -0.021053  14

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import NearMiss
from imblearn.combine import SMOTEENN

df = pd.read_csv('Sampling_DataSet/Creditcard_data.csv')

X = df.drop(columns=['Class'])
y = df['Class']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

samplers = {
    'Sampling1': SMOTE(),
    'Sampling2': RandomUnderSampler(),
    'Sampling3': RandomOverSampler(),
    'Sampling4': NearMiss(),
    'Sampling5': SMOTEENN()
}

models = {
    'M1': AdaBoostClassifier(),
    'M2': DecisionTreeClassifier(),
    'M3': ExtraTreesClassifier(),
    'M4': LinearDiscriminantAnalysis(),
    'M5': GaussianProcessClassifier()
}

results = pd.DataFrame(columns=["Model", "Sampling", "Accuracy"])

for sampling_name, sampler in samplers.items():
    X_sampled, y_sampled = sampler.fit_resample(X_train, y_train)

    for model_name, model in models.items():
        model.fit(X_sampled, y_sampled)
        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        new_row = pd.DataFrame([{"Model": model_name, "Sampling": sampling_name, "Accuracy": acc}])
        results = pd.concat([results, new_row], ignore_index=True)

print(results)

best_sampling_technique = results.loc[results.groupby("Model")["Accuracy"].idxmax()]

print("\nBest Sampling Technique for Each Model:")
print(best_sampling_technique)

results.to_csv('sampling_results.csv', index=False)
best_sampling_technique.to_csv('best_sampling_techniques.csv', index=False)


  results = pd.concat([results, new_row], ignore_index=True)


   Model   Sampling  Accuracy
0     M1  Sampling1  0.956897
1     M2  Sampling1  0.956897
2     M3  Sampling1  0.987069
3     M4  Sampling1  0.844828
4     M5  Sampling1  0.935345
5     M1  Sampling2  0.810345
6     M2  Sampling2  0.810345
7     M3  Sampling2  0.780172
8     M4  Sampling2  0.568966
9     M5  Sampling2  0.375000
10    M1  Sampling3  0.987069
11    M2  Sampling3  0.952586
12    M3  Sampling3  0.987069
13    M4  Sampling3  0.823276
14    M5  Sampling3  0.935345
15    M1  Sampling4  0.413793
16    M2  Sampling4  0.474138
17    M3  Sampling4  0.077586
18    M4  Sampling4  0.534483
19    M5  Sampling4  0.534483
20    M1  Sampling5  0.961207
21    M2  Sampling5  0.952586
22    M3  Sampling5  0.982759
23    M4  Sampling5  0.814655
24    M5  Sampling5  0.918103

Best Sampling Technique for Each Model:
   Model   Sampling  Accuracy
10    M1  Sampling3  0.987069
1     M2  Sampling1  0.956897
2     M3  Sampling1  0.987069
3     M4  Sampling1  0.844828
4     M5  Sampling1  0.935345