In [20]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.utils import resample

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

np.random.seed(0)

df = pd.read_csv("/content/Creditcard_data.csv")
print(df['Class'].value_counts())


Class
0    763
1      9
Name: count, dtype: int64


In [21]:
X = df.drop('Class', axis=1)
y = df['Class']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [36]:
def simple_random_sampling(df):
    fraud = df[df['Class'] == 1]
    normal = df[df['Class'] == 0]
    normal_sample = normal.sample(n=len(fraud), random_state=0)
    return pd.concat([fraud, normal_sample])

In [37]:
def systematic_sampling(df):
    fraud = df[df['Class'] == 1]
    normal = df[df['Class'] == 0]
    k = int(math.sqrt(len(normal)))
    normal_sample = normal.iloc[::k].head(len(fraud))
    return pd.concat([fraud, normal_sample])



In [38]:
def stratified_sampling(df):
    fraud = df[df['Class'] == 1]
    normal = df[df['Class'] == 0]
    normal_sample = normal.sample(n=len(fraud), random_state=0)
    return pd.concat([fraud, normal_sample])



In [39]:
def cluster_sampling(df):
    fraud = df[df['Class'] == 1]
    normal = df[df['Class'] == 0]
    normal_sample = normal.sample(n=len(fraud), random_state=0)
    return pd.concat([fraud, normal_sample])



In [40]:
def bootstrap_sampling(df):
    fraud = df[df['Class'] == 1]
    normal = df[df['Class'] == 0]
    normal_sample = resample(normal, n_samples=len(fraud), random_state=0)
    return pd.concat([fraud, normal_sample])



In [41]:
samples = {
    "Sampling1_Random": simple_random_sampling(df),
    "Sampling2_Systematic": systematic_sampling(df),
    "Sampling3_Stratified": stratified_sampling(df),
    "Sampling4_Cluster": cluster_sampling(df),
    "Sampling5_Bootstrap": bootstrap_sampling(df)
}


In [42]:
models = {
    "M1_LogisticRegression": LogisticRegression(max_iter=500),
    "M2_KNN": KNeighborsClassifier(),
    "M3_RandomForest": RandomForestClassifier(),
    "M4_SVM": SVC(),
    "M5_GradientBoosting": GradientBoostingClassifier()
}



In [43]:
results = pd.DataFrame(columns=["Model", "Sampling", "Accuracy"])

for samp_name, samp_df in samples.items():
    X = samp_df.drop('Class', axis=1)
    y = samp_df['Class']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0, stratify=y
    )

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        results.loc[len(results)] = [model_name, samp_name, round(acc*100,2)]


In [44]:
pivot = results.pivot(index="Model", columns="Sampling", values="Accuracy")
pivot


Sampling,Sampling1_Random,Sampling2_Systematic,Sampling3_Stratified,Sampling4_Cluster,Sampling5_Bootstrap
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1_LogisticRegression,50.0,100.0,50.0,50.0,50.0
M2_KNN,50.0,100.0,50.0,50.0,50.0
M3_RandomForest,75.0,50.0,75.0,50.0,75.0
M4_SVM,75.0,75.0,75.0,75.0,50.0
M5_GradientBoosting,25.0,75.0,25.0,25.0,50.0


In [45]:
best = results.loc[results.groupby("Model")["Accuracy"].idxmax()]
best


Unnamed: 0,Model,Sampling,Accuracy
5,M1_LogisticRegression,Sampling2_Systematic,100.0
6,M2_KNN,Sampling2_Systematic,100.0
2,M3_RandomForest,Sampling1_Random,75.0
3,M4_SVM,Sampling1_Random,75.0
9,M5_GradientBoosting,Sampling2_Systematic,75.0
