In [12]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.utils import resample
from imblearn.over_sampling import SMOTE


data=pd.read_csv("Creditcard_data.csv")

X=data.drop("Class",axis=1)
y=data["Class"]

scaler=StandardScaler()
X=scaler.fit_transform(X)

df=pd.DataFrame(X)
df["Class"]=y.values
df.columns=df.columns.astype(str)


smote=SMOTE(random_state=42)
X_bal,y_bal=smote.fit_resample(df.drop("Class",axis=1),df["Class"])

balanced_df=pd.DataFrame(X_bal)
balanced_df["Class"]=y_bal.values
balanced_df.columns=balanced_df.columns.astype(str)


def simple_random_sampling(df,fraction=0.7):
    return df.sample(frac=fraction,random_state=42)

def stratified_sampling(df,fraction=0.7):
    return pd.concat([
        x.sample(frac=fraction,random_state=42)
        for _,x in df.groupby("Class")
    ])

def systematic_sampling(df,fraction=0.7):
    k=int(1/fraction)
    start=np.random.randint(0,k)
    return df.iloc[start::k]

def cluster_sampling(df,n_clusters=10):
    df_copy=df.copy()
    df_copy["Cluster"]=df_copy.index%n_clusters
    selected=np.random.choice(df_copy["Cluster"].unique(),n_clusters//2,replace=False)
    return df_copy[df_copy["Cluster"].isin(selected)].drop("Cluster",axis=1)

def bootstrap_sampling(df):
    return resample(df,replace=True,n_samples=len(df),random_state=42)


sampling_methods={
"SRS":simple_random_sampling,
"Stratified":stratified_sampling,
"Systematic":systematic_sampling,
"Cluster":cluster_sampling,
"Bootstrap":bootstrap_sampling
}


models={
"M1_Logistic":LogisticRegression(max_iter=1000),
"M2_DecisionTree":DecisionTreeClassifier(),
"M3_RandomForest":RandomForestClassifier(),
"M4_KNN":KNeighborsClassifier(),
"M5_NaiveBayes":GaussianNB()
}


results={}

for samp_name,samp_func in sampling_methods.items():
    sampled_df=samp_func(balanced_df)

    X_sampled=sampled_df.drop("Class",axis=1)
    y_sampled=sampled_df["Class"]

    X_train,X_test,y_train,y_test=train_test_split(
        X_sampled,y_sampled,test_size=0.3,random_state=42
    )

    for model_name,model in models.items():
        model.fit(X_train,y_train)
        y_pred=model.predict(X_test)
        acc=accuracy_score(y_test,y_pred)
        results[(model_name,samp_name)]=round(acc*100,2)


result_df=pd.DataFrame(
[(k[0],k[1],v)for k,v in results.items()],
columns=["Model","Sampling","Accuracy (%)"]
)

pivot_df=result_df.pivot(index="Model",columns="Sampling",values="Accuracy (%)")

pd.set_option("display.width",1000)
pd.set_option("display.max_columns",None)

print(pivot_df)


Sampling         Bootstrap  Cluster    SRS  Stratified  Systematic
Model                                                             
M1_Logistic          93.67    89.13  90.65       92.83       91.70
M2_DecisionTree      99.13    96.96  98.13       95.64       97.16
M3_RandomForest     100.00    99.57  99.38       99.69       99.34
M4_KNN               96.29    90.43  95.02       91.28       94.54
M5_NaiveBayes        76.42    73.04  68.54       79.44       73.36
