In [68]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.utils import resample



In [69]:
# Read the CSV into a DataFrame, then display the number of rows per value of
# the "Class" column so the class imbalance is visible before any sampling.
data = pd.read_csv(filepath_or_buffer="Creditcard_data.csv")
data["Class"].value_counts()


Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,763
1,9


In [70]:
# Balance the two classes by undersampling: keep every Class == 1 row and
# draw the same number of Class == 0 rows.
# `resample` comes from the notebook's import cell (sklearn.utils).
df_majority = data[data["Class"] == 0]
df_minority = data[data["Class"] == 1]

# replace=False draws without replacement, so no majority row is picked twice;
# the fixed random_state makes the draw repeatable.
df_majority_down = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=42,
)

# Stack the reduced majority rows on top of the minority rows and show the
# resulting per-class counts.
balanced_data = pd.concat([df_majority_down, df_minority])
balanced_data["Class"].value_counts()


Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,9
1,9


In [71]:
# Simple random sampling: draw a random 30% of the rows of `data`,
# seeded so the same rows are selected on every run.
sample_random = data.sample(
    random_state=42,
    frac=0.3,
)


In [72]:
# Systematic sampling: take every k-th row of `data`, starting at the first row.
# The step k is the row count divided by the 30% target size, truncated to an int.
n_rows = len(data)
target_rows = 0.3 * n_rows
k = int(n_rows / target_rows)
sample_systematic = data.iloc[slice(None, None, k)]


In [73]:
# Stratified sampling: keep 30% of the rows while preserving the class ratio.
#
# X / y were previously used here without being defined in any cell, which
# fails with NameError on a fresh kernel. They are derived from the full
# (unbalanced) `data` frame, consistent with the cross-validation cell, which
# indexes `data` with positions produced from the same X / y.
X = data.drop("Class", axis=1)
y = data["Class"]

# test_size=0.7 discards 70% of the rows; only the "train" part is kept as
# the sample. stratify=y keeps the class proportions in that part.
X_train, _, y_train, _ = train_test_split(
    X, y, test_size=0.7, stratify=y, random_state=42
)

# Re-attach the target column so the sample has the same columns as `data`.
sample_stratified = pd.concat([X_train, y_train], axis=1)


In [82]:
# Bootstrap sampling: draw, with replacement, as many rows as `balanced_data`
# contains, so individual rows may appear more than once or not at all.
n_balanced_rows = len(balanced_data)
sample_bootstrap = balanced_data.sample(
    n=n_balanced_rows,
    replace=True,
    random_state=42,
)

# Show how many rows of each class ended up in the bootstrap sample.
bootstrap_class_counts = sample_bootstrap["Class"].value_counts()
print(bootstrap_class_counts)


Class
0    10
1     8
Name: count, dtype: int64


In [83]:
# "Bootstrap Sampling" is registered in the `samples` dictionary literal that
# is built in a later cell. Assigning into `samples` at this point would raise
# NameError on a fresh Restart & Run All, because `samples` does not exist yet,
# so no assignment is performed here.


In [75]:
# Cross-validation sampling: use the training portion of the first stratified
# fold (4 of 5 folds) as the sample.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Derive features/target from `data` directly instead of relying on X / y
# having been left in the kernel by some other cell; the positions returned by
# split() are then guaranteed to refer to rows of `data`.
cv_features = data.drop("Class", axis=1)
cv_target = data["Class"]

# split() yields (train_positions, test_positions) per fold; only the first
# fold's training positions are needed.
train_index = next(skf.split(cv_features, cv_target))[0]
sample_cv = data.iloc[train_index]


In [76]:
# Collect every sample under a human-readable name. Insertion order matters:
# it is the order in which the samples are evaluated later.
samples = {}
samples["Simple Random Sampling"] = sample_random
samples["Systematic Sampling"] = sample_systematic
samples["Stratified Sampling"] = sample_stratified
samples["Bootstrap Sampling"] = sample_bootstrap
samples["Cross Validation Sampling"] = sample_cv

print(samples.keys())


dict_keys(['Simple Random Sampling', 'Systematic Sampling', 'Stratified Sampling', 'Bootstrap Sampling', 'Cross Validation Sampling'])


In [84]:
# Accumulator for [sampling method, model name, accuracy] rows.
results = list()


In [78]:
# Classifiers to compare, keyed by display name. The tree-based models are
# seeded; the remaining ones use their default settings apart from the raised
# iteration cap on logistic regression.
models = dict(
    [
        ("Logistic Regression", LogisticRegression(max_iter=1000)),
        ("Decision Tree", DecisionTreeClassifier(random_state=42)),
        ("Random Forest", RandomForestClassifier(random_state=42)),
        ("KNN", KNeighborsClassifier()),
        ("SVM", SVC()),
    ]
)


In [85]:
# Train and score every model on every sample.
#
# Start from an empty list so that re-running this cell replaces the previous
# results instead of appending a duplicate set of rows.
results = []

for s_name, sample in samples.items():
    # Separate the feature columns from the "Class" target.
    X_s = sample.drop("Class", axis=1)
    y_s = sample["Class"]

    print(f"Running → {s_name}")

    # A classifier cannot be trained on a sample that contains only one class.
    if y_s.nunique() < 2:
        print(f"Skipped {s_name} (single class)")
        continue

    # A stratified split needs at least two rows of every class (one for each
    # side of the split); train_test_split raises ValueError otherwise. With
    # very few positive rows a random sample can easily end up with just one.
    if y_s.value_counts().min() < 2:
        print(f"Skipped {s_name} (a class has fewer than 2 rows)")
        continue

    # Hold out 30% of the sample for scoring, preserving the class ratio.
    X_train, X_test, y_train, y_test = train_test_split(
        X_s, y_s, test_size=0.3, stratify=y_s, random_state=42
    )

    for m_name, model in models.items():
        # fit() retrains the estimator from scratch, so the same instance can
        # be reused across samples.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred) * 100  # accuracy as a percentage
        results.append([s_name, m_name, acc])



Running → Simple Random Sampling
Running → Systematic Sampling
Running → Stratified Sampling
Running → Bootstrap Sampling
Running → Cross Validation Sampling


In [86]:
# Turn the accumulated result rows into a labelled table.
result_columns = ["Sampling Method", "Model", "Accuracy"]
results_df = pd.DataFrame(data=results, columns=result_columns)

# Print how many result rows each sampling method produced.
rows_per_method = results_df["Sampling Method"].value_counts()
print(rows_per_method)


Sampling Method
Simple Random Sampling       5
Systematic Sampling          5
Stratified Sampling          5
Bootstrap Sampling           5
Cross Validation Sampling    5
Name: count, dtype: int64


In [88]:
# Column order for the final comparison table.
sampling_order = [
    "Simple Random Sampling",
    "Systematic Sampling",
    "Stratified Sampling",
    "Bootstrap Sampling",
    "Cross Validation Sampling",
]

# One row per model, one column per sampling method; cells hold the mean
# accuracy for that combination.
mean_accuracy = results_df.pivot_table(
    values="Accuracy",
    index="Model",
    columns="Sampling Method",
    aggfunc="mean",
)

# Round to two decimals, then arrange the columns in the order listed above.
final_table = mean_accuracy.round(2).reindex(columns=sampling_order)

final_table


Sampling Method,Simple Random Sampling,Systematic Sampling,Stratified Sampling,Bootstrap Sampling,Cross Validation Sampling
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Decision Tree,97.14,92.31,97.14,83.33,97.85
KNN,98.57,98.72,98.57,33.33,98.92
Logistic Regression,98.57,98.72,98.57,100.0,98.92
Random Forest,98.57,98.72,98.57,83.33,98.92
SVM,98.57,98.72,98.57,33.33,98.92


In [89]:
# For each model (row), find the sampling method (column label) whose
# accuracy is the highest in that row.
best_sampling_per_model = final_table.idxmax(axis="columns")
best_sampling_per_model



Unnamed: 0_level_0,0
Model,Unnamed: 1_level_1
Decision Tree,Cross Validation Sampling
KNN,Cross Validation Sampling
Logistic Regression,Bootstrap Sampling
Random Forest,Cross Validation Sampling
SVM,Cross Validation Sampling


In [90]:
# Flatten the table into one Series indexed by (model, sampling method), then
# report the pair with the highest accuracy together with that accuracy.
stacked_accuracy = final_table.stack()
best_overall = stacked_accuracy.idxmax()
(best_overall, stacked_accuracy.max())


(('Logistic Regression', 'Bootstrap Sampling'), 100.0)