In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from imblearn.under_sampling import RandomUnderSampler, NearMiss
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTEENN

# ----------------------------
# 1. Load Dataset
# ----------------------------
url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
data = pd.read_csv(url)

X = data.drop("Class", axis=1)
y = data["Class"]

# Standardize
scaler = StandardScaler()
X = scaler.fit_transform(X)

# ----------------------------
# 2. Sampling Techniques
# ----------------------------
samplers = {
    "RandomUnder": RandomUnderSampler(),
    "RandomOver": RandomOverSampler(),
    "SMOTE": SMOTE(),
    "NearMiss": NearMiss(),
    "SMOTEENN": SMOTEENN()
}

# ----------------------------
# 3. Models
# ----------------------------
models = {
    "Logistic": LogisticRegression(max_iter=1000),
    "DecisionTree": DecisionTreeClassifier(),
    "RandomForest": RandomForestClassifier(),
    "KNN": KNeighborsClassifier(),
    "NaiveBayes": GaussianNB()
}

# ----------------------------
# 4. Run experiments
# ----------------------------
results = pd.DataFrame()

for samp_name, sampler in samplers.items():
    X_res, y_res = sampler.fit_resample(X, y)

    X_train, X_test, y_train, y_test = train_test_split(
        X_res, y_res, test_size=0.3, random_state=42
    )

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)

        results.loc[model_name, samp_name] = round(acc * 100, 2)

# ----------------------------
# 5. Print results
# ----------------------------
print("\nAccuracy Table (%):")
print(results)

# Best sampling for each model
print("\nBest Sampling Technique per Model:")
print(results.idxmax(axis=1))



Accuracy Table (%):
              RandomUnder  RandomOver  SMOTE  NearMiss  SMOTEENN
Logistic            33.33       91.70  91.48    100.00     93.64
DecisionTree        50.00       98.69  96.29     83.33     97.50
RandomForest        33.33      100.00  99.34     83.33     99.32
KNN                 16.67       96.51  94.54     83.33     97.73
NaiveBayes          66.67       77.51  73.36     83.33     71.14

Best Sampling Technique per Model:
Logistic          NearMiss
DecisionTree    RandomOver
RandomForest    RandomOver
KNN               SMOTEENN
NaiveBayes        NearMiss
dtype: object
