In [4]:
import pandas as pd

# -------------------- Load Dataset --------------------
data_path = "/kaggle/input/creditcard-data/Creditcard_data.csv"
data = pd.read_csv(data_path)

print(data.head())
print("\nClass Distribution:\n", data["Class"].value_counts())

# -------------------- Feature & Target Split --------------------
X = data.drop("Class", axis=1)
y = data["Class"]

# -------------------- Train-Test Split --------------------
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# -------------------- Sampling Techniques --------------------
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN, SMOTETomek

sampling_methods = {
    "ROS": RandomOverSampler(random_state=42),
    "SMOTE": SMOTE(random_state=42),
    "RUS": RandomUnderSampler(random_state=42),
    "SMOTE_ENN": SMOTEENN(random_state=42),
    "SMOTE_TOMEK": SMOTETomek(random_state=42)
}

# -------------------- Models --------------------
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

models = {
    "LogReg": LogisticRegression(max_iter=1000),
    "DecisionTree": DecisionTreeClassifier(),
    "RandomForest": RandomForestClassifier(),
    "NaiveBayes": GaussianNB(),
    "SVM": SVC()
}

# -------------------- Evaluation --------------------
from sklearn.metrics import accuracy_score

results = {}

for s_name, sampler in sampling_methods.items():
    X_res, y_res = sampler.fit_resample(X_train, y_train)
    
    for m_name, model in models.items():
        model.fit(X_res, y_res)
        y_pred = model.predict(X_test)
        results[(s_name, m_name)] = accuracy_score(y_test, y_pred)

# -------------------- Results Table --------------------
results_df = pd.DataFrame.from_dict(results, orient="index", columns=["Accuracy"])
results_df = results_df.reset_index()
results_df[["Sampling", "Model"]] = pd.DataFrame(results_df["index"].tolist())
results_df = results_df.drop(columns=["index"])

print("\nFinal Accuracy Comparison:")
print(results_df.round(3))


   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Final Accuracy Comparison:
    Accuracy     Sampling         Model
0      0.918          ROS        LogReg
1      0.987          ROS  DecisionTree
2      0.991          ROS  RandomForest
3      0.953          ROS    NaiveBayes
4      0.875          ROS           SVM
5      0.935        SMOTE        LogReg
6      0.983        SMOTE  DecisionTree
7      0.991        SMOTE  RandomForest
8      0.953        SMOTE    NaiveBayes
9      0.435        SMOTE           SVM
10     0.578          RUS        LogReg
11     0.388          RUS  DecisionTree
12     0.698          RUS  RandomForest
13     0.621          RUS    NaiveBayes
14     0.746          RUS           SVM
15     0.935    SMOTE_ENN        LogReg
16     0.858    SMOTE_ENN  DecisionTree
17     0.987    SMOTE_ENN  RandomForest
18     0.944    SMOTE_ENN    NaiveBayes
19     0.362    SMOTE_ENN           SVM
20     0.927  SMOTE_TOMEK        LogReg
21     0.983  SMOTE_TOMEK  DecisionTree
22     0.991  SMOTE_TOMEK  RandomForest
23     0.953