In [1]:
pip install pandas scikit-learn imbalanced-learn




In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler


In [3]:
data = pd.read_csv("Creditcard_data.csv")

print(data.head())
print(data['Class'].value_counts())


   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [4]:
X = data.drop("Class", axis=1)
y = data["Class"]


In [5]:
oversampler = RandomOverSampler(random_state=42)
X_balanced, y_balanced = oversampler.fit_resample(X, y)

print("Balanced class distribution:")
print(y_balanced.value_counts())


Balanced class distribution:
Class
0    763
1    763
Name: count, dtype: int64


In [6]:
models = {
    "M1": LogisticRegression(max_iter=1000),
    "M2": DecisionTreeClassifier(),
    "M3": RandomForestClassifier(),
    "M4": KNeighborsClassifier(),
    "M5": SVC()
}


In [7]:
results = pd.DataFrame(index=models.keys(),
                       columns=["SimpleRandom", "Stratified", "Bootstrap", "Oversampling", "Undersampling"])


In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced,
    test_size=0.3,
    random_state=42
)

for name, model in models.items():
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    acc = accuracy_score(y_test, pred) * 100
    results.loc[name, "SimpleRandom"] = round(acc, 2)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced,
    test_size=0.3,
    stratify=y_balanced,
    random_state=42
)

for name, model in models.items():
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    acc = accuracy_score(y_test, pred) * 100
    results.loc[name, "Stratified"] = round(acc, 2)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
boot_index = np.random.choice(
    len(X_balanced),
    size=len(X_balanced),
    replace=True
)

X_boot = X_balanced.iloc[boot_index]
y_boot = y_balanced.iloc[boot_index]

X_train, X_test, y_train, y_test = train_test_split(
    X_boot, y_boot,
    test_size=0.3,
    random_state=42
)

for name, model in models.items():
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    acc = accuracy_score(y_test, pred) * 100
    results.loc[name, "Bootstrap"] = round(acc, 2)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
over = RandomOverSampler(random_state=42)
X_over, y_over = over.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(
    X_over, y_over,
    test_size=0.3,
    random_state=42
)

for name, model in models.items():
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    acc = accuracy_score(y_test, pred) * 100
    results.loc[name, "Oversampling"] = round(acc, 2)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
under = RandomUnderSampler(random_state=42)
X_under, y_under = under.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(
    X_under, y_under,
    test_size=0.3,
    random_state=42
)

for name, model in models.items():
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    acc = accuracy_score(y_test, pred) * 100
    results.loc[name, "Undersampling"] = round(acc, 2)


In [13]:
print("\nFINAL ACCURACY COMPARISON TABLE:\n")
print(results)



FINAL ACCURACY COMPARISON TABLE:

   SimpleRandom Stratified Bootstrap Oversampling Undersampling
M1         91.7       91.7     92.58         91.7         66.67
M2        98.91      99.13     99.13        98.69         66.67
M3        99.56      100.0     100.0        100.0          50.0
M4        98.47      98.91     98.03        98.47         16.67
M5        68.56      75.11     69.87        68.56         16.67


In [14]:
print("\nBEST SAMPLING METHOD FOR EACH MODEL:\n")

for model in results.index:
    best_sampling = results.loc[model].astype(float).idxmax()
    best_accuracy = results.loc[model].astype(float).max()

    print(model, "->", best_sampling, ":", best_accuracy, "%")



BEST SAMPLING METHOD FOR EACH MODEL:

M1 -> Bootstrap : 92.58 %
M2 -> Stratified : 99.13 %
M3 -> Stratified : 100.0 %
M4 -> Stratified : 98.91 %
M5 -> Stratified : 75.11 %
