1. Load the Dataset

In [8]:
import pandas as pd
df = pd.read_csv("/Creditcard_data.csv")
df.head(), df.shape

(   Time        V1        V2        V3        V4        V5        V6        V7  \
 0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
 1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
 2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
 3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
 4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   
 
          V8        V9  ...       V21       V22       V23       V24       V25  \
 0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
 1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
 2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
 3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
 4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   
 
         V26      

2. Convert this data-set into balanced class data-set.

In [9]:
from imblearn.over_sampling import SMOTE
# Check Class Distribution Non-Fraud(0) and Fraud(1)
X = df.drop('Class', axis=1)
y = df['Class']

smote = SMOTE(random_state=42)
X_bal, y_bal = smote.fit_resample(X, y)

balanced_df = pd.concat([X_bal, y_bal], axis=1)
df['Class'].value_counts()
balanced_df['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,763
1,763


3. Create Five Samples

In [10]:
# Simple Random Sampling
sample_1 = balanced_df.sample(frac=0.20, random_state=1)
sample_1.shape

(305, 31)

In [11]:
# Systematic Sampling
k = len(balanced_df) // int(0.20 * len(balanced_df))
sample_2 = balanced_df.iloc[::k]
sample_2.shape

(306, 31)

In [12]:
# Stratified Sampling
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)

for _, idx in split.split(balanced_df, balanced_df['Class']):
    sample_3 = balanced_df.loc[idx]

sample_3['Class'].value_counts()


Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
1,153
0,153


In [13]:
# Cluster Sampling
balanced_df['cluster'] = balanced_df.index // 1000
selected_clusters = balanced_df['cluster'].sample(n=5, random_state=42)

sample_4 = balanced_df[balanced_df['cluster'].isin(selected_clusters)]
sample_4.shape

(1526, 32)

In [14]:
# multistage Sampling
clusters = balanced_df['cluster'].unique()
chosen_clusters = clusters[:5]

sample_5 = balanced_df[balanced_df['cluster'].isin(chosen_clusters)].sample(frac=0.30, random_state=42)
sample_5.shape

samples = [sample_1, sample_2, sample_3, sample_4, sample_5]

3. Apply five different sampling techniques (Sampling1, Sampling2, Sampling3, Sampling4,
Sampling5) on five different ML models (M1, M2, M3, M4 and M5)

In [15]:
# Import Required Libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from imblearn.over_sampling import SMOTE

In [16]:
# Models
models = {
    "M1": LogisticRegression(max_iter=1000),
    "M2": DecisionTreeClassifier(),
    "M3": RandomForestClassifier(),
    "M4": GaussianNB(),
    "M5": SVC()
}

In [18]:
# Train Models on Each Sampling Techniques
results = pd.DataFrame(
    index=models.keys(),
    columns=["Sampling1", "Sampling2", "Sampling3", "Sampling4", "Sampling5"]
)

for i, sample in enumerate(samples):
    X = sample.drop("Class", axis=1)
    y = sample["Class"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred) * 100
        results.loc[model_name, f"Sampling{i+1}"] = round(acc, 2)

# Find Accuracy tables
results

Unnamed: 0,Sampling1,Sampling2,Sampling3,Sampling4,Sampling5
M1,96.74,80.43,95.65,93.01,89.86
M2,91.3,88.04,89.13,97.38,92.75
M3,97.83,95.65,93.48,99.56,98.55
M4,81.52,80.43,80.43,87.12,84.78
M5,96.74,91.3,96.74,98.25,96.38


4. Determine which sampling technique gives higher accuracy on which model.

In [19]:
results = results.astype(float)
best_sampling = {}

for model in results.index:
    best_method = results.loc[model].idxmax()
    best_accuracy = results.loc[model].max()
    best_sampling[model] = (best_method, best_accuracy)

for model, (method, acc) in best_sampling.items():
    print(f"{model} gives highest accuracy with {method}: {acc}%")


M1 gives highest accuracy with Sampling1: 96.74%
M2 gives highest accuracy with Sampling4: 97.38%
M3 gives highest accuracy with Sampling4: 99.56%
M4 gives highest accuracy with Sampling4: 87.12%
M5 gives highest accuracy with Sampling4: 98.25%
