# Sampling Assignment

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

## 1.

In [2]:
data = pd.read_csv('/content/Creditcard_data.csv')
X = data.drop("Class",axis=1)
y = data['Class']

## 2.

In [3]:
under_sampling = RandomUnderSampler(random_state=42,replacement = True)
X_us,y_us = under_sampling.fit_resample(X,y)

print('Original dataset : ',Counter(y))
print('Resampled dataset : ',Counter(y_us))

Original dataset :  Counter({0: 763, 1: 9})
Resampled dataset :  Counter({0: 9, 1: 9})


In [6]:
over_sampling = RandomOverSampler(random_state=42)
X_balanced,y_balanced = over_sampling.fit_resample(X,y)

print('Original dataset shape', Counter(y))
print('Resample dataset shape', Counter(y_balanced))

Original dataset shape Counter({0: 763, 1: 9})
Resample dataset shape Counter({0: 763, 1: 763})


## 3.

In [7]:
samples = []
for i in range(5):
    X_train, _, y_train, _ = train_test_split(
        X_balanced, y_balanced,
        train_size=0.7,
        random_state=i
    )
    samples.append((X_train, y_train))

## 4.

In [8]:
sampling_methods = {
    "Sampling1": RandomOverSampler(random_state=1),
    "Sampling2": RandomUnderSampler(random_state=2),
    "Sampling3": RandomOverSampler(random_state=3),
    "Sampling4": RandomUnderSampler(random_state=3),
    "Sampling5": RandomOverSampler(random_state=5)
}

In [9]:
models = {
    "M1": LogisticRegression(max_iter=1000),
    "M2": DecisionTreeClassifier(),
    "M3": RandomForestClassifier(),
    "M4": GaussianNB(),
    "M5": SVC()
}

In [10]:
results = pd.DataFrame(
    index=models.keys(),
    columns=sampling_methods.keys()
)

for model_name, model in models.items():
    for sample_index, (X_s, y_s) in enumerate(samples):
        sampling_name = list(sampling_methods.keys())[sample_index]
        sampler = sampling_methods[sampling_name]

        X_resampled, y_resampled = sampler.fit_resample(X_s, y_s)

        X_train, X_test, y_train, y_test = train_test_split(
            X_resampled, y_resampled,
            test_size=0.3,
            random_state=42
        )

        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        acc = accuracy_score(y_test, predictions) * 100
        results.loc[model_name, sampling_name] = round(acc, 2)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [11]:
print("\nAccuracy Comparison Table\n")
print(results)


Accuracy Comparison Table

   Sampling1 Sampling2 Sampling3 Sampling4 Sampling5
M1     91.64     94.03     93.15     89.38     91.93
M2     98.45     99.06     99.07     98.44     98.76
M3     99.38     100.0     100.0     100.0     100.0
M4     81.11     80.82     79.44     76.88     68.63
M5     68.42      67.3     71.96     71.25     69.88


In [12]:
print("\nBest Sampling Technique for Each Model:\n")

best_sampling_summary = {}

for model in results.index:
    best_sampling = results.loc[model].astype(float).idxmax()
    best_accuracy = results.loc[model].astype(float).max()
    best_sampling_summary[model] = (best_sampling, best_accuracy)
    print(f"{model} -> {best_sampling} with Accuracy = {best_accuracy:.2f}%")


Best Sampling Technique for Each Model:

M1 -> Sampling2 with Accuracy = 94.03%
M2 -> Sampling3 with Accuracy = 99.07%
M3 -> Sampling2 with Accuracy = 100.00%
M4 -> Sampling1 with Accuracy = 81.11%
M5 -> Sampling3 with Accuracy = 71.96%
