In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
import math

url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
data = pd.read_csv(url)

print(data['Class'].value_counts())

Class
0    763
1      9
Name: count, dtype: int64


In [2]:
X = data.drop('Class', axis=1)
y = data['Class']

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

balanced_df = pd.concat([X_resampled, y_resampled], axis=1)

print(balanced_df['Class'].value_counts())

Class
0    763
1    763
Name: count, dtype: int64


In [5]:
N = len(balanced_df)
e = 0.05
n = int(N / (1 + N * (e**2)))

def simple_random_sampling(df, n):
    return df.sample(n=n, random_state=42)

def systematic_sampling(df, n):
    step = len(df) // n
    indices = np.arange(0, len(df), step)[:n]
    return df.iloc[indices]

def stratified_sampling(df, n):
    return df.groupby('Class', group_keys=False).apply(lambda x: x.sample(min(len(x), int(n/2)), random_state=42))

def cluster_sampling(df, n):
    num_clusters = 10
    df_copy = df.copy()
    df_copy['cluster_id'] = pd.cut(df['V1'], bins=num_clusters, labels=False)

    while True:
        selected_cluster = np.random.choice(num_clusters)
        sample = df_copy[df_copy['cluster_id'] == selected_cluster]
        if len(sample['Class'].unique()) > 1:
            return sample.drop('cluster_id', axis=1)

def bootstrap_sampling(df, n):
    return df.sample(n=n, replace=True, random_state=42)

samples = {
    "Simple Random": simple_random_sampling(balanced_df, n),
    "Systematic": systematic_sampling(balanced_df, n),
    "Stratified": stratified_sampling(balanced_df, n),
    "Cluster": cluster_sampling(balanced_df, n),
    "Bootstrap": bootstrap_sampling(balanced_df, n)
}

  return df.groupby('Class', group_keys=False).apply(lambda x: x.sample(min(len(x), int(n/2)), random_state=42))


In [6]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier()
}

results = {}

for sample_name, sample_df in samples.items():
    X_sample = sample_df.drop('Class', axis=1)
    y_sample = sample_df['Class']

    X_train, X_test, y_train, y_test = train_test_split(
        X_sample, y_sample, test_size=0.2, random_state=42, stratify=y_sample
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    results[sample_name] = {}

    for model_name, model in models.items():
        if model_name in ["SVM", "KNN", "Logistic Regression"]:
            model.fit(X_train_scaled, y_train)
            preds = model.predict(X_test_scaled)
        else:
            model.fit(X_train, y_train)
            preds = model.predict(X_test)

        accuracy = accuracy_score(y_test, preds)
        results[sample_name][model_name] = round(accuracy * 100, 2)

results_df = pd.DataFrame(results)
print(results_df)

print(results_df.idxmax(axis=1))

                     Simple Random  Systematic  Stratified  Cluster  Bootstrap
Logistic Regression          84.38       90.62       90.62   100.00      93.75
Decision Tree                90.62       96.88       96.88    97.56      96.88
Random Forest                98.44       98.44      100.00   100.00     100.00
SVM                          90.62       95.31       98.44   100.00     100.00
KNN                          90.62       87.50       92.19    92.68      96.88
Logistic Regression       Cluster
Decision Tree             Cluster
Random Forest          Stratified
SVM                       Cluster
KNN                     Bootstrap
dtype: object
