## Sampling Assignment
### **Objective:** Analyze how different sampling techniques affect the performance of various ML models on a balanced credit card dataset.


In [1]:
# Install necessary libraries if not already installed
!pip install pandas numpy scikit-learn imbalanced-learn



In [2]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [3]:
# Set random seed for reproducibility
np.random.seed(42)
# 1. Load the Dataset
url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
try:
    data = pd.read_csv(url)
    print("Dataset loaded successfully.")
except:
    print("Failed to load URL. Please download the CSV and load it locally.")

print("Original Class Distribution:")
print(data['Class'].value_counts())

Dataset loaded successfully.
Original Class Distribution:
Class
0    763
1      9
Name: count, dtype: int64


In [4]:
# ## 2. Balance the Dataset
# Converting the imbalanced dataset into a balanced one using SMOTE (Synthetic Minority Over-sampling Technique).
# Separating features and target
X = data.drop('Class', axis=1)
y = data['Class']

smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

# Create a new balanced dataframe
balanced_df = pd.concat([pd.DataFrame(X_balanced, columns=X.columns), pd.Series(y_balanced, name='Class')], axis=1)

print("\nBalanced Class Distribution:")
print(balanced_df['Class'].value_counts())


Balanced Class Distribution:
Class
0    763
1    763
Name: count, dtype: int64


### 3. Define Sampling Techniques
We will create 5 distinct samples using different formulas/methods

###**Techniques used:**
1. Simple Random Sampling
2. Stratified Sampling
3. Systematic Sampling
4. Cluster Sampling
5. Bootstrap Sampling


In [5]:
# Sample Size Calculation (using Cochran's Formula for representative size as a baseline)
# n = (Z^2 * p * (1-p)) / E^2
# Assuming 95% confidence level (Z=1.96), p=0.5, E=0.05
Z = 1.96
p = 0.5
E = 0.05
sample_size_cochran = math.ceil((Z**2 * p * (1-p)) / E**2)
print(f"Calculated Sample Size (Cochran's Formula): {sample_size_cochran}")

# --- SAMPLING FUNCTIONS ---

def simple_random_sampling(df, n):
    return df.sample(n=n, random_state=42)

def stratified_sampling(df, n):
    # Stratified by Class to ensure equal representation in the sample
    return df.groupby('Class', group_keys=False).apply(lambda x: x.sample(int(n/2), random_state=42))

def systematic_sampling(df, n):
    k = len(df) // n
    start = np.random.randint(0, k)
    return df.iloc[start::k].iloc[:n]

def cluster_sampling(df, n_clusters=5):
    # Creating pseudo-clusters for demonstration since no natural clusters exist in anonymized data
    df_copy = df.copy()
    df_copy['cluster'] = np.random.randint(0, n_clusters, size=len(df))
    # Select random clusters
    selected_clusters = np.random.choice(range(n_clusters), size=2, replace=False)
    sample = df_copy[df_copy['cluster'].isin(selected_clusters)]
    return sample.drop('cluster', axis=1)

def bootstrap_sampling(df, n):
    # Sampling with replacement
    return df.sample(n=n, replace=True, random_state=42)

# Create the 5 samples
# We generally use the calculated sample size, but for Cluster we take whole clusters.
samples = {
    "Sampling1 (Simple Random)": simple_random_sampling(balanced_df, sample_size_cochran),
    "Sampling2 (Stratified)": stratified_sampling(balanced_df, sample_size_cochran),
    "Sampling3 (Systematic)": systematic_sampling(balanced_df, sample_size_cochran),
    "Sampling4 (Cluster)": cluster_sampling(balanced_df),
    "Sampling5 (Bootstrap)": bootstrap_sampling(balanced_df, sample_size_cochran)
}

Calculated Sample Size (Cochran's Formula): 385


  return df.groupby('Class', group_keys=False).apply(lambda x: x.sample(int(n/2), random_state=42))


In [6]:
# 4. Define Models (M1 to M5)
models = {
    "M1 (Logistic Regression)": LogisticRegression(max_iter=1000),
    "M2 (Decision Tree)": DecisionTreeClassifier(random_state=42),
    "M3 (Random Forest)": RandomForestClassifier(random_state=42),
    "M4 (SVM)": SVC(),
    "M5 (Naive Bayes)": GaussianNB()
}

# 5. Train and Evaluate
# Applying every model on every sampling technique and recording accuracy[cite: 19, 20].
results = {}

for sample_name, sample_data in samples.items():
    # Split features and target for the current sample
    X_sample = sample_data.drop('Class', axis=1)
    y_sample = sample_data['Class']

    # Split into train and test sets (80-20 split)
    X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

    # Scale the data (important for Logistic Regression and SVM)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    sample_results = {}

    for model_name, model in models.items():
        # Train model
        if "Logistic" in model_name or "SVM" in model_name:
            model.fit(X_train_scaled, y_train)
            preds = model.predict(X_test_scaled)
        else:
            model.fit(X_train, y_train)
            preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds)
        sample_results[model_name] = round(acc * 100, 2)
    results[sample_name] = sample_results

In [7]:
# 6. Results and Discussion
# Convert results to DataFrame for nice display
results_df = pd.DataFrame(results)
# Transpose to match the assignment table format (Rows=Models, Cols=Samplings)
final_table = results_df
print(final_table)
# Save to CSV (Optional)
final_table.to_csv("sampling_results.csv")

                          Sampling1 (Simple Random)  Sampling2 (Stratified)  \
M1 (Logistic Regression)                      88.31                   90.91   
M2 (Decision Tree)                            94.81                   96.10   
M3 (Random Forest)                            97.40                   98.70   
M4 (SVM)                                      90.91                   98.70   
M5 (Naive Bayes)                              88.31                   85.71   

                          Sampling3 (Systematic)  Sampling4 (Cluster)  \
M1 (Logistic Regression)                   93.51                95.12   
M2 (Decision Tree)                         94.81                94.31   
M3 (Random Forest)                         98.70                98.37   
M4 (SVM)                                   94.81                96.75   
M5 (Naive Bayes)                           77.92                70.73   

                          Sampling5 (Bootstrap)  
M1 (Logistic Regression)            