Import lib & load data

In [9]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

In [10]:
# Load the data
df = pd.read_csv('Creditcard_data.csv')

print(df.shape)
print(df.columns)
print(df.head())

(772, 31)
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')
   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170  

In [11]:
# Inspect the initial class distribution
print("Original Class Distribution:")
print(df['Class'].value_counts())

Original Class Distribution:
Class
0    763
1      9
Name: count, dtype: int64


Balance the Dataset (SMOTE)

In [6]:
# Separate features and target
X = df.drop('Class', axis=1)
y = df['Class']

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Create a new balanced DataFrame
balanced_df = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), 
                         pd.DataFrame(y_resampled, columns=['Class'])], axis=1)

print("\nBalanced Class Distribution:")
print(balanced_df['Class'].value_counts())


Balanced Class Distribution:
Class
0    763
1    763
Name: count, dtype: int64


Define Sampling Techniques

In [13]:
# Calculate Sample Size using Cochran's Formula
# Z = 1.96 (95% confidence), p = 0.5, e = 0.05
n_0 = (1.96**2 * 0.5 * 0.5) / (0.05**2)
N = len(balanced_df)
sample_size = math.ceil(n_0 / (1 + (n_0 - 1) / N))
print(f"Calculated Sample Size: {sample_size}")

# 1. Simple Random Sampling
def simple_random_sampling(df, n):
    return df.sample(n=n, random_state=42)

# 2. Stratified Sampling
def stratified_sampling(df, n):
    # Since data is balanced (50/50), we take n/2 from Class 0 and n/2 from Class 1
    class_0 = df[df['Class'] == 0].sample(n=n//2, random_state=42)
    class_1 = df[df['Class'] == 1].sample(n=n//2, random_state=42)
    return pd.concat([class_0, class_1])

# 3. Systematic Sampling
def systematic_sampling(df, n):
    k = len(df) // n
    start = np.random.randint(0, k)
    indices = np.arange(start, len(df), k)[:n]
    return df.iloc[indices]

# 4. Cluster Sampling
def cluster_sampling(df, n):
    # Create 20 synthetic clusters
    num_clusters = 20
    df_copy = df.copy()
    df_copy['cluster'] = np.random.randint(0, num_clusters, size=len(df))
    
    # Pick random clusters
    selected_clusters = []
    current_size = 0
    cluster_ids = list(range(num_clusters))
    np.random.shuffle(cluster_ids)
    
    for c_id in cluster_ids:
        cluster_data = df_copy[df_copy['cluster'] == c_id]
        selected_clusters.append(cluster_data)
        current_size += len(cluster_data)
        if current_size >= n:
            break
            
    sample = pd.concat(selected_clusters)
    return sample.drop('cluster', axis=1).sample(n=n, random_state=42)

# 5. Bootstrap Sampling
def bootstrap_sampling(df, n):
    return df.sample(n=n, replace=True, random_state=42)

# Dictionary of functions
sampling_techniques = {
    "Simple Random": simple_random_sampling,
    "Stratified": stratified_sampling,
    "Systematic": systematic_sampling,
    "Cluster": cluster_sampling,
    "Bootstrap": bootstrap_sampling
}

Calculated Sample Size: 308


Train and Evaluate Models

In [14]:
# define 5 models
models = {
    "Logistic Regression": LogisticRegression(max_iter=2000), 
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(probability=True),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

results = {}

print("Starting Model Evaluation...")

for technique_name, sampler in sampling_techniques.items():
    print(f"Applying {technique_name}...")
    
    # Get Sample
    sample_df = sampler(balanced_df, sample_size)
    
    X_sample = sample_df.drop('Class', axis=1)
    y_sample = sample_df['Class']
    
    # Split Data
    X_train, X_test, y_train, y_test = train_test_split(
        X_sample, y_sample, test_size=0.2, random_state=42
    )
    
    # Scale data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    technique_results = {}
    
    for model_name, model in models.items():
        # Train
        model.fit(X_train_scaled, y_train)
        
        # Predict
        y_pred = model.predict(X_test_scaled)
        acc = accuracy_score(y_test, y_pred)
        
        technique_results[model_name] = round(acc * 100, 2)
    
    results[technique_name] = technique_results

print("\nEvaluation Complete.")

Starting Model Evaluation...
Applying Simple Random...
Applying Stratified...
Applying Systematic...
Applying Cluster...
Applying Bootstrap...

Evaluation Complete.


Final Results

In [18]:
# Create Pivot Table 
results_df = pd.DataFrame(results).T
print("\nFinal Accuracy Score (%)")
display(results_df)

# Save to CSV
results_df.to_csv("sampling_results.csv")


Final Accuracy Score (%)


Unnamed: 0,Logistic Regression,Decision Tree,Random Forest,SVM,Gradient Boosting
Simple Random,91.94,90.32,95.16,90.32,93.55
Stratified,91.94,93.55,98.39,96.77,96.77
Systematic,95.16,93.55,100.0,100.0,95.16
Cluster,90.32,95.16,98.39,91.94,96.77
Bootstrap,95.16,100.0,100.0,100.0,100.0


In [19]:
# Identify the best sampling technique
avg_performance = results_df.mean(axis=1)
print("\nBest Sampling Technique based on Average Accuracy:")
print(avg_performance.idxmax())


Best Sampling Technique based on Average Accuracy:
Bootstrap
