In [127]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.cluster import KMeans
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
import warnings

warnings.filterwarnings('ignore')


In [128]:
# Load the dataset
d = pd.read_csv('/Users/mac/Documents/Sampling/Creditcard_data.csv')

# Separate features and target
X = d.drop(columns=["Class"])
y = d["Class"]

# Apply SMOTE + Tomek Links for balancing
smote_tomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = smote_tomek.fit_resample(X, y)

# Standardize the features
scaler = StandardScaler()
X_resampled = scaler.fit_transform(X_resampled)


In [129]:
# Sampling techniques
def simple_random_sampling(X, y, size):
    data = pd.DataFrame(X)
    data['Class'] = y
    return data.sample(n=size, random_state=42)

def stratified_sampling(X, y, size):
    data = pd.DataFrame(X)
    data['Class'] = y
    return resample(data, n_samples=size, stratify=y, random_state=42)

def bootstrap_sampling(X, y, size):
    data = pd.DataFrame(X)
    data['Class'] = y
    return data.sample(n=size, replace=True, random_state=42)

def systematic_sampling(X, y, size):
    data = pd.DataFrame(X)
    data['Class'] = y
    step = len(data) // size
    return data.iloc[::step]

def cluster_sampling(X, y, size):
    data = pd.DataFrame(X)
    data['Class'] = y
    kmeans = KMeans(n_clusters=5, random_state=42)
    data['Cluster'] = kmeans.fit_predict(X)
    cluster_samples = []
    for cluster_label in data['Cluster'].unique():
        cluster_data = data[data['Cluster'] == cluster_label]
        sample_count = min(len(cluster_data), size // 5)
        cluster_samples.append(cluster_data.sample(n=sample_count, random_state=42))
    sampled_data = pd.concat(cluster_samples).drop(columns=['Cluster'])
    return sampled_data


In [133]:
# Define models with tuned parameters
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, random_state=42),
    "Support Vector Machine": SVC(kernel='rbf', C=10, probability=True, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42),
}


sampling_techniques = {
    "Simple Random Sampling": simple_random_sampling,
    "Stratified Sampling": stratified_sampling,
    "Bootstrap Sampling": bootstrap_sampling,
    "Systematic Sampling": systematic_sampling,
    "Cluster Sampling": cluster_sampling,
}


In [138]:
# Determine sample size
Z = 1.96  # Z-value for 95% confidence
p = 0.5   # Estimated proportion
E = 0.05  # Margin of error
N = len(X_resampled)

sample_size = int((Z**2 * p * (1 - p) * N) / (E**2 * (N - 1) + Z**2 * p * (1 - p)))
print(f"Calculated Sample Size: {sample_size}")

# Create samples
samples = {}
for technique_name, sampling_func in sampling_techniques.items():
    samples[technique_name] = sampling_func(X_resampled, y_resampled, sample_size)

# Train and evaluate models
accuracy_matrix = {}
for sample_name, sample_data in samples.items():
    X_smpl = sample_data.drop(columns=["Class"])
    y_smpl = sample_data["Class"]

    X_train, X_test, y_train, y_test = train_test_split(X_smpl, y_smpl, test_size=0.3, random_state=42)

    accuracy_matrix[sample_name] = {}
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_matrix[sample_name][model_name] = accuracy

# Display results
accuracy_df = pd.DataFrame(accuracy_matrix).T
print("Accuracy Matrix:")
print(accuracy_df)

# Find the best sampling technique and model
best_technique = accuracy_df.max(axis=1).idxmax()
best_model = accuracy_df.loc[best_technique].idxmax()
best_score = accuracy_df.loc[best_technique, best_model]


# Find best sampling technique for each model
best_combo = accuracy_df.idxmax()
print("\nBest Sampling Technique for Each Model:")
print(best_combo)


Calculated Sample Size: 305
Accuracy Matrix:
                        Logistic Regression  Decision Tree  \
Simple Random Sampling             0.815217       0.956522   
Stratified Sampling                0.923913       0.934783   
Bootstrap Sampling                 0.913043       0.956522   
Systematic Sampling                0.848214       0.973214   
Cluster Sampling                   0.935897       0.935897   

                        Support Vector Machine  K-Nearest Neighbors  \
Simple Random Sampling                0.956522             0.869565   
Stratified Sampling                   0.934783             0.804348   
Bootstrap Sampling                    0.956522             0.869565   
Systematic Sampling                   0.982143             0.910714   
Cluster Sampling                      0.961538             0.884615   

                        Random Forest  
Simple Random Sampling       0.989130  
Stratified Sampling          0.967391  
Bootstrap Sampling           0.9891

In [137]:
print(f"\nBest Sampling Technique: {best_technique}")
print(f"Best Model: {best_model}")

print(f"Best Accuracy: {best_score:.2f}")


accuracy_df.to_csv("sampling_results.csv")
     


Best Sampling Technique: Systematic Sampling
Best Model: Random Forest
Best Accuracy: 0.99
