In [None]:
pip install seaborn imbalanced-learn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Using SMOTE for balancing instead of RandomOverSampler
from imblearn.over_sampling import SMOTE

# 1. Load the dataset
# Creditcard_data.csv must be present in the working directory
# (e.g. uploaded to the Colab session) before this cell runs.
raw_data = pd.read_csv("Creditcard_data.csv")

# 2. Balance the dataset
# Split the frame into the label column and the predictor columns.
target = raw_data["Class"]
features = raw_data.drop(columns="Class")

# Oversample with SMOTE using a fixed seed.
smote_balancer = SMOTE(random_state=101)
X_resampled, y_resampled = smote_balancer.fit_resample(features, target)

# Re-attach the labels so that predictors and "Class" live in one frame.
balanced_dataset = pd.concat([X_resampled, y_resampled], axis="columns")

# 3. Define Sampling Techniques
def run_random_sampling(data, ratio=0.6, random_state=101):
    """Return a simple random sample of the rows of ``data``.

    Args:
        data: DataFrame to sample from; it is not modified.
        ratio: Fraction of rows to draw (passed to ``DataFrame.sample`` as
            ``frac``).
        random_state: Seed forwarded to ``DataFrame.sample``. The default
            (101) reproduces the previously hard-coded seed; pass ``None``
            for a different draw on every call.

    Returns:
        A new DataFrame holding the sampled rows with their original index
        labels.
    """
    return data.sample(frac=ratio, random_state=random_state)

def run_systematic_sampling(data, interval=3):
    """Pick every ``interval``-th row of ``data`` by position.

    The selection starts at the first row, so positions 0, ``interval``,
    ``2 * interval``, ... are kept.
    """
    every_kth_row = slice(None, None, interval)
    return data.iloc[every_kth_row]

def run_stratified_sampling(data, target_col="Class", ratio=0.6, random_state=101):
    """Sample the same fraction of rows from every class of ``target_col``.

    The groups are iterated explicitly instead of going through
    ``groupby(...).apply``: whether ``apply`` hands the grouping column to
    the callback depends on the pandas version, and callers need
    ``target_col`` to still be present in the result.

    Args:
        data: DataFrame to sample from; it is not modified.
        target_col: Column whose distinct values define the strata.
        ratio: Fraction of each stratum to draw.
        random_state: Seed used for every stratum. The default (101)
            reproduces the previously hard-coded seed.

    Returns:
        A new DataFrame with the sampled rows of each stratum concatenated
        in group order, all original columns included. An empty frame with
        the same columns is returned when there are no groups.
    """
    strata_samples = [
        stratum.sample(frac=ratio, random_state=random_state)
        for _, stratum in data.groupby(target_col)
    ]
    if not strata_samples:
        # pd.concat raises on an empty list, so handle "no groups" explicitly.
        return data.iloc[0:0]
    return pd.concat(strata_samples)

def run_cluster_sampling(data, col="Amount", n_bins=8, random_state=101):
    """Return the rows of one randomly chosen quantile cluster of ``col``.

    Rows are binned into up to ``n_bins`` quantile groups of ``col``
    (duplicate bin edges are dropped), one group is picked at random, and
    the rows belonging to it are returned.

    The cluster labels are kept in a separate Series rather than written
    into ``data``, so the caller's DataFrame is left untouched and no helper
    column can end up in samples built from the same frame later.

    Args:
        data: DataFrame to sample from; it is not modified.
        col: Numeric column used to form the quantile clusters.
        n_bins: Number of quantile bins requested from ``pd.qcut``.
        random_state: Seed for choosing the cluster. Pass ``None`` for a
            different cluster on every call.

    Returns:
        A new DataFrame with the rows of the chosen cluster and the same
        columns as ``data``.
    """
    if data.empty:
        return data.copy()

    cluster_ids = pd.qcut(data[col], q=n_bins, labels=False, duplicates="drop")

    # Rows with a missing value in `col` get a NaN label and belong to no cluster.
    candidates = cluster_ids.dropna().unique()
    if len(candidates) == 0:
        return data.iloc[0:0]

    rng = np.random.default_rng(random_state)
    chosen_cluster = rng.choice(candidates)
    return data[cluster_ids == chosen_cluster]

def run_bootstrap_sampling(data, random_state=101):
    """Return a bootstrap resample of ``data``.

    Draws ``len(data)`` rows with replacement, so the result has the same
    number of rows as the input and individual rows may appear more than
    once.

    Args:
        data: DataFrame to resample; it is not modified.
        random_state: Seed forwarded to ``DataFrame.sample``. The default
            (101) reproduces the previously hard-coded seed.

    Returns:
        A new DataFrame with ``len(data)`` rows drawn with replacement.
    """
    return data.sample(n=len(data), replace=True, random_state=random_state)

# Organize sampling methods
# Map each strategy label to the function that produces its sample.
sampling_functions = {
    "Sampling1_Random": run_random_sampling,
    "Sampling2_Systematic": run_systematic_sampling,
    "Sampling3_Stratified": run_stratified_sampling,
    "Sampling4_Cluster": run_cluster_sampling,
    "Sampling5_Bootstrap": run_bootstrap_sampling,
}

# Every sampler receives its own copy of the balanced frame, so a sampler
# that modifies its input in place cannot leak extra columns into the
# samples built after it (those columns would be treated as model features).
strategies = {
    label: sampler(balanced_dataset.copy())
    for label, sampler in sampling_functions.items()
}

# 4. Define Machine Learning Models
# Classifiers to compare, keyed by the label under which their scores are stored.
ml_models = dict(
    M1_LogReg=LogisticRegression(solver="liblinear", max_iter=2000),
    M2_KNN=KNeighborsClassifier(n_neighbors=3),
    M3_SVM=SVC(kernel="linear"),
    M4_DecTree=DecisionTreeClassifier(random_state=101),
    M5_RanForest=RandomForestClassifier(random_state=101, n_estimators=50),
)

# 5. Execute Evaluation
# final_scores maps strategy label -> {model label -> accuracy in percent}.
final_scores = {}

for strategy_name, sample_data in strategies.items():
    # Separate the label column from the predictors of this sample.
    y_sample = sample_data["Class"]
    X_sample = sample_data.drop(columns="Class")

    # Hold out 25% of the sample for testing, stratified on the label.
    X_train, X_test, y_train, y_test = train_test_split(
        X_sample,
        y_sample,
        test_size=0.25,
        random_state=101,
        stratify=y_sample,
    )

    strategy_scores = final_scores[strategy_name] = {}

    for model_name, clf in ml_models.items():
        # Standardize the features, then fit the classifier on the result.
        processing_pipeline = Pipeline(
            steps=[
                ("normalization", StandardScaler()),
                ("classifier", clf),
            ]
        )
        predictions = processing_pipeline.fit(X_train, y_train).predict(X_test)

        score = accuracy_score(y_test, predictions)
        strategy_scores[model_name] = round(score * 100, 2)

# 6. Display Results
# Transpose so that rows are sampling techniques and columns are models.
comparison_df = pd.DataFrame(final_scores).T
print("\n--- Accuracy Comparison Table (%) ---")
print(comparison_df)

# 7. Generate Result Graph
# Create the figure once and draw onto its axes. Calling plt.figure() and
# then DataFrame.plot(figsize=...) opens a second figure and leaves the
# first one behind as an empty plot.
fig, ax = plt.subplots(figsize=(12, 6))
comparison_df.plot(kind="bar", ax=ax)
ax.set_title("Model Performance Across Sampling Techniques")
ax.set_ylabel("Accuracy (%)")
ax.set_xlabel("Sampling Technique")
plt.xticks(rotation=45)
# Anchor the legend outside the axes so it does not cover the bars.
ax.legend(title="Models", bbox_to_anchor=(1.05, 1), loc="upper left")
fig.tight_layout()
fig.savefig("sampling_results_graph.png")  # Save for GitHub ReadMe
plt.show()

# New Section