In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE

df = pd.read_csv(r"C:\Users\VEDANT SAHA\OneDrive\Desktop\Kaggle\Data\Creditcard_data.csv")

#balance the dataset
X = df.drop(columns=["Class"])
y = df["Class"]

# oversampling
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

#defining sampling methods
def simple_random_sampling(X, y, size=0.2):
    """Simple Random Sampling."""
    while True:
        X_sample, _, y_sample, _ = train_test_split(X, y, train_size=size, random_state=42)
        if len(np.unique(y_sample)) > 1:
            return X_sample, y_sample

def systematic_sampling(X, y, size=0.2):
    """Systematic Sampling."""
    n = len(X)
    step = max(1, int(1 / size))  # Ensure step is valid
    indices = np.arange(0, n, step)[:int(size * n)]
    X_sample = X.iloc[indices]
    y_sample = y.iloc[indices]
    if len(np.unique(y_sample)) > 1:
        return X_sample, y_sample
    else:
        raise ValueError("Systematic sampling resulted in a single-class sample. Adjust parameters.")

def stratified_sampling(X, y, size=0.2):
    """Stratified Sampling."""
    X_sample, _, y_sample, _ = train_test_split(X, y, train_size=size, stratify=y, random_state=42)
    return X_sample, y_sample

def cluster_sampling(X, y, n_clusters=5):
    """Cluster Sampling."""
    while True:
        clusters = pd.qcut(X.index, n_clusters, labels=False)  
        chosen_cluster = np.random.choice(range(n_clusters))
        indices = X.index[clusters == chosen_cluster]
        X_sample = X.loc[indices]
        y_sample = y.loc[indices]
        if len(np.unique(y_sample)) > 1:
            return X_sample, y_sample

def cross_validation_sampling(X, y, folds=5):
    """Cross-Validation (returns a generator)."""
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    return skf.split(X, y)

#sampling methods taught in class
sampling_methods = {
    "Simple Random": simple_random_sampling,
    "Systematic": systematic_sampling,
    "Stratified": stratified_sampling,
    "Cluster": cluster_sampling,
    "Cross-Validation": cross_validation_sampling,
}

#5 different models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(random_state=42),
    "KNN": KNeighborsClassifier(),
}

#Evaluate Models with Sampling Techniques
def evaluate_models_with_sampling(X, y, sampling_methods, models):
    results = {}

    for method_name, sampler in sampling_methods.items():
        results[method_name] = {}

        if method_name == "Cross-Validation":
            # Handle cross-validation separately
            for model_name, model in models.items():
                accuracies = []
                skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
                for train_idx, test_idx in skf.split(X, y):
                    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
                    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
                    
                    model.fit(X_train, y_train)
                    y_pred = model.predict(X_test)
                    accuracies.append(accuracy_score(y_test, y_pred))
                
                results[method_name][model_name] = np.mean(accuracies)
        else:
            # Handle other sampling methods
            X_sample, y_sample = sampler(X, y)
            for model_name, model in models.items():
                # Split sampled data into train-test sets
                X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3, random_state=42)
                
                # Train and evaluate model
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                accuracy = accuracy_score(y_test, y_pred)
                
                results[method_name][model_name] = accuracy
                
    return results

results = evaluate_models_with_sampling(X_balanced, y_balanced, sampling_methods, models)

#Identify the Best Sampling Method for Each Model
final_results = {}
for method, method_results in results.items():
    for model, accuracy in method_results.items():
        if model not in final_results or final_results[model]['accuracy'] < accuracy:
            final_results[model] = {'sampling_method': method, 'accuracy': accuracy}

# Display Results
print("\nBest Sampling Technique for Each Model:")
for model, result in final_results.items():
    print(f"{model}: {result['sampling_method']} with Accuracy = {result['accuracy']:.2f}")

# Step 7: Save Results to CSV
df_results = pd.DataFrame(results)
df_results.to_csv("SamplingResults.csv", index=True)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Best Sampling Technique for Each Model:
Logistic Regression: Cluster with Accuracy = 1.00
Decision Tree: Cluster with Accuracy = 0.99
Random Forest: Cluster with Accuracy = 1.00
SVM: Cluster with Accuracy = 1.00
KNN: Cluster with Accuracy = 1.00
