In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, StratifiedKFold
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
import numpy as np

In [None]:
# Read the credit-card dataset into a DataFrame.
# NOTE(review): the path is absolute (looks like a Colab upload location) —
# adjust it when running elsewhere.
csv_path = "/content/Creditcard_data.csv"
data = pd.read_csv(csv_path)

In [None]:
# Separate the label column from the predictor columns.
target_column = 'Class'
y = data[target_column]
X = data.drop(columns=[target_column])

In [None]:
# Balance the two classes by randomly duplicating minority-class rows.
# NOTE(review): resampling is done before any train/test split, so copies of
# the same minority row can land on both sides of the later splits and inflate
# the reported accuracy — consider resampling only the training folds.
oversampler = RandomOverSampler(random_state=42)
X_balanced, y_balanced = oversampler.fit_resample(X, y)

# Print the per-class counts so the cell actually produces the summary that is
# recorded as its output.
print("Number of instances after balancing:")
print(y_balanced.value_counts())


Number of instances after balancing:
0    763
1    763
Name: Class, dtype: int64


In [None]:
# Shared experiment size: used as the number of splits for both splitters, the
# number of KMeans clusters, the number of repeats for simple/systematic
# sampling, and the step between rows picked by systematic sampling.
num_samples = 5

In [None]:
# Classifiers to compare, keyed by the label used for the rows of the results
# table. Every estimator that takes a seed receives the same one.
classifier_seed = 42
models = dict([
    ('Logistic Regression', LogisticRegression(random_state=classifier_seed)),
    ('Random Forest', RandomForestClassifier(random_state=classifier_seed)),
    ('SVM', SVC(random_state=classifier_seed)),
    ('k-NN', KNeighborsClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=classifier_seed)),
])

In [None]:
# Build the splitter / clustering objects first, then register them under the
# column labels of the results table. Key order is kept as-is because the
# evaluation loop iterates the techniques in insertion order.
stratified_splitter = StratifiedShuffleSplit(n_splits=num_samples, test_size=0.2, random_state=42)

# Clusters are computed once, on the balanced feature matrix.
cluster_model = KMeans(n_clusters=num_samples, random_state=42)
cluster_model.fit(X_balanced)

kfold_splitter = StratifiedKFold(n_splits=num_samples, shuffle=True, random_state=42)

sampling_techniques = {}
sampling_techniques['Stratified Sampling'] = stratified_splitter
sampling_techniques['Simple Sampling'] = None  # Will be handled separately
sampling_techniques['Cluster Sampling'] = cluster_model
sampling_techniques['Systematic Sampling'] = None  # Will be handled separately
sampling_techniques['Cross-Validation Sampling'] = kfold_splitter



In [None]:
def _fit_and_score(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training fold and return its accuracy on the test fold."""
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))


def _iter_folds(sampling_name, sampling, X, y, n_repeats, test_size=0.2, seed=42):
    """Yield ``(X_train, X_test, y_train, y_test)`` folds for one sampling technique.

    Parameters
    ----------
    sampling_name : str
        Label of the technique; the branch is chosen by substring match
        ('Stratified' / 'Cross-Validation', 'Simple', 'Systematic', otherwise
        cluster sampling).
    sampling : object or None
        A splitter exposing ``split(X, y)`` for the stratified / CV techniques,
        a fitted clustering model exposing ``labels_`` for cluster sampling,
        and unused for the other techniques.
    X, y : pandas objects
        Features and labels, indexed positionally with ``.iloc``.
    n_repeats : int
        Number of repeats for simple / systematic sampling; also the step
        between rows picked by systematic sampling.
    test_size : float
        Hold-out fraction passed to ``train_test_split``.
    seed : int
        Base random seed for ``train_test_split``.
    """
    if 'Stratified' in sampling_name or 'Cross-Validation' in sampling_name:
        # StratifiedShuffleSplit / StratifiedKFold provide positional indices.
        for train_index, test_index in sampling.split(X, y):
            yield X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
    elif 'Simple' in sampling_name:
        # A different seed per repeat: a fixed seed would reproduce the very
        # same split every time and the "average" would be a single number.
        # The first repeat keeps the original seed.
        for repeat in range(n_repeats):
            yield train_test_split(X, y, test_size=test_size, random_state=seed + repeat)
    elif 'Systematic' in sampling_name:
        # Every n_repeats-th row, with a different starting offset per repeat
        # so the repeats are distinct samples. Each sample gets its own
        # hold-out instead of reusing a test set left over from another
        # technique.
        for offset in range(n_repeats):
            systematic_indices = np.arange(offset, len(X), n_repeats)
            if len(systematic_indices) < 2:
                continue  # too small to split
            yield train_test_split(
                X.iloc[systematic_indices], y.iloc[systematic_indices],
                test_size=test_size, random_state=seed,
            )
    else:
        # Cluster sampling: each cluster is treated as its own sample.
        labels = sampling.labels_
        for label in np.unique(labels):
            cluster_indices = np.where(labels == label)[0]
            # Check if the cluster size is sufficient for splitting
            if len(cluster_indices) < 2:
                continue
            yield train_test_split(
                X.iloc[cluster_indices], y.iloc[cluster_indices],
                test_size=test_size, random_state=seed,
            )


# Average accuracy of every model (rows) under every sampling technique (columns).
results = pd.DataFrame(index=models.keys(), columns=sampling_techniques.keys())
for model_name, model in models.items():
    for sampling_name, sampling in sampling_techniques.items():
        accuracies = []
        folds = _iter_folds(sampling_name, sampling, X_balanced, y_balanced, num_samples)
        for X_train, X_test, y_train, y_test in folds:
            # A training fold with a single class cannot be fitted by several
            # of the classifiers; skip it for all models so they are compared
            # on the same folds.
            if y_train.nunique() < 2:
                continue
            accuracies.append(_fit_and_score(model, X_train, X_test, y_train, y_test))

        # np.mean([]) warns and returns NaN; make the "no usable fold" case explicit.
        avg_accuracy = np.mean(accuracies) if accuracies else np.nan
        results.at[model_name, sampling_name] = avg_accuracy

In [None]:
from tabulate import tabulate

In [None]:
# Render the model-by-technique accuracy grid as a text table under a heading.
results_table = tabulate(results, headers='keys', tablefmt='pretty')
print("\nResults (Average Accuracy):", results_table, sep="\n")


Results (Average Accuracy):
+---------------------+---------------------+--------------------+--------------------+---------------------+---------------------------+
|                     | Stratified Sampling |  Simple Sampling   |  Cluster Sampling  | Systematic Sampling | Cross-Validation Sampling |
+---------------------+---------------------+--------------------+--------------------+---------------------+---------------------------+
| Logistic Regression | 0.9248366013071895  | 0.9183006535947712 | 0.9811704384724187 | 0.9306930693069309  |    0.9206835958427086     |
|    Random Forest    |         1.0         |        1.0         |        1.0         |         1.0         |            1.0            |
|         SVM         | 0.7098039215686275  | 0.6699346405228758 | 0.8172921150810732 |  0.801980198019802  |    0.7169013179042109     |
|        k-NN         | 0.9843137254901961  | 0.9869281045751634 | 0.9685961810466761 | 0.9702970297029703  |    0.9809964641594343     |
|  Gr