In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd

# Load the dataset
data = pd.read_csv("/content/drive/MyDrive/Creditcard_data.csv")
print(data.head())


   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [3]:
print(data['Class'].value_counts())

Class
0    763
1      9
Name: count, dtype: int64


In [4]:
from imblearn.over_sampling import SMOTE

X = data.drop('Class', axis=1)
y = data['Class']

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

balanced_data = pd.concat([pd.DataFrame(X_resampled), pd.DataFrame(y_resampled, columns=['Class'])], axis=1)
print(balanced_data['Class'].value_counts())


Class
0    763
1    763
Name: count, dtype: int64


In [5]:
sample1 = balanced_data.sample(frac=0.2, random_state=1)
sample2 = balanced_data.sample(frac=0.2, random_state=2)
sample3 = balanced_data.sample(frac=0.2, random_state=3)
sample4 = balanced_data.sample(frac=0.2, random_state=4)
sample5 = balanced_data.sample(frac=0.2, random_state=5)
sample1.to_csv("sample1.csv", index=False)
sample2.to_csv("sample2.csv", index=False)
sample3.to_csv("sample3.csv", index=False)
sample4.to_csv("sample4.csv", index=False)
sample5.to_csv("sample5.csv", index=False)

In [6]:
simple_random_sample = balanced_data.sample(n=1000, random_state=42)

In [7]:
from sklearn.model_selection import train_test_split

stratified_sample, _ = train_test_split(balanced_data, test_size=0.8, stratify=balanced_data['Class'], random_state=42)


In [8]:
interval = len(balanced_data) // 1000
systematic_sample = balanced_data.iloc[::interval]


In [14]:
import random

if 'Age' in balanced_data.columns:
    balanced_data['Age_cluster'] = balanced_data['Age'] // 10
else:
    balanced_data['Age_cluster'] = random.choices(range(1, 4), k=len(balanced_data))

unique_clusters = balanced_data['Age_cluster'].unique()
print(f"Unique clusters: {unique_clusters}")

k = min(3, len(unique_clusters))

selected_clusters = random.sample(list(unique_clusters), k=k)

cluster_sample = balanced_data[balanced_data['Age_cluster'].isin(selected_clusters)]


Unique clusters: [1 3 2]


In [16]:
convenience_sample = balanced_data.head(1000)

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

sampling_techniques = {
    "Sampling1": pd.read_csv("/content/sample1.csv"),
    "Sampling2": pd.read_csv("/content/sample2.csv"),
    "Sampling3": pd.read_csv("/content/sample3.csv"),
    "Sampling4": pd.read_csv("/content/sample4.csv"),
    "Sampling5": pd.read_csv("/content/sample5.csv"),
}

models = {
    "M1": LogisticRegression(max_iter=1000),
    "M2": DecisionTreeClassifier(),
    "M3": RandomForestClassifier(),
    "M4": SVC(),
    "M5": KNeighborsClassifier()
}

accuracy_matrix = pd.DataFrame(columns=["Sampling1", "Sampling2", "Sampling3", "Sampling4", "Sampling5"], index=models.keys())

for sampling_name, sampled_data in sampling_techniques.items():
    X = sampled_data.drop("Class", axis=1)
    y = sampled_data["Class"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_matrix.loc[model_name, sampling_name] = accuracy

print("Accuracy Matrix:")
print(accuracy_matrix)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy Matrix:
   Sampling1 Sampling2 Sampling3 Sampling4 Sampling5
M1  0.983607  0.852459  0.885246  0.934426  0.918033
M2  0.868852  0.967213   0.95082  0.967213   0.95082
M3  0.967213       1.0  0.983607  0.983607  0.983607
M4  0.737705  0.590164  0.721311  0.672131  0.688525
M5  0.737705  0.721311  0.754098  0.737705  0.672131


In [23]:
best_combinations = accuracy_matrix.idxmax()
print("Best Sampling Technique for Each Model:")
print(best_combinations)

Best Sampling Technique for Each Model:
Sampling1    M1
Sampling2    M3
Sampling3    M3
Sampling4    M3
Sampling5    M3
dtype: object
