In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours


In [2]:
data = pd.read_csv("/content/Creditcard_data.csv")
print("Dataset loaded successfully.")

Dataset loaded successfully.


In [3]:
print(data['Class'].value_counts())


Class
0    763
1      9
Name: count, dtype: int64


In [4]:
def balance_dataset(method, X, y):
    if method == 'oversampling':
        smote = SMOTE()
        X_bal, y_bal = smote.fit_resample(X, y)
    elif method == 'undersampling':
        undersample = RandomUnderSampler()
        X_bal, y_bal = undersample.fit_resample(X, y)
    elif method == 'smote_tomek':
        smote_tomek = SMOTETomek()
        X_bal, y_bal = smote_tomek.fit_resample(X, y)
    elif method == 'tomek_links':
        tomek = TomekLinks()
        X_bal, y_bal = tomek.fit_resample(X, y)
    elif method == 'enn':
        enn = EditedNearestNeighbours()
        X_bal, y_bal = enn.fit_resample(X, y)
    else:
        raise ValueError("Invalid balancing method")

    return X_bal, y_bal


In [5]:
X = data.drop(columns=['Class'])
y = data['Class']

In [6]:
methods = ['oversampling', 'undersampling', 'smote_tomek', 'tomek_links', 'enn']
samples = []


In [7]:
for method in methods:
    X_bal, y_bal = balance_dataset(method, X, y)
    samples.append((X_bal, y_bal))

print("Data balancing completed.")


Data balancing completed.


In [9]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'Gradient Boosting': GradientBoostingClassifier()
}

results = []

In [10]:
for i, (X_sample, y_sample) in enumerate(samples):
    X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3, random_state=42)

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        results.append({
            'Sampling Technique': methods[i],
            'Model': model_name,
            'Accuracy': accuracy
        })

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [11]:
results_df = pd.DataFrame(results)
print("\nResults:\n")
print(results_df)


Results:

   Sampling Technique                Model  Accuracy
0        oversampling  Logistic Regression  0.925764
1        oversampling        Decision Tree  0.967249
2        oversampling        Random Forest  0.989083
3        oversampling                  SVM  0.707424
4        oversampling    Gradient Boosting  0.982533
5       undersampling  Logistic Regression  0.333333
6       undersampling        Decision Tree  0.166667
7       undersampling        Random Forest  0.166667
8       undersampling                  SVM  0.166667
9       undersampling    Gradient Boosting  0.166667
10        smote_tomek  Logistic Regression  0.889640
11        smote_tomek        Decision Tree  0.975225
12        smote_tomek        Random Forest  0.997748
13        smote_tomek                  SVM  0.650901
14        smote_tomek    Gradient Boosting  0.986486
15        tomek_links  Logistic Regression  0.986957
16        tomek_links        Decision Tree  0.986957
17        tomek_links        Random

In [13]:
results_df.to_csv("sampling_results.csv", index=False)
print("Results saved to 'sampling_results.csv'.")

Results saved to 'sampling_results.csv'.
