In [6]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Loading the dataset
data = pd.read_csv("/content/Creditcard_data.csv")
print(data.head())
print(data.info())
print(data['Class'].value_counts())  # Check class imbalance

#Applying SMOTE for balancing the dataset
X = data.drop(columns='Class')
y = data['Class']

smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

print("Balanced Class Distribution:")
print(y_balanced.value_counts())


balanced_data = pd.concat([X_balanced, pd.Series(y_balanced, name='Class')], axis=1)

#Generating five different samples
sample1 = balanced_data.sample(n=200, random_state=1)  # Random sampling
sample2 = balanced_data.sample(n=200, random_state=2)  # Random sampling with a different seed
_, sample3 = train_test_split(balanced_data, test_size=0.25, stratify=balanced_data['Class'], random_state=3)  # Stratified sampling
sample4 = balanced_data.head(200)  # First 200 rows as a sample
sample5 = balanced_data.tail(200)  # Last 200 rows as a sample


samples = [sample1, sample2, sample3, sample4, sample5]
for i, sample in enumerate(samples, 1):
    sample.to_csv(f"sample{i}.csv", index=False)

# Defining models to use
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
    "KNN": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
}

# Applying models to each sample
for i, sample in enumerate(samples, 1):
    print(f"Results for Sample {i}")
    X = sample.drop(columns='Class')
    y = sample['Class']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    if len(y_train.unique()) < 2 or len(y_test.unique()) < 2:
        print(f"Skipping Sample {i}: Not enough class representation in train/test split.")
        continue

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"{model_name}: Accuracy = {accuracy:.2f}")
    print("-" * 55)


   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

Parameters: { "use_label_encoder" } are not used.



XGBoost: Accuracy = 0.93
KNN: Accuracy = 0.77
Logistic Regression: Accuracy = 0.87
----------------------------------------
Results for Sample 2
Random Forest: Accuracy = 0.95
Naive Bayes: Accuracy = 0.83


Parameters: { "use_label_encoder" } are not used.



XGBoost: Accuracy = 0.92
KNN: Accuracy = 0.73
Logistic Regression: Accuracy = 0.83
----------------------------------------
Results for Sample 3
Random Forest: Accuracy = 0.98
Naive Bayes: Accuracy = 0.86


Parameters: { "use_label_encoder" } are not used.



XGBoost: Accuracy = 0.96
KNN: Accuracy = 0.75
Logistic Regression: Accuracy = 0.90
----------------------------------------
Results for Sample 4
Random Forest: Accuracy = 0.98
Naive Bayes: Accuracy = 0.98
XGBoost: Accuracy = 0.98
KNN: Accuracy = 0.98
Logistic Regression: Accuracy = 0.98
----------------------------------------
Results for Sample 5
Skipping Sample 5: Not enough class representation in train/test split.


Parameters: { "use_label_encoder" } are not used.



In [5]:
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier  # Ensure that XGBoost is imported as well
