In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTENC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_auc_score, confusion_matrix

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.neighbors import (NeighborhoodComponentsAnalysis,KNeighborsClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, StackingClassifier
import xgboost as xgb

# Loading and Preprocessing the dataset

In [None]:
# Read the chargeback transactions CSV into a DataFrame and preview the first five rows
df = pd.read_csv(filepath_or_buffer='chargeback_dataset.csv')
df.head(n=5)

In [None]:
# Remove the '*' masking characters from card numbers.
# regex=False forces a literal match: a bare '*' is not a valid regular
# expression, and the default value of `regex` differs between pandas versions.
df['Card Number'] = df['Card Number'].str.replace('*', '', regex=False)

# Drop the serial numbers column (whatever column is currently first).
# NOTE: this is positional, so re-running this cell drops a further column;
# reload the CSV before running it again.
df.drop(df.columns[0], axis=1, inplace=True)

# Convert 'Yes' and 'No' class labels to 1 and 0 respectively
# (any other value becomes NaN)
df['CBK'] = df['CBK'].map({'Yes': 1, 'No': 0})

In [None]:
# Parse the 'Date' strings into pandas datetime values
df['Date'] = df['Date'].pipe(pd.to_datetime)

In [None]:
# Encode the time of each transaction as a numeric feature

# Earliest transaction time in the dataset, used as the reference point
mindate = df['Date'].min()

# New feature: seconds elapsed between the reference point and each transaction.
# The original 'Date' column is left in the frame.
df['Timestamp in Seconds'] = df['Date'].sub(mindate).dt.total_seconds()

In [None]:
# Summarise the preprocessed frame: column names, dtypes and non-null counts
df.info()

# Visualizations

In [None]:
# Share of chargeback vs. legitimate transactions
# Tally how many rows fall into each CBK class
cbk_counts = df['CBK'].value_counts()

# Draw the class shares as a pie chart on an 8x8 inch figure
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    cbk_counts,
    startangle=140,         # rotate the first wedge for a better layout
    autopct='%1.3f%%',      # percentages with 3 decimal places
    labels=cbk_counts.index,  # class values as wedge labels
)

# Legend listing the same class values, then the title
ax.legend(labels=cbk_counts.index, title="Categories", loc="upper right")
ax.set_title("CBK Distribution")

# Render the figure
plt.show()

In [None]:
# Transaction amount plotted against the chargeback label
ax = df.plot.scatter(x='Amount', y='CBK', s=32, alpha=0.8)
# Hide the top and right borders of the axes
ax.spines[['top', 'right']].set_visible(False)

# Data Split and Oversampling

In [None]:
# Feature matrix (card number, amount, elapsed time) and binary target
X = df.loc[:, ['Card Number', 'Amount', 'Timestamp in Seconds']]
y = df.loc[:, 'CBK']

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Apply SMOTE-NC to the training data only; the test split is left untouched.
# X has three columns (Card Number, Amount, Timestamp in Seconds), so valid
# positions are 0..2. Only 'Card Number' is categorical; Amount and the elapsed
# time are continuous. The index is looked up by name so it stays correct if the
# feature order changes.
# sampling_strategy=0.9 resamples the minority class up to 90% of the majority size.
smoteNC = SMOTENC(
    categorical_features=[X_train.columns.get_loc('Card Number')],
    random_state=42,
    sampling_strategy=0.9,
)
X_train, y_train = smoteNC.fit_resample(X_train, y_train)

In [None]:
# Show the number of training samples per class after oversampling
print(pd.Series(y_train).value_counts())

# Model Training

Hyperparameter tuning was performed for all models; only the best parameters found for each model are included below.

In [None]:
# K-Nearest Neighbours Classifier
# Neighbourhood Components Analysis transforms the features first; a
# distance-weighted 3-NN classifier (p=1, kd-tree search) is then fit on the result.
nca = NeighborhoodComponentsAnalysis(random_state=42)
knn = KNeighborsClassifier(
    algorithm='kd_tree',
    n_neighbors=3,
    weights='distance',
    p=1,
)
nca_pipe = Pipeline(steps=[('nca', nca), ('knn', knn)])
nca_pipe.fit(X_train, y_train)
# fit() returns the pipeline itself, so both names refer to the same fitted object
KNNmodel = nca_pipe

In [None]:
# Decision Tree Classifier
# Entropy split criterion with an unrestricted depth; the seed fixes tie-breaking
DTmodel = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
).fit(X_train, y_train)
DTmodel

In [None]:
# Random Forest Classifier
# 100 Gini-criterion trees with a fixed seed
RFmodel = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
).fit(X_train, y_train)
RFmodel

In [None]:
# AdaBoost Classifier
# No base estimator is passed, so the library default (a decision stump) is boosted
ABCmodel = AdaBoostClassifier(
    n_estimators=150,
    learning_rate=1.0,
    random_state=42,
).fit(X_train, y_train)
ABCmodel

In [None]:
# XGBoost Classifier
# 300 boosting rounds of depth-6 trees with a learning rate of 0.05
XGBmodel = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=6,
    min_child_weight=1,
    learning_rate=0.05,
).fit(X_train, y_train)
XGBmodel

In [None]:
# Stacking Classifier
# Base learners: a decision tree and a distance-weighted 3-NN classifier (p=1)
base_models = [
    ('decision_tree', DecisionTreeClassifier(random_state=42)),
    ('knn', KNeighborsClassifier(n_neighbors=3, p=1, weights='distance'))
]
# Meta-learner that combines the base learners' predictions
meta_learner = xgb.XGBClassifier(random_state=42)

# Create the stacking classifier
stacking_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_learner,  # was `meta_learner3`, an undefined name
    cv=3  # 3-fold cross-validated base predictions are used to train the meta-learner
)

# Train the stacking classifier
stacking_clf.fit(X_train, y_train)

# Results

In [None]:
# Display name -> fitted model; insertion order is the order in which results are printed
models_dict = dict([
    ('K-Nearest Neighbours', KNNmodel),
    ('Decision Tree', DTmodel),
    ('Random Forest', RFmodel),
    ('XGBoost', XGBmodel),
    ('AdaBoost', ABCmodel),
    ('Stacking', stacking_clf),
])

In [None]:
def evaluate_models(models_dict, X_test, y_test):
    """
    Evaluate multiple classifiers on test data and print performance metrics.

    For every model this prints the confusion matrix, the classification
    report, and the accuracy, precision, recall, F1 and ROC AUC scores.
    ROC AUC is computed from the column-1 scores of ``predict_proba`` when the
    model provides that method, and from the hard label predictions otherwise.

    An exception raised while evaluating one model is caught and printed so
    that the remaining models are still evaluated.

    Parameters:
    - models_dict (dict): Dictionary with model names as keys and trained models as values.
    - X_test (array-like): Test features.
    - y_test (array-like): True labels for the test set.

    Returns:
    - None. All results are written to standard output.
    """
    for model_name, model in models_dict.items():
        print(f"\nModel: {model_name}")
        try:
            # Hard label predictions, plus scores for the second class when available
            # (assumes a binary problem whose positive class is in column 1)
            y_pred = model.predict(X_test)
            y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

            # Metrics
            print("Confusion Matrix:")
            print(confusion_matrix(y_test, y_pred))

            print("\nClassification Report:")
            print(classification_report(y_test, y_pred))

            # zero_division=0 keeps the metrics at 0 (without a warning) when a
            # denominator is empty, e.g. a model that never predicts the positive class
            acc = accuracy_score(y_test, y_pred)
            prec = precision_score(y_test, y_pred, zero_division=0)
            rec = recall_score(y_test, y_pred, zero_division=0)
            f1 = f1_score(y_test, y_pred, zero_division=0)

            # Prefer continuous scores for ROC AUC; fall back to the hard labels
            roc_scores = y_proba if y_proba is not None else y_pred
            roc_auc = roc_auc_score(y_test, roc_scores)

            print(f"\nAccuracy: {acc:.4f}")
            print(f"Precision: {prec:.4f}")
            print(f"Recall: {rec:.4f}")
            print(f"F1 Score: {f1:.4f}")
            print(f"ROC AUC Score: {roc_auc:.4f}")

        except Exception as e:
            # Best effort: report the failure and carry on with the next model
            print(f"An error occurred while evaluating {model_name}: {e}")

In [None]:
# Print the evaluation metrics of every trained model on the held-out test split
evaluate_models(models_dict, X_test, y_test)