# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the credit-card transactions CSV straight from the public GitHub mirror.
DATA_URL = 'https://raw.githubusercontent.com/nsethi31/Kaggle-Data-Credit-Card-Fraud-Detection/master/creditcard.csv'
credit_card_data = pd.read_csv(filepath_or_buffer=DATA_URL)

# Exploratory Data Analysis (EDA)

In [None]:
# List the column labels of the loaded frame.
credit_card_data.columns

In [None]:
# Preview the first five rows.
credit_card_data.head(n=5)

In [None]:
# Print the frame summary (column dtypes, non-null counts, memory usage).
credit_card_data.info()

In [None]:
# Count missing values per column.
credit_card_data.isna().sum()

In [None]:
# Number of rows per value of the target column "Class".
credit_card_data.loc[:, 'Class'].value_counts()

# Data Pre-Processing

In [None]:
# Drop the "Time" column before modelling.
# errors="ignore" keeps this cell re-runnable: on a second execution the
# column is already gone and a plain drop would raise KeyError.
credit_card_data = credit_card_data.drop(columns="Time", errors="ignore")

In [None]:
from sklearn import preprocessing
# Standardizing scaler; it is fitted on the "Amount" column in the next cell.
scaler = preprocessing.StandardScaler()

In [None]:
# Replace the raw "Amount" column with its standardized version "std_Amount".
# The guard makes the cell idempotent: once "Amount" has been dropped, a
# re-run is a no-op instead of a KeyError.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split further down, so held-out rows influence the scaling
# statistics. Consider fitting on the training rows only.
if "Amount" in credit_card_data.columns:
    amount_2d = credit_card_data["Amount"].to_numpy().reshape(-1, 1)
    # fit_transform returns an (n, 1) array; flatten it to a 1-D column.
    credit_card_data["std_Amount"] = scaler.fit_transform(amount_2d).ravel()
    credit_card_data = credit_card_data.drop(columns="Amount")

In [None]:
# Bar chart of how many rows fall into each "Class" value.
sns.countplot(data=credit_card_data, x="Class")

# Undersampling

In [None]:
import imblearn
from imblearn.under_sampling import RandomUnderSampler

# Randomly drop majority-class rows until the minority/majority ratio is 0.5.
# A fixed random_state makes the resampled dataset (and every metric computed
# from it below) reproducible across kernel restarts.
undersample = RandomUnderSampler(sampling_strategy=0.5, random_state=42)

In [None]:
# Name of the label column, and every other column as a feature.
target = "Class"
cols = [name for name in credit_card_data.columns if name != target]

In [None]:
# Split the frame into the feature matrix and the label vector ...
X, Y = credit_card_data[cols], credit_card_data[target]

# ... then apply the undersampler configured above to both.
X_under, Y_under = undersample.fit_resample(X, Y)

In [None]:
from pandas import DataFrame
# One-column frame holding the resampled labels, used for plotting below.
test = pd.DataFrame({'Class': Y_under})

In [None]:
# Side-by-side class counts: full dataset (left) vs. undersampled labels (right).
fig, axs = plt.subplots(ncols=2, figsize=(13, 4.5))
before_ax, after_ax = axs

sns.countplot(x="Class", data=credit_card_data, ax=before_ax)
sns.countplot(x="Class", data=test, ax=after_ax)

fig.suptitle("Class repartition before and after undersampling")
before_ax.set_title("Before")
after_ax.set_title("After")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the undersampled data for evaluation.
# stratify=Y_under keeps the class ratio identical in both partitions, so the
# small test set cannot end up with a skewed share of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X_under,
    Y_under,
    test_size=0.2,
    random_state=1,
    stratify=Y_under,
)

# Model Selection

In [None]:
pip install lazypredict

In [None]:
from lazypredict.Supervised import LazyClassifier

# Fit LazyClassifier on the training split and score it on the test split to
# get a quick comparison table of candidate models.
clf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
    random_state=42,
)

models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Leave the results as the cell's last expression so the notebook renders
# them with its rich table display instead of print()'s plain text.
models

# Classification Model using XGB

In [None]:
from xgboost import XGBClassifier

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report

In [None]:
# XGBoost classifier evaluated with log-loss.
# NOTE(review): `use_label_encoder` is deprecated/ignored in recent XGBoost
# releases — confirm against the installed version.
model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
)

In [None]:
# Train the classifier on the training split.
model.fit(X=X_train, y=y_train)

# Model Evaluation

In [None]:
# Second column of predict_proba; presumably the probability of class 1 (fraud).
y_prob = model.predict_proba(X_test)[:, 1]
# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)

In [None]:
# Compute the test-set metrics first, then print them in a fixed order.
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

print("Confusion Matrix:")
print(cm)
print("\nClassification Report:")
print(report)
print(f"Accuracy: {acc:.2f}")

# Model Validation

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# 10-fold stratified cross-validation of the classifier on the training split.
print("\n🔁 10-Fold Cross-Validation Results:")
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=kfold,
)

# Per-fold scores followed by their mean and spread.
for summary_line in (
    f"Cross-Validation Scores: {cv_scores}",
    f"Mean Accuracy: {cv_scores.mean():.4f}",
    f"Standard Deviation: {cv_scores.std():.4f}",
):
    print(summary_line)

# Model Performance Visualization

In [None]:
# Confusion matrix of the XGBoost predictions on the test split, as a heatmap.
# Rows are true classes, columns are predicted classes.
# NOTE(review): the labels assume class 0 = not fraud and class 1 = fraud.
conf_matrix = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    conf_matrix,
    index=['not_fraud', 'fraud'],
    columns=['not_fraud', 'fraud'],
)

sns.heatmap(cm_df, annot=True, cbar=False, cmap="Blues", fmt='g')
# The model plotted here is the XGBClassifier, not an SVM.
plt.title("Confusion Matrix XGBoost")
plt.ylabel("True Class")
plt.xlabel("Predicted Class")
# Run tight_layout after the title and labels are set so they are accounted for.
plt.tight_layout()
plt.show()

In [None]:
# ROC curve of the predicted probabilities on the test split.
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = roc_auc_score(y_test, y_prob)

# One explicit axes: the previous subplot(1, 2, 1) on a 12x5 figure left the
# right half of the figure blank.
fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.4f}')
ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
plt.show()

In [None]:
# Precision-recall curve of the predicted probabilities on the test split.
precision, recall, _ = precision_recall_curve(y_test, y_prob)

# Baseline precision of a no-skill classifier: the share of positive
# (class 1) rows in the test labels.
no_skill = (y_test == 1).mean()

fig, ax = plt.subplots()
ax.plot([0, 1], [no_skill, no_skill], linestyle='--', color='black', label='No Skill')
# The curve belongs to the XGBClassifier fitted above, not to an SVM.
ax.plot(recall, precision, color='orange', label='XGBoost')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall curve')
ax.legend()
plt.show()

In [None]:
# Rank features by the fitted model's importance scores.
# Labels come from X_train, the frame the model was fitted on, so names and
# scores stay aligned even if the earlier `X` is modified.
feature_importance = pd.Series(
    model.feature_importances_,
    index=X_train.columns,
).sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
feature_importance.head(15).plot(kind='barh', ax=ax)
# barh draws the first row at the bottom; invert so the top feature is on top.
ax.invert_yaxis()
ax.set_xlabel('Importance')
ax.set_title('Top 15 Important Features')
plt.show()