In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import kagglehub

from collections import Counter

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.utils.class_weight import compute_class_weight

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    f1_score,
    roc_auc_score,
    precision_recall_curve
)

from imblearn.over_sampling import SMOTE

In [None]:
!pip install kagglehub pandas
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
import os
import kagglehub

# Download the dataset; `path` is the local directory returned by kagglehub.
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")

# Sort the listing: os.listdir returns entries in arbitrary order, and the
# first CSV is picked below, so sorting keeps the choice deterministic.
dataset_files = sorted(os.listdir(path))
print("Dataset files:", dataset_files)

# Load the first CSV file into a DataFrame, failing with a clear message
# (instead of a bare IndexError) when the download contains no CSV.
csv_files = [f for f in dataset_files if f.lower().endswith(".csv")]
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in downloaded dataset directory: {path}")
csv_file = csv_files[0]
df = pd.read_csv(os.path.join(path, csv_file))

In [None]:
# Preview the first rows of the raw DataFrame.
df.head()

In [None]:
# Column names, dtypes and non-null counts of the raw DataFrame.
df.info()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Rows that repeat an earlier row exactly (the first occurrence is not flagged).
duplicate_mask = df.duplicated()
duplicates = df.loc[duplicate_mask]
print(f"Number of duplicate rows: {len(duplicates)}")

In [None]:
# Build a de-duplicated copy; the raw `df` is left untouched.
df_cleaned = df.drop_duplicates(keep="first")
print(f"Data after removing duplicates: {len(df_cleaned)} rows")

In [None]:
# Sanity check: the cleaned frame should contain no repeated rows.
remaining_duplicate_mask = df_cleaned.duplicated()
duplicates = df_cleaned.loc[remaining_duplicate_mask]
print(f"Number of duplicate rows: {len(duplicates)}")

In [None]:
# Preview the first rows of the de-duplicated DataFrame.
df_cleaned.head()

In [None]:
sns.set(style="whitegrid")

# Histogram (with KDE overlay) of the transaction amount.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df_cleaned, x='Amount', kde=True, color='blue', bins=50, ax=ax)
ax.set_title('Distribution of Transaction Amount')
ax.set_xlabel('Amount')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Share of each class label in the cleaned data, drawn as a pie chart.
class_distribution = df_cleaned['Class'].value_counts()

fig, ax = plt.subplots(figsize=(6, 6))
class_distribution.plot.pie(
    ax=ax,
    autopct='%1.1f%%',
    colors=['lightblue', 'salmon'],
    startangle=90,
)
ax.set_title('Class Distribution: Fraud (1) vs Non-Fraud (0)')
ax.set_ylabel('')
plt.show()

In [None]:
# Pairwise correlations between all columns of the cleaned data.
correlation_matrix = df_cleaned.corr()

# Heatmap of the correlation matrix (cell values are not annotated).
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    correlation_matrix,
    annot=False,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap of Features')
plt.show()

In [None]:
# Horizontal box plot of the transaction amount.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df_cleaned, x='Amount', color='orange', ax=ax)
ax.set_title('Box Plot for Transaction Amount')
ax.set_xlabel('Amount')
plt.show()

In [None]:
# Transaction amount plotted against the 'Time' column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df_cleaned, x='Time', y='Amount', color='purple', alpha=0.6, ax=ax)
ax.set_title('Time vs Transaction Amount')
ax.set_xlabel('Time (in seconds)')
ax.set_ylabel('Transaction Amount')
plt.show()

In [None]:
# Distribution of 'Time' for each value of 'Class', drawn as violins.
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(data=df_cleaned, x='Class', y='Time', palette='muted', ax=ax)
ax.set_title('Distribution of Time by Fraud Class')
ax.set_xlabel('Class (0 = Non-Fraud, 1 = Fraud)')
ax.set_ylabel('Time (in seconds)')
plt.show()

In [None]:
# Distribution of 'Amount' for each value of 'Class', drawn as box plots.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df_cleaned, x='Class', y='Amount', palette='coolwarm', ax=ax)
ax.set_title('Distribution of Transaction Amount by Class')
ax.set_xlabel('Class (0 = Non-Fraud, 1 = Fraud)')
ax.set_ylabel('Amount')
plt.show()

In [None]:
# Overlaid histograms of 'V1', one per class (non-fraud drawn first, then fraud).
fig, ax = plt.subplots(figsize=(10, 6))
for class_value, class_color, class_label in [(0, 'blue', 'Non-Fraud'), (1, 'red', 'Fraud')]:
    v1_values = df_cleaned.loc[df_cleaned['Class'] == class_value, 'V1']
    sns.histplot(v1_values, color=class_color, kde=True, label=class_label, bins=50, ax=ax)
ax.set_title('Distribution of V1 Feature by Fraud Class')
ax.set_xlabel('V1')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()

In [None]:
# Overlaid histograms of 'Amount', one per class (non-fraud drawn first, then fraud).
fig, ax = plt.subplots(figsize=(10, 6))
for class_value, class_color, class_label in [(0, 'lightgreen', 'Non-Fraud'), (1, 'lightcoral', 'Fraud')]:
    amount_values = df_cleaned.loc[df_cleaned['Class'] == class_value, 'Amount']
    sns.histplot(amount_values, color=class_color, kde=True, label=class_label, bins=50, ax=ax)
ax.set_title('Distribution of Transaction Amount by Class')
ax.set_xlabel('Transaction Amount')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()

**To address the class imbalance in the dataset, SMOTE (Synthetic Minority Over-sampling Technique) is applied to the training split only: it oversamples the minority class (fraudulent transactions) to balance the training data, while the test split keeps its original class distribution.**

# **Feature Importance**

In [None]:
# Target column and feature matrix (every column except 'Class').
y = df_cleaned['Class']
X = df_cleaned.drop(columns=['Class'])

# Standardise every feature; the result is a plain ndarray in X's column order.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Rank features with a chi-square test and keep the ten best.
# chi2 rejects negative inputs, hence the absolute value of the z-scores.
# NOTE(review): chi2 is documented for non-negative, count-like features, so
# scores computed on |z-scores| are a heuristic ranking — confirm this is intended.
selector = SelectKBest(score_func=chi2, k=10)
X_new = selector.fit_transform(np.abs(X_scaled), y)
selected_features = X.columns[selector.get_support()]
print("Top 10 selected features using SelectKBest:", list(selected_features))

In [None]:
# Fit a RandomForest on the standardised features and rank them by impurity-based importance.
model = RandomForestClassifier(random_state=42).fit(X_scaled, y)
importances = model.feature_importances_
feature_importance_df = (
    pd.DataFrame({"Feature": X.columns, "Importance": importances})
    .sort_values(by="Importance", ascending=False)
)
print("\nTop features based on RandomForest Importance:\n", feature_importance_df.head(10))

In [None]:
# Hand-picked feature subset (presumably chosen from the rankings printed above).
selected_features = ['V4', 'V7', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17']

# Select the columns straight from X rather than from `X_scaled`: the next cell
# rebinds `X_scaled` to an 8-column array, so rebuilding a frame from it with
# all of X's column names fails when this cell is re-run. Selecting from X also
# keeps X's row index. Scaling of these columns happens in the next cell.
X_selected = X[selected_features]

# Check the new shape of the dataset
print("Shape of X_selected:", X_selected.shape)

In [None]:
# Define the selected features
selected_features = ['V4', 'V7', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17']

# Raw (unscaled) values of the selected features
X_selected = X[selected_features]

# Split FIRST (80% train, 20% test) so that neither the scaler nor SMOTE ever
# sees test rows. `stratify=y` keeps the fraud ratio identical in both splits,
# which matters because the positive class is a tiny fraction of the data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training split only, then apply it to both splits.
# `scaler` stays fitted on the RAW selected features, so it is the object to
# reuse on new raw inputs at prediction time.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw), columns=selected_features, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=selected_features, index=X_test_raw.index
)

# Full scaled frame, kept under its previous name for any later use.
X_selected_scaled = pd.DataFrame(
    scaler.transform(X_selected), columns=selected_features, index=X_selected.index
)

# Apply SMOTE only to the training set; the test set keeps its real class balance.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Report the resulting shapes
print(f"Shape of X_train_resampled: {X_train_resampled.shape}")
print(f"Shape of y_train_resampled: {y_train_resampled.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_test: {y_test.shape}")

# **Random forest model**

In [None]:
# Function to plot confusion matrix
def plot_confusion_matrix(y_true, y_pred, model_name):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as an annotated heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        model_name: Name inserted into the figure title.
    """
    class_names = ["Class 0", "Class 1"]
    matrix = confusion_matrix(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        cmap="viridis",
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_title(f"Confusion Matrix for {model_name}")
    ax.set_xlabel("Predicted Labels")
    ax.set_ylabel("True Labels")
    plt.show()

# Initialize the Random Forest classifier with class weight adjustment
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced', n_jobs=-1)

# Train the model on the resampled training data
rf_model.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred_prob = rf_model.predict_proba(X_test)[:, 1]  # Probabilities for class 1

# Pick the threshold that maximises F1.
# NOTE: the threshold is tuned on the test set itself, so the metrics printed
# below are optimistic; a separate validation split would give an honest estimate.
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)
# precision/recall carry one trailing point (precision=1, recall=0) that has no
# matching threshold; drop it so indices line up with `thresholds`.
precision_at_thr, recall_at_thr = precision[:-1], recall[:-1]
pr_sum = precision_at_thr + recall_at_thr
# Define F1 as 0 where precision + recall == 0; a plain division would yield NaN,
# and np.argmax would then select the NaN entry.
f1_scores = np.divide(
    2 * precision_at_thr * recall_at_thr,
    pr_sum,
    out=np.zeros_like(pr_sum),
    where=pr_sum > 0,
)
optimal_threshold = thresholds[np.argmax(f1_scores)]

# precision_recall_curve treats "score >= threshold" as a positive prediction, so
# use >= here too; with `>` every sample scoring exactly at the threshold would be
# dropped, which matters for the discrete probabilities a forest produces.
y_pred = (y_pred_prob >= optimal_threshold).astype(int)

# Classification Report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Print Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate F1-Score and AUC-ROC score
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)

print("F1-Score:", f1)
print("ROC AUC Score:", roc_auc)

# Plot Confusion Matrix for Random Forest
plot_confusion_matrix(y_test, y_pred, "Random Forest")

# **XGBoost model**

In [None]:
# `plot_confusion_matrix` is defined once in the Random Forest cell above and
# reused here; the identical second definition was removed.

# Initialize the XGBoost classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)

# Train the model on the resampled training data
xgb_model.fit(X_train_resampled, y_train_resampled)

# Predict on the test set (probability of class 1)
y_probs = xgb_model.predict_proba(X_test)[:, 1]

# Pick the threshold that maximises F1.
# NOTE: the threshold is tuned on the test set itself, so the metrics printed
# below are optimistic; a separate validation split would give an honest estimate.
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)
# precision/recall carry one trailing point (precision=1, recall=0) that has no
# matching threshold; drop it so indices line up with `thresholds`.
precision_at_thr, recall_at_thr = precision[:-1], recall[:-1]
pr_sum = precision_at_thr + recall_at_thr
# Define F1 as 0 where precision + recall == 0; a plain division would yield NaN,
# and np.argmax would then select the NaN entry.
f1_scores = np.divide(
    2 * precision_at_thr * recall_at_thr,
    pr_sum,
    out=np.zeros_like(pr_sum),
    where=pr_sum > 0,
)
optimal_threshold = thresholds[np.argmax(f1_scores)]

# precision_recall_curve treats "score >= threshold" as a positive prediction,
# so the same comparison is used when applying the threshold.
y_pred = (y_probs >= optimal_threshold).astype(int)

# Classification Report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Print Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Plot Confusion Matrix for XGBoost
plot_confusion_matrix(y_test, y_pred, "XGBoost")

# **Logistic Regression classifier**

In [None]:
# Initialize the Logistic Regression model
logreg_model = LogisticRegression(random_state=42)

# Train the model on the resampled training data
logreg_model.fit(X_train_resampled, y_train_resampled)

# Predict on the test set (probability of class 1)
y_probs_logreg = logreg_model.predict_proba(X_test)[:, 1]

# Pick the threshold that maximises F1.
# NOTE: the threshold is tuned on the test set itself, so the metrics printed
# below are optimistic; a separate validation split would give an honest estimate.
precision_logreg, recall_logreg, thresholds_logreg = precision_recall_curve(y_test, y_probs_logreg)
# precision/recall carry one trailing point (precision=1, recall=0) that has no
# matching threshold; drop it so indices line up with `thresholds_logreg`.
precision_at_thr_logreg, recall_at_thr_logreg = precision_logreg[:-1], recall_logreg[:-1]
pr_sum_logreg = precision_at_thr_logreg + recall_at_thr_logreg
# Define F1 as 0 where precision + recall == 0; a plain division would yield NaN,
# and np.argmax would then select the NaN entry.
f1_scores_logreg = np.divide(
    2 * precision_at_thr_logreg * recall_at_thr_logreg,
    pr_sum_logreg,
    out=np.zeros_like(pr_sum_logreg),
    where=pr_sum_logreg > 0,
)
optimal_threshold_logreg = thresholds_logreg[np.argmax(f1_scores_logreg)]

# precision_recall_curve treats "score >= threshold" as a positive prediction,
# so the same comparison is used when applying the threshold.
y_pred_logreg = (y_probs_logreg >= optimal_threshold_logreg).astype(int)

# Classification Report
print("Classification Report (Logistic Regression):\n", classification_report(y_test, y_pred_logreg))

# Confusion Matrix
print("Confusion Matrix (Logistic Regression):\n", confusion_matrix(y_test, y_pred_logreg))

# Print Accuracy
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print("Accuracy (Logistic Regression):", accuracy_logreg)

plot_confusion_matrix(y_test, y_pred_logreg, "Logistic Regression")

## **Cross-validation**

In [None]:
# Fresh, unfitted estimators used ONLY for cross-validation.
# They are deliberately NOT bound to rf_model / xgb_model / logreg_model:
# cross_val_score fits internal clones and leaves the estimator passed in
# unfitted, so reusing those names would replace the trained models with
# unfitted ones before they are saved and used for prediction.
cv_rf_model = RandomForestClassifier(random_state=42)
cv_xgb_model = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
cv_logreg_model = LogisticRegression(random_state=42)

# Perform 5-fold cross-validation on the (non-resampled) training split.
# The default scoring for classifiers is accuracy, which is dominated by the
# majority class on data this imbalanced — read these numbers with that in mind.
rf_scores = cross_val_score(cv_rf_model, X_train, y_train, cv=5)
xgb_scores = cross_val_score(cv_xgb_model, X_train, y_train, cv=5)
logreg_scores = cross_val_score(cv_logreg_model, X_train, y_train, cv=5)

# Print results
print("Random Forest CV Scores:", rf_scores)
print("Random Forest Mean Accuracy:", rf_scores.mean())

print("\nXGBoost CV Scores:", xgb_scores)
print("XGBoost Mean Accuracy:", xgb_scores.mean())

print("\nLogistic Regression CV Scores:", logreg_scores)
print("Logistic Regression Mean Accuracy:", logreg_scores.mean())

# **Saving the Models**

In [None]:
import joblib

# Persist the three classifiers with joblib under /content.
# NOTE(review): whatever rf_model / xgb_model / logreg_model refer to when this
# cell runs is what gets pickled. cross_val_score fits clones only, so confirm
# no earlier cell has rebound these names to fresh, unfitted estimators.

# Save the Random Forest model
joblib.dump(rf_model, '/content/rf_model.pkl')

# Save the XGBoost model
joblib.dump(xgb_model, '/content/xgb_model.pkl')

# Save the Logistic Regression model
joblib.dump(logreg_model, '/content/logreg_model.pkl')

print("Models saved successfully.")

In [None]:
# Persist the scaler that was fitted on the RAW selected features in the
# split/scale cell. Do not fit a new StandardScaler on X_train here: X_train is
# already standardised, so a scaler fitted on it is close to an identity
# transform and would pass raw user inputs to the models essentially unscaled.
assert getattr(scaler, "n_features_in_", None) == len(selected_features), (
    "scaler must be the StandardScaler fitted on the selected features"
)

# Save the scaler
joblib.dump(scaler, '/content/scaler.pkl')

# **Real-Time Fraud Detection Prediction System with Model Selection**

In [None]:
# NOTE(review): the next cell starts with these same imports, loads and feature
# list, so this cell is redundant and one of the two copies can be removed.
import ipywidgets as widgets
from IPython.display import display

# Load the scaler and models
scaler = joblib.load('/content/scaler.pkl')
rf_model = joblib.load('/content/rf_model.pkl')
xgb_model = joblib.load('/content/xgb_model.pkl')
logreg_model = joblib.load('/content/logreg_model.pkl')

# Selected features used in the model
selected_features = ['V4', 'V7', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17']

In [None]:
import ipywidgets as widgets
from IPython.display import display
import pandas as pd
import joblib

# Restore the persisted scaler and the three classifiers.
scaler = joblib.load('/content/scaler.pkl')
rf_model = joblib.load('/content/rf_model.pkl')
xgb_model = joblib.load('/content/xgb_model.pkl')
logreg_model = joblib.load('/content/logreg_model.pkl')

# Feature columns the models consume, in this order.
selected_features = ['V4', 'V7', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17']

# Shared description style for the numeric inputs.
input_style = {'description_width': 'initial'}

# One FloatText per selected feature, all built the same way and then bound to
# the individual names the callbacks below refer to.
feature_inputs = [
    widgets.FloatText(
        value=0.0,
        description=f"{feature_name}:",
        style=input_style,
        layout=widgets.Layout(width='250px'),
    )
    for feature_name in selected_features
]
(V4_input, V7_input, V10_input, V11_input,
 V12_input, V14_input, V16_input, V17_input) = feature_inputs

# Dropdown used to choose which loaded model makes the prediction.
model_choice_widget = widgets.Dropdown(
    options=['Random forest', 'XGBoost', 'Logistic Regression'],
    value='Random forest',
    description='Model:',
    style=dict(description_width='initial'),
    layout=widgets.Layout(width='250px'),
)

# Area in which prediction feedback is shown.
output = widgets.Output()

# Function to display feedback
def show_feedback(msg):
    """Render ``msg`` as a styled heading inside the ``output`` widget.

    The markup is displayed through an HTML widget; printing it would show the
    raw ``<h4 ...>`` tags as plain text instead of a formatted heading.

    Args:
        msg: Text to show; it is inserted into the heading markup as-is.
    """
    with output:
        display(widgets.HTML(
            value=f"<h4 style='color: #FF5722; font-family: Arial, sans-serif;'>{msg}</h4>"
        ))

# Function to predict and display results when button is clicked
def predict_model(V4, V7, V10, V11, V12, V14, V16, V17, model_choice):
    """Scale one transaction's feature values, classify it and show the verdict.

    Args:
        V4, V7, V10, V11, V12, V14, V16, V17: Raw (unscaled) values of the
            selected features for a single transaction.
        model_choice: One of 'Random forest', 'XGBoost' or 'Logistic Regression'.

    The result is written to the ``output`` widget via ``show_feedback``;
    nothing is returned. An unrecognised ``model_choice`` is reported in the
    widget instead of failing on an unbound variable.
    """
    models_by_name = {
        'Random forest': rf_model,
        'XGBoost': xgb_model,
        'Logistic Regression': logreg_model,
    }

    # Drop the previous result so messages do not pile up in the fixed-height area.
    output.clear_output(wait=True)

    with output:
        model = models_by_name.get(model_choice)
        if model is None:
            show_feedback(f"Unknown model choice: {model_choice!r}")
            return

        # Single-row frame, reordered to the column order used for training.
        input_data = pd.DataFrame({
            'V4': [V4],
            'V7': [V7],
            'V10': [V10],
            'V11': [V11],
            'V12': [V12],
            'V14': [V14],
            'V16': [V16],
            'V17': [V17]
        })[selected_features]

        # Scale with the persisted scaler, then restore the column names: the
        # models were fitted on DataFrames, and a bare ndarray would trigger
        # feature-name warnings or mismatches at predict time.
        input_data_scaled = pd.DataFrame(
            scaler.transform(input_data), columns=selected_features
        )

        # Make the prediction (uses the model's default decision rule)
        prediction = model.predict(input_data_scaled)

        # Display the prediction
        if prediction[0] == 1:
            show_feedback("🚨 Prediction: Fraudulent!")
        else:
            show_feedback("✅ Prediction: Non-fraudulent")

# Function to trigger prediction on button click
def on_button_click(b):
    """Button callback: read the current widget values and run a prediction.

    Args:
        b: The clicked button instance supplied by ipywidgets (unused).
    """
    # Widgets listed in the positional order predict_model expects.
    feature_widgets = (
        V4_input, V7_input, V10_input, V11_input,
        V12_input, V14_input, V16_input, V17_input,
    )
    feature_values = [feature_widget.value for feature_widget in feature_widgets]

    predict_model(*feature_values, model_choice_widget.value)

# Button that triggers a prediction when clicked.
predict_button = widgets.Button(
    description="Predict",
    button_style='success',
    layout=widgets.Layout(width='250px'),
)
predict_button.on_click(on_button_click)

# Centered title shown above the inputs.
heading = widgets.HTML(value="<h2 style='text-align:center; color:#FF5722; font-family:Arial, sans-serif;'>Fraud Detection Prediction</h2>")

# Stack title, feature inputs, model selector and button vertically.
# NOTE(review): `background_color` does not look like a Layout trait in
# ipywidgets, so it probably has no visual effect here or below — confirm.
form_children = [
    heading,
    V4_input, V7_input, V10_input, V11_input, V12_input, V14_input, V16_input, V17_input,
    model_choice_widget, predict_button,
]
input_widgets = widgets.VBox(
    form_children,
    layout=widgets.Layout(padding='10px', background_color='#F5F5F5', border='2px solid #FF5722'),
)

# Style the output area, one layout attribute at a time.
output_layout_settings = {
    'height': '150px',
    'border': '2px solid #FF5722',
    'padding': '10px',
    'margin': '10px',
    'background_color': '#f9f9f9',
}
for layout_attr, layout_value in output_layout_settings.items():
    setattr(output.layout, layout_attr, layout_value)

# Show the form followed by the output area.
display(input_widgets, output)
