<a href="https://colab.research.google.com/github/Rupavathsipayi/C-language/blob/main/Copy_of_fraud_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the Kaggle dataset 'kartik2112/fraud-detection' through kagglehub and
# keep the path it returns in `kartik2112_fraud_detection_path`.
# Run this cell first when the data is not already available locally.
# NOTE: this notebook environment differs from Kaggle's Python environment, so
# libraries used by the notebook may be missing.
import kagglehub
kartik2112_fraud_detection_path = kagglehub.dataset_download('kartik2112/fraud-detection')

print('Data source import complete.')


# Read & Load Data

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from sklearn.cluster import DBSCAN
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, accuracy_score
from mpl_toolkits.mplot3d import Axes3D  # For 3D plots
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score


In [None]:
from pathlib import Path

# Directories searched for the CSV files, in priority order: the kagglehub
# download directory (when the download cell has been run), then the Colab
# upload directory the notebook originally hard-coded.
try:
    data_dirs = [Path(kartik2112_fraud_detection_path), Path('/content')]
except NameError:  # download cell skipped or deleted
    data_dirs = [Path('/content')]


def _resolve_csv(filename):
    """Return the first existing `<dir>/<filename>` among `data_dirs`.

    Falls back to `/content/<filename>` when nothing is found, so a missing
    file still raises the same FileNotFoundError as the hard-coded path did.
    """
    for directory in data_dirs:
        candidate = directory / filename
        if candidate.is_file():
            return candidate
    return Path('/content') / filename


df = pd.read_csv(_resolve_csv('fraudTrain.csv'))
t_df = pd.read_csv(_resolve_csv('fraudTest.csv'))
df

In [None]:
# Raw class balance: number of rows per `is_fraud` value in the training data.
df['is_fraud'].value_counts()

# Preprocessing

In [None]:
def preprocess_data(df):
    """Return a fully numeric, MinMax-scaled copy of a raw transactions frame.

    Steps, in order:
      1. Drop identifier, name, timestamp and merchant columns.
      2. Clip ``amt`` and ``city_pop`` to their 1.5 * IQR fences.
      3. Map ``gender`` to 1 for ``'M'`` and 0 for anything else.
      4. Label-encode every remaining object/category column.
      5. Scale every column (``is_fraud`` included) to the [0, 1] range.

    The gender mapping must run BEFORE label encoding: once the column has
    been encoded to integers, the comparison with ``'M'`` is never true and
    the whole column collapses to 0.

    NOTE(review): encoders and scaler are fitted on whichever frame is passed
    in, so calling this separately for train and test learns separate category
    codes and separate min/max ranges -- confirm that this is intended.

    Args:
        df: Raw frame containing the columns dropped in step 1. It is not
            modified; all work happens on the frame returned by ``drop``.

    Returns:
        A new DataFrame with the remaining columns, all numeric and scaled,
        indexed by a fresh default RangeIndex.
    """
    # Drop unnecessary columns
    df = df.drop(
        columns=['cc_num', 'trans_date_trans_time', 'first', 'last', 'dob',
                 'street', 'trans_num', 'unix_time', 'merchant']
    )

    # Handle outliers for 'amt' and 'city_pop' (only when they are numeric)
    numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
    for col in ('amt', 'city_pop'):
        if col not in numerical_columns:
            continue
        q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
        iqr = q3 - q1
        df[col] = df[col].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)

    # Convert gender to binary while it still holds the raw 'M'/'F' strings
    if 'gender' in df.columns:
        df['gender'] = (df['gender'] == 'M').astype(int)

    # Encode the remaining categorical variables, one fresh encoder per column
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    for col in categorical_columns:
        df[col] = LabelEncoder().fit_transform(df[col])

    # Normalize all features to [0, 1]
    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

    return df_scaled



In [None]:
# Run the preprocessing pipeline on the train and test frames independently.
df_preprocessed, t_df_preprocessed = (
    preprocess_data(raw_frame) for raw_frame in (df, t_df)
)

# Preview the first rows of each processed frame (train first, then test).
for processed_frame in (df_preprocessed, t_df_preprocessed):
    print(processed_frame.head())

In [None]:
# Count missing values per column of the preprocessed training frame.
df_preprocessed.isna().sum()

In [None]:
# Check column types, missing values and the shape of the data.
df_preprocessed.info()
# The null counts should be zero.
# Any remaining object column must be converted to a numerical value.

In [None]:
# Summary statistics for the numerical (int or float) columns.
df_preprocessed.describe()

In [None]:
# Number of fully duplicated rows in the preprocessed training frame.
print(df_preprocessed.duplicated().sum())


In [None]:
# Data type of every column after preprocessing.
df_preprocessed.dtypes

In [None]:
# Separate the `is_fraud` label from the features for both frames.
label_column = 'is_fraud'
X_train, y_train = df_preprocessed.drop(columns=label_column), df_preprocessed[label_column]
X_test, y_test = t_df_preprocessed.drop(columns=label_column), t_df_preprocessed[label_column]

# EDA and Visualizations

### Class Imbalance

In [None]:
# Distribution of the (scaled) transaction amount for each class.
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(x='is_fraud', y='amt', data=df_preprocessed, palette='coolwarm', ax=ax)
ax.set_title('Violin Plot: Transaction Amount by Fraud')
ax.set_xlabel('Is Fraud (1 = Fraud, 0 = Non-Fraud)')
ax.set_ylabel('Transaction Amount')
plt.show()

In [None]:
# KDE plot of the (scaled) transaction amount, one density per class.
# Title, axis labels and an explicit show() are added so the figure matches
# the city-population KDE cell further down.
plt.figure(figsize=(10, 6))
kde_plot = sns.kdeplot(data=df_preprocessed,
                       x='amt',
                       hue='is_fraud',
                       palette='coolwarm',
                       fill=True)
kde_plot.set_title('KDE Plot: Transaction Amount by Fraud')
kde_plot.set_xlabel('Transaction Amount')
kde_plot.set_ylabel('Density')
plt.show()

### Geospatial Analysis

In [None]:
# Scatter the raw transaction coordinates, coloured by fraud label.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df, x='long', y='lat', hue='is_fraud',
                palette='coolwarm', alpha=0.6, ax=ax)
ax.set_title('Geospatial Visualization of Fraud')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.legend(title='Is Fraud', loc='upper right')
plt.show()

### Temporal Analysis

In [None]:
# Parse the transaction timestamps in place. The format is given explicitly;
# errors='coerce' turns any value that does not match it into NaT instead of
# raising. If the format turns out to be inconsistent, format='mixed' is an
# alternative to try.
timestamp_column = 'trans_date_trans_time'
df[timestamp_column] = pd.to_datetime(
    df[timestamp_column], errors='coerce', format='%Y-%m-%d %H:%M:%S'
)

# Number of fraudulent transactions per calendar day.
transaction_day = df[timestamp_column].dt.date
fraud_over_time = df.groupby(transaction_day)['is_fraud'].sum()

# Time-series plot of the daily fraud count.
plt.figure(figsize=(12, 6))
ax = fraud_over_time.plot()
ax.set_title('Fraud Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Fraud Count')
plt.show()

### Features

#### Transaction Amount, City Population, and Fraud

In [None]:
# 3D scatter: amount (x) vs city population (y) vs fraud flag (z, also colour).
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

fraud_flag = df_preprocessed['is_fraud']
scatter = ax.scatter(
    df_preprocessed['amt'],
    df_preprocessed['city_pop'],
    fraud_flag,
    c=fraud_flag,
    cmap='coolwarm',
    s=20,
)

# Axis labels and title
ax.set_xlabel('Transaction Amount (amt)')
ax.set_ylabel('City Population')
ax.set_zlabel('Is Fraud (1 = Fraud, 0 = Non-Fraud)')
ax.set_title('3D Scatter Plot: Transaction Amount, City Population, and Fraud')
plt.show()

#### City Population

In [None]:
# KDE plot of the (scaled) city population, one density per class.
plt.figure(figsize=(10, 6))
ax = sns.kdeplot(data=df_preprocessed, x='city_pop', hue='is_fraud', palette='coolwarm', fill=True)
ax.set_title('KDE Plot: City Population by Fraud')
ax.set_xlabel('City Population')
ax.set_ylabel('Density')

# Relabel the legend seaborn already built instead of calling
# plt.legend(labels=...): that call attaches the labels to the artists in draw
# order, which need not match the hue order and can swap the two classes.
# seaborn's own legend lists the hue levels in ascending order (0, then 1).
legend = ax.get_legend()
if legend is not None:
    legend.set_title('Is Fraud')
    for entry, label in zip(legend.get_texts(), ['Non-Fraud (0)', 'Fraud (1)']):
        entry.set_text(label)
plt.show()

In [None]:
# Share of each class in the raw training data (value_counts sorts by
# frequency, so the first slice is the more frequent class).
fraud_proportion = df['is_fraud'].value_counts(normalize=True)

# Pie chart of the two shares.
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    fraud_proportion,
    labels=['Non-Fraud', 'Fraud'],
    autopct='%1.1f%%',
    colors=['lightblue', 'lightcoral'],
)
ax.set_title('Proportion of Fraudulent vs Non-Fraudulent Transactions')
plt.show()

In [2]:

# Histogram (50 bins) with KDE overlay of the scaled transaction amount.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_preprocessed['amt'], bins=50, kde=True, color='blue', ax=ax)
ax.set_title('Distribution of Transaction Amount (amt)')
ax.set_xlabel('Transaction Amount')
ax.set_ylabel('Frequency')
plt.show()


NameError: name 'plt' is not defined

In [None]:

# Histogram (50 bins) with KDE overlay of the scaled city population.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_preprocessed['city_pop'], bins=50, kde=True, color='green', ax=ax)
ax.set_title('Distribution of City Population')
ax.set_xlabel('City Population')
ax.set_ylabel('Frequency')
plt.show()


In [None]:

# Transaction counts per gender value, split by fraud label.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df_preprocessed, x='gender', hue='is_fraud', palette='coolwarm', ax=ax)
ax.set_title('Fraud Distribution by Gender')
ax.set_xlabel('Gender (1 = Male, 0 = Female)')
ax.set_ylabel('Count')
ax.legend(title='Is Fraud', loc='upper right')
plt.show()

### Correlation Matrix

In [3]:

# Pairwise correlation of every preprocessed column, drawn as an annotated heatmap.
corr = df_preprocessed.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()


NameError: name 'plt' is not defined

# Apply NearMiss Undersampling

In [None]:
# nm = NearMiss(version=1)
# X_resampled, y_resampled = nm.fit_resample(X_train, y_train)

# print("Class distribution after NearMiss:", Counter(y_resampled))


#  Apply SMOTE Oversampling

In [4]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Remove training rows whose label is missing, keeping X_train and y_train
# aligned. This runs BEFORE the "before" report so that the printed
# distribution describes exactly the data handed to SMOTE.
has_label = y_train.notna()
X_train = X_train.loc[has_label]
y_train = y_train.loc[has_label]

# Class distribution before applying SMOTE
print("Class distribution before SMOTE:", Counter(y_train))

# sampling_strategy='auto' oversamples the minority class until both classes
# have the same number of rows; random_state fixes the synthetic samples.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Class distribution after applying SMOTE
print("Class distribution after SMOTE:", Counter(y_train_balanced))


NameError: name 'y_train' is not defined

In [None]:
# Report and plot the class counts of the SMOTE-balanced labels.
balanced_counts = y_train_balanced.value_counts()
print(balanced_counts)
ax = sns.countplot(x=y_train_balanced)
ax.set_title("Class Distribution")
plt.show()


# Machine Learning Models

## Isolation Forest
Isolates anomalies by randomly splitting the data.

In [None]:
# Drop test rows without a label so that accuracy can be computed; X_test and
# y_test are filtered together to stay aligned.
if y_test.isnull().any():
    nan_indices = y_test[y_test.isnull()].index
    X_test = X_test.drop(index=nan_indices)
    y_test = y_test.drop(index=nan_indices)
    print("Removed rows with NaN values from X_test and y_test.")
else:
    print("No NaN values found in y_test.")

# Fit the Isolation Forest on the SMOTE-balanced training data.
# NOTE(review): contamination=0.02 tells the model to flag 2% of rows, while
# the balanced set is half fraud by construction -- confirm that training on
# the balanced data (rather than X_train) is really intended.
iso_forest = IsolationForest(contamination=0.02, random_state=42)
iso_forest.fit(X_train_balanced)

# predict() returns -1 for outliers and 1 for inliers; map to 1 = fraud, 0 = normal.
train_pred = np.where(iso_forest.predict(X_train_balanced) == -1, 1, 0)
test_pred = np.where(iso_forest.predict(X_test) == -1, 1, 0)

# Accuracy on the balanced training data and on the test set
train_accuracy = accuracy_score(y_train_balanced, train_pred)
test_accuracy = accuracy_score(y_test, test_pred)

# Report the results (previously computed but never shown)
print("Isolation Forest - Train Accuracy: {:.4f}".format(train_accuracy))
print("Isolation Forest - Test Accuracy: {:.4f}".format(test_accuracy))
print("Classification Report for Test Set:")
print(classification_report(y_test, test_pred))

In [None]:
# Confusion matrix of the Isolation Forest test predictions, drawn as a heatmap.
class_names = ['Non-Fraud', 'Fraud']
cm = confusion_matrix(y_test, test_pred)
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title('Confusion Matrix for Isolation Forest')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# Isolation Forest decision scores for the test set; the sign is flipped
# before they are passed to the ROC utilities as the fraud score.
test_scores = iso_forest.decision_function(X_test)
fraud_scores = -test_scores

# ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, fraud_scores)
auc_score = roc_auc_score(y_test, fraud_scores)

# Plot the curve against the random-guess diagonal
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc_score:.2f})')
ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
ax.set_title('ROC Curve for Isolation Forest')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
plt.show()

## Local Outlier Factor (LOF)
Compares local density of a point to its neighbors.

In [None]:
lof = LocalOutlierFactor(contamination=0.02)

# fit_predict marks outliers with -1; turn that into 1 = flagged, 0 = not flagged.
# The model is fitted separately on the training set and then on the test set,
# so after this cell `lof` holds the fit on X_test.
train_pred = (lof.fit_predict(X_train) == -1).astype(int)
test_pred = (lof.fit_predict(X_test) == -1).astype(int)

# Accuracy on both sets
train_accuracy = accuracy_score(y_train, train_pred)
test_accuracy = accuracy_score(y_test, test_pred)

# Report
print(f"Local Outlier Factor (LOF) - Train Accuracy: {train_accuracy:.4f}")
print(f"Local Outlier Factor (LOF) - Test Accuracy: {test_accuracy:.4f}")
print("Classification Report for Test Set:")
print(classification_report(y_test, test_pred))

In [None]:
# Confusion matrix of the LOF test predictions, drawn as a heatmap.
class_names = ['Non-Fraud', 'Fraud']
cm = confusion_matrix(y_test, test_pred)
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title('Confusion Matrix for Local Outlier Factor (LOF)')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# Sign-flipped negative_outlier_factor_ of the most recent LOF fit, used as
# the fraud score for the test set.
test_scores = -lof.negative_outlier_factor_

# ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, test_scores)
auc_score = roc_auc_score(y_test, test_scores)

# Plot the curve against the random-guess diagonal
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc_score:.2f})')
ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
ax.set_title('ROC Curve for Local Outlier Factor (LOF)')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
plt.show()

## One-Class SVM
Learns a boundary around normal data points.

In [None]:
# WARNING: this cell takes a very long time to run.
one_class_svm = OneClassSVM(nu=0.02)
one_class_svm.fit(X_train)

# predict() marks outliers with -1; turn that into 1 = flagged, 0 = not flagged.
train_pred = (one_class_svm.predict(X_train) == -1).astype(int)
test_pred = (one_class_svm.predict(X_test) == -1).astype(int)

# Accuracy on both sets
train_accuracy = accuracy_score(y_train, train_pred)
test_accuracy = accuracy_score(y_test, test_pred)

# Report
print(f"One-Class SVM - Train Accuracy: {train_accuracy:.4f}")
print(f"One-Class SVM - Test Accuracy: {test_accuracy:.4f}")
print("Classification Report for Test Set:")
print(classification_report(y_test, test_pred))

In [None]:
# Confusion matrix of the One-Class SVM test predictions, drawn as a heatmap.
class_names = ['Non-Fraud', 'Fraud']
cm = confusion_matrix(y_test, test_pred)
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title('Confusion Matrix for One-Class SVM')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# One-Class SVM decision scores for the test set; the negated scores are used
# to align with the ROC curve convention.
test_scores = one_class_svm.decision_function(X_test)
fraud_scores = -test_scores

# ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, fraud_scores)
auc_score = roc_auc_score(y_test, fraud_scores)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc_score:.2f})')
ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
ax.set_title('ROC Curve for One-Class SVM')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
plt.show()

## DBSCAN
Groups data into clusters; points not in any cluster are outliers.

In [1]:
dbscan = DBSCAN(eps=0.5, min_samples=10)

# fit_predict labels points outside every cluster with -1; those become
# 1 = flagged, everything else 0. Train and test sets are clustered separately.
train_pred = (dbscan.fit_predict(X_train) == -1).astype(int)
test_pred = (dbscan.fit_predict(X_test) == -1).astype(int)

# Accuracy on both sets
train_accuracy = accuracy_score(y_train, train_pred)
test_accuracy = accuracy_score(y_test, test_pred)

# Report
print(f"DBSCAN - Train Accuracy: {train_accuracy:.4f}")
print(f"DBSCAN - Test Accuracy: {test_accuracy:.4f}")
print("Classification Report for Test Set:")
print(classification_report(y_test, test_pred))

NameError: name 'DBSCAN' is not defined

In [None]:
# Confusion matrix of the DBSCAN test predictions, drawn as a heatmap.
class_names = ['Non-Fraud', 'Fraud']
cm = confusion_matrix(y_test, test_pred)
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title('Confusion Matrix for DBSCAN')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# This cell does not call the fitted DBSCAN object. As a continuous stand-in
# score it uses the mean distance from each test row to its 10 nearest
# training rows.
from sklearn.neighbors import NearestNeighbors

# Fit NearestNeighbors on the training set
nbrs = NearestNeighbors(n_neighbors=10).fit(X_train)

# Distances from every test row to its 10 nearest training rows
distances, _ = nbrs.kneighbors(X_test)

# roc_curve / roc_auc_score treat LARGER scores as stronger evidence for the
# positive class (is_fraud = 1). Isolated rows have larger neighbour
# distances, so the mean distance is used as-is; negating it would rank the
# most isolated rows as the least suspicious and flip the curve.
test_scores = np.mean(distances, axis=1)

# Calculate ROC curve and AUC score
fpr, tpr, thresholds = roc_curve(y_test, test_scores)
auc_score = roc_auc_score(y_test, test_scores)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc_score:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.title('ROC Curve for DBSCAN')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# Deep Learning Models

## Autoencoders
Autoencoders learn to reconstruct normal data; anomalies are detected by their high reconstruction error.

In [None]:
input_dim = X_train.shape[1]

# Encoder: input -> 8 -> 4 units
encoder_input = layers.Input(shape=(input_dim,))
encoded = encoder_input
for units in (8, 4):
    encoded = layers.Dense(units, activation="relu")(encoded)

# Decoder: 4 -> 8 -> input_dim units, sigmoid output
decoded = layers.Dense(8, activation="relu")(encoded)
decoded = layers.Dense(input_dim, activation="sigmoid")(decoded)

# Assemble and compile the autoencoder
autoencoder = models.Model(inputs=encoder_input, outputs=decoded)
autoencoder.compile(optimizer="adam", loss="mse")

# Train to reproduce the input; the test set is passed as validation data
autoencoder.fit(
    X_train, X_train,
    epochs=20,
    batch_size=32,
    shuffle=True,
    validation_data=(X_test, X_test),
)

# Per-row reconstruction error (mean squared difference over the features)
train_reconstructed = autoencoder.predict(X_train)
train_mse = ((X_train - train_reconstructed) ** 2).mean(axis=1)

test_reconstructed = autoencoder.predict(X_test)
test_mse = ((X_test - test_reconstructed) ** 2).mean(axis=1)

# Rows above the 95th percentile of the training error are flagged as anomalies
threshold = np.percentile(train_mse, 95)
train_pred = (train_mse > threshold).astype(int)
test_pred = (test_mse > threshold).astype(int)

# Accuracy on both sets
train_accuracy = accuracy_score(y_train, train_pred)
test_accuracy = accuracy_score(y_test, test_pred)

# Report
print(f"Autoencoder - Train Accuracy: {train_accuracy:.4f}")
print(f"Autoencoder - Test Accuracy: {test_accuracy:.4f}")
print("Classification Report for Test Set:")
print(classification_report(y_test, test_pred))

In [None]:
# Confusion matrix of the autoencoder test predictions, drawn as a heatmap.
class_names = ['Non-Fraud', 'Fraud']
cm = confusion_matrix(y_test, test_pred)
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title('Confusion Matrix for Autoencoder')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

## One-Class Neural Networks (OC-NN)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Requires X_train, X_test, y_train and y_test from the earlier cells.
# NOTE(review): this network is fitted on the labels (y_train) with
# binary cross-entropy, i.e. it is trained as a supervised classifier.

input_dim = X_train.shape[1]

# Two ReLU hidden layers (64, then 32 units) followed by one sigmoid output
inputs = Input(shape=(input_dim,))
x = inputs
for units in (64, 32):
    x = Dense(units, activation='relu')(x)
outputs = Dense(1, activation='sigmoid')(x)

oc_nn = Model(inputs, outputs)
oc_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train, holding out the last 20% of the training rows for validation
history = oc_nn.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

# Predicted probabilities on the test set, thresholded at 0.5 into 0/1 labels
fraud_probability = oc_nn.predict(X_test)
y_pred = (fraud_probability > 0.5).astype(int)

# Accuracy
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

In [None]:
# Confusion matrix of the OC-NN test predictions, drawn as a heatmap.
class_names = ['Non-Fraud', 'Fraud']
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title('Confusion Matrix for OC-NN')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

# Per-class precision / recall / F1
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

## Deep Support Vector Data Description (Deep SVDD)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
from tensorflow.keras.layers import Input, Dense, Dropout, Normalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler

# Requires X_train, X_test and y_test from the earlier cells.

# Work on float32 copies. X_train / X_test are deliberately NOT reassigned:
# later cells score models that were trained on the MinMax-scaled features
# with X_test, and overwriting it with standardised values would silently
# change what those models receive.
svdd_train = np.asarray(X_train, dtype='float32')
svdd_test = np.asarray(X_test, dtype='float32')

# Standardised training features remain the regression target, as before.
scaler = StandardScaler()
svdd_target = scaler.fit_transform(svdd_train)

# Define input dimension
input_dim = svdd_train.shape[1]

# Input standardisation now lives inside the model (mean/variance learned from
# the training features), so deep_svdd.predict() accepts the same
# MinMax-scaled features as every other model in the notebook.
normalizer = Normalization(axis=-1)
normalizer.adapt(svdd_train)

# Network: normalise -> 128 ReLU -> dropout -> 64 ReLU -> dropout -> 1 linear
inputs = Input(shape=(input_dim,))
x = normalizer(inputs)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)  # Dropout for regularization
x = Dense(64, activation='relu')(x)
x = Dropout(0.5)(x)  # Dropout for regularization
outputs = Dense(1, activation='linear')(x)

deep_svdd = Model(inputs, outputs)

# Compile with MSE loss and a small learning rate
optimizer = Adam(learning_rate=0.0001)
deep_svdd.compile(optimizer=optimizer, loss='mean_squared_error')

# Train without early stopping.
# NOTE(review): the model has a single output while the target has input_dim
# columns, so the loss relies on broadcasting; this does not look like the
# usual Deep SVDD objective -- confirm the intended loss.
history = deep_svdd.fit(svdd_train, svdd_target, epochs=10, batch_size=32, validation_split=0.2)

# Scores for the test and training sets
test_scores = deep_svdd.predict(svdd_test)
train_scores = deep_svdd.predict(svdd_train)

# Rows scoring above the 90th percentile of the training scores are flagged
threshold = np.percentile(train_scores, 90)
test_pred = (test_scores > threshold).astype(int)

# Calculate accuracy
test_accuracy = accuracy_score(y_test, test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")


In [None]:
# Confusion Matrix
cm = confusion_matrix(y_test, test_pred)
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Non-Fraud', 'Fraud'],
            yticklabels=['Non-Fraud', 'Fraud'])
plt.title('Confusion Matrix for Deep SVDD')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Classification Report
print("Classification Report:")
print(classification_report(y_test, test_pred, target_names=['Non-Fraud', 'Fraud']))

# Visualize to Compare Models

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Model names. Kept in `model_names` rather than `models`, because `models`
# is the tensorflow.keras module the autoencoder cell uses (`models.Model`);
# rebinding it here would break that cell on a re-run.
model_names = ["Isolation Forest", "Local Outlier Factor", "Autoencoder", "OC-NN", "Deep SVDD"]

# NOTE(review): every score below is hard-coded, not read from the variables
# computed earlier in the notebook -- update them after re-running the models.

# Accuracy scores
train_accuracy = [0.9750, 0.9759, 0.9453, 1.00, 1.00]
test_accuracy = [0.9753, 0.9768, 0.9461, 1.00, 0.89]

# Precision scores (Fraud class)
precision = [0.01, 0.02, 0.01, 0.83, 0.01]

# Recall scores (Fraud class)
recall = [0.08, 0.08, 0.11, 0.13, 0.28]

# F1-scores (Fraud class)
f1_scores = [0.02, 0.03, 0.02, 0.22, 0.02]

# Bar chart settings
bar_width = 0.15
index = np.arange(len(model_names))

# One bar group per model, one bar per metric
metric_series = [
    (train_accuracy, 'Train Accuracy', 'blue'),
    (test_accuracy, 'Test Accuracy', 'cyan'),
    (precision, 'Precision (Fraud)', 'green'),
    (recall, 'Recall (Fraud)', 'red'),
    (f1_scores, 'F1-Score (Fraud)', 'purple'),
]

plt.figure(figsize=(12, 6))
for position, (values, label, color) in enumerate(metric_series):
    plt.bar(index + position * bar_width, values, bar_width, label=label, color=color)

plt.xlabel('Models')
plt.ylabel('Score')
plt.title('Performance Comparison of Fraud Detection Models')
plt.xticks(index + 2 * bar_width, model_names, rotation=15)
plt.legend()
plt.ylim(0, 1)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


### ROC Curves (For Each Model)
#### need edit

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# MinMax-scaled test features, rebuilt from the preprocessed test frame and
# aligned to the rows still present in y_test. The Isolation Forest and OC-NN
# were trained on this scaling; X_test may have been re-standardised by the
# Deep SVDD cell by the time this cell runs, so it is not used for them.
X_test_minmax = t_df_preprocessed.drop('is_fraud', axis=1).loc[y_test.index]

# Every score must be oriented so that a HIGHER value means "more likely
# fraud", because roc_curve treats is_fraud = 1 as the positive class.
# Isolation Forest's decision_function is lower for more abnormal rows, so it
# is negated (as in the dedicated Isolation Forest ROC cell).
isolation_forest_predictions_proba = -iso_forest.decision_function(X_test_minmax)
# Sign-flipped negative outlier factor of the last LOF fit (the test set)
lof_predictions_proba = -lof.negative_outlier_factor_
# For the autoencoder, the reconstruction error is the anomaly score
autoencoder_predictions_proba = test_mse
oc_nn_predictions_proba = oc_nn.predict(X_test_minmax).ravel()
# Deep SVDD is scored on X_test exactly as its own cell left it
deep_svdd_predictions_proba = deep_svdd.predict(X_test).ravel()

# NOTE(review): the name `models` rebinds the tensorflow.keras `models` module
# used by the autoencoder cell; it is kept because the precision-recall cell
# below iterates over `models.items()`.
models = {
    "Isolation Forest": isolation_forest_predictions_proba,
    "Local Outlier Factor": lof_predictions_proba,
    "Autoencoder": autoencoder_predictions_proba,
    "OC-NN": oc_nn_predictions_proba,
    "Deep SVDD": deep_svdd_predictions_proba
}

plt.figure(figsize=(8, 6))

for model_name, y_scores in models.items():
    fpr, tpr, _ = roc_curve(y_test, y_scores)  # Use real labels
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"{model_name} (AUC = {roc_auc:.2f})")

plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # Random baseline
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Fraud Detection Models')
plt.legend()
plt.grid(True)
plt.show()


###  Confusion Matrices
#### need edit

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# MinMax-scaled test features, rebuilt from the preprocessed test frame and
# aligned to the rows still present in y_test (X_test may have been
# re-standardised by the Deep SVDD cell by the time this cell runs).
X_test_minmax = t_df_preprocessed.drop('is_fraud', axis=1).loc[y_test.index]

# IsolationForest.predict returns -1 for outliers and 1 for inliers; map it to
# the 0/1 encoding of y_test (1 = fraud) so the two can be compared.
isolation_forest_predictions = np.where(iso_forest.predict(X_test_minmax) == -1, 1, 0)

# TODO: add the remaining models once their test predictions are stored under
# distinct names -- the model cells above all reuse `test_pred`.
models_predictions = {
    "Isolation Forest": isolation_forest_predictions,
}

for model_name, predicted_labels in models_predictions.items():
    cm = confusion_matrix(y_test, predicted_labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Non-Fraud", "Fraud"])
    disp.plot(cmap=plt.cm.Blues, values_format='d')
    disp.ax_.set_title(f"Confusion Matrix - {model_name}")
    plt.show()




### Precision-Recall Curves
#### need edit

In [None]:
from sklearn.metrics import precision_recall_curve

# One precision-recall curve per entry of the `models` score dictionary built
# in the ROC cell, evaluated against the test labels.
fig, ax = plt.subplots(figsize=(8, 6))

for model_name, y_scores in models.items():
    precision, recall, _ = precision_recall_curve(y_test, y_scores)
    ax.plot(recall, precision, label=f"{model_name}")

ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve for Fraud Detection Models')
ax.legend()
ax.grid(True)
plt.show()


# Dashboard

# Deployment