# Stage 3 - Aiming for Perfection With Deep Learning
### For Stage 3 of the project, even though we've achieved near-perfect results with traditional machine learning algorithms, we aim to attain absolute perfection by harnessing neural networks. This will enable us to explore more complex architectures like fully connected networks (MLPs), pushing the boundaries of performance to achieve even higher precision and recall in fraud detection.

# 0. Imports

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.stats import zscore
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, fbeta_score
from sklearn.metrics import make_scorer, fbeta_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import learning_curve
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
import optuna
from tqdm import tqdm
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier


# 1. Useful steps from stage 1

In [3]:
# Load the dataset that was already cleaned during stage 1.
cleaned_data = pd.read_csv('cleaned_data.csv', sep=',')
cleaned_data.head()

# Separate the target column ('fraud') from the feature columns.
y = cleaned_data['fraud']
X = cleaned_data.drop(columns=['fraud'])

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

def remove_outliers_zscore(data, columns, threshold=3):
    """
    Removes rows that are Z-score outliers in any of the specified columns.

    Columns are processed one after another, in the order given: the Z-scores
    of each column are computed on the rows that survived the previous
    columns, and only rows with abs(z) < threshold are kept.

    Parameters:
    - data : pandas DataFrame
        The input DataFrame containing the data. It is never modified.
    - columns : list
        List of numeric columns to check for outliers. An empty list returns
        an unfiltered copy of the input.
    - threshold : float, optional (default: 3)
        The Z-score threshold to identify outliers. Values whose absolute
        Z-score is greater than or equal to this threshold are considered
        outliers.

    Returns:
    - DataFrame : A new DataFrame with outliers removed. The remaining rows
      keep their original index labels and all of their original columns.

    Notes:
    - A row whose Z-score is NaN fails the `< threshold` comparison and is
      therefore removed as well.
      NOTE(review): zscore presumably yields NaN for a column containing
      missing values or having zero variance, which would drop every row -
      confirm the input is cleaned beforehand.
    """
    # Work on a copy so the caller's DataFrame is never altered.
    df = data.copy()

    for col in columns:
        # Filter with a boolean mask rather than a temporary 'zscore' column:
        # this leaves a real column of that name intact and also works when
        # `columns` is empty (no helper column has to be dropped afterwards).
        # np.asarray makes the mask positional, so no index alignment occurs.
        keep = np.abs(np.asarray(zscore(df[col]))) < threshold
        df = df.loc[keep]

    return df

# Only the continuous features are screened for outliers; the remaining
# columns are categorical, and outliers are not meaningful for them.
columns_to_check = [
    'distance_from_home',
    'distance_from_last_transaction',
    'ratio_to_median_purchase_price',
]
X_train_clean = remove_outliers_zscore(X_train, columns_to_check)
# Keep only the labels whose rows survived the outlier removal.
y_train_clean = y_train.loc[X_train_clean.index]


# Learn the scaling statistics from the cleaned training data only, then
# apply them to both sets. The test data is transformed but never used for
# fitting, which avoids data leakage.
scaler = StandardScaler().fit(X_train_clean)
X_train_scaled = scaler.transform(X_train_clean)
X_test_scaled = scaler.transform(X_test)


# F-beta scorer with beta=2, i.e. recall is weighted more heavily than
# precision.
f2_scorer = make_scorer(fbeta_score, beta=2)

We do not use SMOTE with neural networks because it can lead to overfitting: the synthetic examples it introduces add noise and redundancy, which hurts the model's ability to generalize. Neural network models already have a strong learning capacity.