## Importing Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
import xgboost as xgb

### Ignore all FutureWarning messages globally

In [2]:
warnings.filterwarnings("ignore", category=FutureWarning)

## Data Cleaning and Preprocessing

### Load dataset

In [3]:
# Load the dataset into a pandas DataFrame  
df = pd.read_csv("fraudTrain.csv")

### Preprocessing

In [4]:
# Convert date-time columns to datetime format
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'], errors='coerce')
df['dob'] = pd.to_datetime(df['dob'], errors='coerce')

# Clean 'gender' column and encode as 0 for female, 1 for male
df['gender'] = df['gender'].str.lower().apply(lambda x: 1 if x.startswith('m') else (0 if x.startswith('f') else None))

# Clean columns with special characters and replace with NaN
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].apply(lambda x: re.sub(r'[^A-Za-z0-9]', '', str(x)))
        df.loc[df[col].str.len() <= 1, col] = np.nan

# Remove columns starting with 'Unnamed'
unnamed_columns = [col for col in df.columns if col.startswith('Unnamed')]
df = df.drop(columns=unnamed_columns)

# Convert all string columns to lowercase
for col in df.select_dtypes(include='object'):
    df[col] = df[col].str.lower()

# Identify and delete duplicate records
duplicate = df.duplicated().sum()
if duplicate > 0:
    df = df.drop_duplicates()

# Handle missing values
for col in df.columns:
    if df[col].dtype == 'object':
        mode_val = df[col].mode()[0]
        df[col].fillna(mode_val, inplace=True)
    elif df[col].dtype in ['int64', 'float64']:
        mean_val = df[col].mean()
        df[col].fillna(mean_val, inplace=True)

# Verify if there are any remaining missing values
print("Missing values after handling:\n", df.isnull().sum())

Missing values after handling:
 trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64


In [5]:
df.describe()

Unnamed: 0,cc_num,amt,gender,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0
mean,4.17192e+17,70.35104,0.4525513,48800.67,38.53762,-90.22634,88824.44,1349244000.0,38.53734,-90.22646,0.005788652
std,1.308806e+18,160.316,0.4977437,26893.22,5.075808,13.75908,301956.4,12841280.0,5.109788,13.77109,0.07586269
min,60416210000.0,1.0,0.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.02779,-166.6712,0.0
25%,180042900000000.0,9.65,0.0,26237.0,34.6205,-96.798,743.0,1338751000.0,34.73357,-96.89728,0.0
50%,3521417000000000.0,47.52,0.0,48174.0,39.3543,-87.4769,2456.0,1349250000.0,39.36568,-87.43839,0.0
75%,4642255000000000.0,83.14,1.0,72042.0,41.9404,-80.158,20328.0,1359385000.0,41.95716,-80.2368,0.0
max,4.992346e+18,28948.9,1.0,99783.0,66.6933,-67.9503,2906700.0,1371817000.0,67.51027,-66.9509,1.0


## Feature Engineering

In [6]:
# Feature engineering
df['trans_Date'] = df.trans_date_trans_time.dt.date
df['trans_Time'] = df.trans_date_trans_time.dt.time
df['trans_Date'] = pd.to_datetime(df['trans_Date'], format='%Y-%m-%d')
df['age'] = df['dob'].apply(lambda x: (pd.Timestamp.now().year - x.year))

# Drop unnecessary columns
df = df.drop(['trans_date_trans_time', 'dob', 'first', 'last', 'unix_time', 'cc_num', 'trans_num'], axis=1)

# Convert categorical columns to numeric using Label Encoding
label_encoder = LabelEncoder()
for col in ['state', 'job', 'merchant', 'city', 'street', 'category']:
    df[col] = label_encoder.fit_transform(df[col])

# Convert date columns to numeric (timestamp)
df['trans_Date'] = df['trans_Date'].apply(lambda x: x.timestamp() if pd.notnull(x) else np.nan)
df['trans_Time'] = df['trans_Time'].apply(lambda x: x.hour*3600 + x.minute*60 + x.second if pd.notnull(x) else np.nan)

### Train-Test Split

In [7]:
# Split the data into features and target variable
X = df.drop('is_fraud', axis=1)
y = df['is_fraud']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=100)

# %% [markdown]
# ## Handling Imbalanced Data with SMOTE

####PS NEED TO DO NORMALIZATION HERE #####


# %%
# Apply SMOTE to handle class imbalance
smt = SMOTE(random_state=45, k_neighbors=5)
X_train_resampled, y_train_resampled = smt.fit_resample(X_train, y_train)

## Applying Model

### XGBoost

In [8]:
# Train XGBoost model
xgb_model = xgb.XGBClassifier(eval_metric='mlogloss', random_state=45)
xgb_model.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the XGBoost model
print("XGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))
print("XGBoost Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))

XGBoost Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99    386751
           1       0.34      0.91      0.50      2252

    accuracy                           0.99    389003
   macro avg       0.67      0.95      0.74    389003
weighted avg       1.00      0.99      0.99    389003

XGBoost Confusion Matrix:
 [[382749   4002]
 [   193   2059]]


## Hyperparameter tuning


In [9]:

from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Define the parameter distribution
param_dist = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2]
}

# Initialize the XGBoost model
xgb_model = xgb.XGBClassifier(eval_metric='mlogloss', random_state=45)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist, 
                                   n_iter=50, cv=3, n_jobs=-1, verbose=2, scoring='f1', random_state=45)

# Fit the random search to the data
random_search.fit(X_train_resampled, y_train_resampled)

# Get the best parameters and the best score
best_params = random_search.best_params_
best_score = random_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best F1 Score: {best_score}")

# Train the XGBoost model with the best parameters
xgb_model_best = xgb.XGBClassifier(**best_params, eval_metric='mlogloss', random_state=45)
xgb_model_best.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred_xgb_best = xgb_model_best.predict(X_test)

# Evaluate the optimized XGBoost model
print("Optimized XGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb_best))
print("Optimized XGBoost Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb_best))

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Parameters: {'subsample': 0.8, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 7, 'learning_rate': 0.2, 'gamma': 0.2, 'colsample_bytree': 1.0}
Best F1 Score: 0.9993686614947905
Optimized XGBoost Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    386751
           1       0.86      0.91      0.88      2252

    accuracy                           1.00    389003
   macro avg       0.93      0.95      0.94    389003
weighted avg       1.00      1.00      1.00    389003

Optimized XGBoost Confusion Matrix:
 [[386431    320]
 [   211   2041]]


In [10]:
# Evaluate the XGBoost model after Hyperparameter tuning
print("XGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb_best))
print("XGBoost Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb_best))

XGBoost Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    386751
           1       0.86      0.91      0.88      2252

    accuracy                           1.00    389003
   macro avg       0.93      0.95      0.94    389003
weighted avg       1.00      1.00      1.00    389003

XGBoost Confusion Matrix:
 [[386431    320]
 [   211   2041]]


###  Evaluate the optimized XGBoost model

In [11]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report, confusion_matrix

# Evaluate the optimized XGBoost model
f1 = f1_score(y_test, y_pred_xgb_best)
accuracy = accuracy_score(y_test, y_pred_xgb_best)
precision = precision_score(y_test, y_pred_xgb_best)
recall = recall_score(y_test, y_pred_xgb_best)

print("Optimized XGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb_best))
print("Optimized XGBoost Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb_best))

print( "Overall result")
print(f"F1 Score: {f1}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Optimized XGBoost Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    386751
           1       0.86      0.91      0.88      2252

    accuracy                           1.00    389003
   macro avg       0.93      0.95      0.94    389003
weighted avg       1.00      1.00      1.00    389003

Optimized XGBoost Confusion Matrix:
 [[386431    320]
 [   211   2041]]
Overall result
F1 Score: 0.8848905267721656
Accuracy: 0.9986349719667972
Precision: 0.8644642100804744
Recall: 0.9063055062166963


### Test Data Evaluation

In [23]:
# Load the test dataset into a pandas DataFrame  
df_test = pd.read_csv("fraudTest.csv")

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
import xgboost as xgb

# %% [markdown]
# ### Ignore all FutureWarning messages globally

# %%
warnings.filterwarnings("ignore", category=FutureWarning)

# %% [markdown]
# ## Data Cleaning and Preprocessing

# %% [markdown]
# ### Load dataset

# %%
# Load the dataset into a pandas DataFrame  
df_test = pd.read_csv("fraudTest.csv")

# %% [markdown]
# ### Preprocessing

# %%
# Convert date-time columns to datetime format
df_test['trans_date_trans_time'] = pd.to_datetime(df_test['trans_date_trans_time'], errors='coerce')
df_test['dob'] = pd.to_datetime(df_test['dob'], errors='coerce')

# Clean 'gender' column and encode as 0 for female, 1 for male
df_test['gender'] = df_test['gender'].str.lower().apply(lambda x: 1 if x.startswith('m') else (0 if x.startswith('f') else None))

# Clean columns with special characters and replace with NaN
for col in df_test.columns:
    if df_test[col].dtype == 'object':
        df_test[col] = df_test[col].apply(lambda x: re.sub(r'[^A-Za-z0-9]', '', str(x)))
        df_test.loc[df_test[col].str.len() <= 1, col] = np.nan

# Remove columns starting with 'Unnamed'
unnamed_columns = [col for col in df_test.columns if col.startswith('Unnamed')]
df_test = df_test.drop(columns=unnamed_columns)

# Convert all string columns to lowercase
for col in df_test.select_dtypes(include='object'):
    df_test[col] = df_test[col].str.lower()

# Identify and delete duplicate records
duplicate = df_test.duplicated().sum()
if duplicate > 0:
    df_test = df_test.drop_duplicates()

# Handle missing values
for col in df_test.columns:
    if df_test[col].dtype == 'object':
        mode_val = df_test[col].mode()[0]
        df_test[col].fillna(mode_val, inplace=True)
    elif df_test[col].dtype in ['int64', 'float64']:
        mean_val = df_test[col].mean()
        df_test[col].fillna(mean_val, inplace=True)

# Verify if there are any remaining missing values
print("Missing values after handling:\n", df_test.isnull().sum())

# %%
df_test.describe()

# %% [markdown]
# ## Feature Engineering

# %%
# Feature engineering
df_test['trans_Date'] = df_test.trans_date_trans_time.dt.date
df_test['trans_Time'] = df_test.trans_date_trans_time.dt.time
df_test['trans_Date'] = pd.to_datetime(df_test['trans_Date'], format='%Y-%m-%d')
df_test['age'] = df_test['dob'].apply(lambda x: (pd.Timestamp.now().year - x.year))

# Drop unnecessary columns
df_test = df_test.drop(['trans_date_trans_time', 'dob', 'first', 'last', 'unix_time', 'cc_num', 'trans_num'], axis=1)

# Convert categorical columns to numeric using Label Encoding
label_encoder = LabelEncoder()
for col in ['state', 'job', 'merchant', 'city', 'street', 'category']:
    df_test[col] = label_encoder.fit_transform(df_test[col])

# Convert date columns to numeric (timestamp)
df_test['trans_Date'] = df_test['trans_Date'].apply(lambda x: x.timestamp() if pd.notnull(x) else np.nan)
df_test['trans_Time'] = df_test['trans_Time'].apply(lambda x: x.hour*3600 + x.minute*60 + x.second if pd.notnull(x) else np.nan)


Missing values after handling:
 trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64


In [26]:
df_test

Unnamed: 0,merchant,category,amt,gender,street,city,state,zip,lat,long,city_pop,job,merch_lat,merch_long,is_fraud,trans_Date,trans_Time,age
0,319,10,2.86,1,342,157,39,29209,33.9659,-80.9355,333497,275,33.986391,-81.200714,0,1.592698e+09,44040,56
1,591,10,29.84,0,354,16,43,84002,40.3207,-110.4360,302,392,39.450498,-109.960431,0,1.592698e+09,44040,34
2,611,5,41.28,0,864,61,33,11710,40.6729,-73.5365,34496,259,40.495810,-74.196111,0,1.592698e+09,44040,54
3,223,9,60.05,1,318,764,8,32780,28.5697,-80.8191,54767,407,28.812398,-80.883061,0,1.592698e+09,44100,37
4,291,13,3.19,1,548,247,21,49632,44.2529,-85.0170,1126,196,44.959148,-85.884734,0,1.592698e+09,44100,69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555714,506,5,43.77,1,531,443,23,63453,40.4931,-91.8912,519,460,39.946837,-91.333331,0,1.609373e+09,86340,58
555715,264,7,111.84,1,541,401,42,77566,29.0393,-95.4401,28739,198,29.661049,-96.186633,0,1.609373e+09,86340,25
555716,496,7,86.88,0,128,104,46,99323,46.1966,-118.9017,3684,292,46.658340,-119.715054,0,1.609373e+09,86340,43
555717,76,13,7.99,1,662,476,12,83643,44.6255,-116.4493,129,58,44.470525,-117.080888,0,1.609373e+09,86340,59


In [27]:
# Split the data into features and target variable
X_final = df_test.drop('is_fraud', axis=1)
y_final = df_test['is_fraud']


In [28]:
X_final

Unnamed: 0,merchant,category,amt,gender,street,city,state,zip,lat,long,city_pop,job,merch_lat,merch_long,trans_Date,trans_Time,age
0,319,10,2.86,1,342,157,39,29209,33.9659,-80.9355,333497,275,33.986391,-81.200714,1.592698e+09,44040,56
1,591,10,29.84,0,354,16,43,84002,40.3207,-110.4360,302,392,39.450498,-109.960431,1.592698e+09,44040,34
2,611,5,41.28,0,864,61,33,11710,40.6729,-73.5365,34496,259,40.495810,-74.196111,1.592698e+09,44040,54
3,223,9,60.05,1,318,764,8,32780,28.5697,-80.8191,54767,407,28.812398,-80.883061,1.592698e+09,44100,37
4,291,13,3.19,1,548,247,21,49632,44.2529,-85.0170,1126,196,44.959148,-85.884734,1.592698e+09,44100,69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555714,506,5,43.77,1,531,443,23,63453,40.4931,-91.8912,519,460,39.946837,-91.333331,1.609373e+09,86340,58
555715,264,7,111.84,1,541,401,42,77566,29.0393,-95.4401,28739,198,29.661049,-96.186633,1.609373e+09,86340,25
555716,496,7,86.88,0,128,104,46,99323,46.1966,-118.9017,3684,292,46.658340,-119.715054,1.609373e+09,86340,43
555717,76,13,7.99,1,662,476,12,83643,44.6255,-116.4493,129,58,44.470525,-117.080888,1.609373e+09,86340,59


In [29]:
y_final

0         0
1         0
2         0
3         0
4         0
         ..
555714    0
555715    0
555716    0
555717    0
555718    0
Name: is_fraud, Length: 555719, dtype: int64

In [30]:
# Predict on the test set
y_pred_xgb_best_final = xgb_model_best.predict(X_final)

# Evaluate the optimized XGBoost model
print("Optimized XGBoost Classification Report:\n", classification_report(y_final, y_pred_xgb_best_final))
print("Optimized XGBoost Confusion Matrix:\n", confusion_matrix(y_final, y_pred_xgb_best_final))

Optimized XGBoost Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00    553574
           1       0.28      0.65      0.39      2145

    accuracy                           0.99    555719
   macro avg       0.64      0.82      0.69    555719
weighted avg       1.00      0.99      0.99    555719

Optimized XGBoost Confusion Matrix:
 [[549944   3630]
 [   746   1399]]
