### Libraries

In [2]:
import pandas as pd
import numpy as np
import os
import pickle
import joblib

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, classification_report

In [4]:
TRAIN_DATASET_PATH = "Dataset/fraudTrain.csv"
TEST_DATASET_PATH = "Dataset/fraudTest.csv"

In [5]:
def loadData(path):
    if(os.path.exists(path)):
        return pd.read_csv(path)

### Training data preparation

In [6]:
Training_Data = loadData(TRAIN_DATASET_PATH)
Training_Data = Training_Data.drop(columns=["Unnamed: 0"])

In [7]:
def getColumns(dataframe:pd.DataFrame):
    columns = list(dataframe.columns)
    if "is_fraud" in columns:
        columns.remove("is_fraud")
    categorical_columns = set()
    for col in columns:
        if dataframe[col].dtype == np.dtype('object'):
            categorical_columns.add(col)
    return (list(set(columns).difference(categorical_columns)) , list(categorical_columns))

In [8]:
class customTransformer:

    def __init__(self , data:pd.DataFrame):
        self.data = data
        self.Numerical_Columns = None
        self.Categorical_Columns = None
        self.Target = "is_fraud"

    def createNumericalFeatures(self):
        self.data['trans_date_trans_time'] = pd.to_datetime(self.data['unix_time'], unit='s')
        self.data['dob'] = pd.to_datetime(self.data['dob'])
        # Time-based features
        self.data['hour'] = self.data['trans_date_trans_time'].dt.hour
        self.data['age'] = self.data['trans_date_trans_time'].dt.year - self.data['dob'].dt.year
        self.data['is_night_transaction'] = self.data['hour'].apply(lambda x: 1 if (0 <= x <= 5 or x >= 20) else 0)
        # 1. Transaction Amount Ranges (Bins 200-400, 600-1200)
        self.data['amt_200_400'] = self.data['amt'].apply(lambda x: 1 if 200 <= x <= 400 else 0)
        self.data['amt_600_1200'] = self.data['amt'].apply(lambda x: 1 if 600 <= x <= 1200 else 0)
    
    def removeNonRelevantFeatures(self):
        columns = set(self.data.columns)
        relevant_cols =set(["is_fraud","amt_600_1200","amt","amt_200_400",'is_night_transaction',"hour","age",'merchant', 'city', 'job', 'state' ,'category'])
        cols_to_drop = columns.difference(relevant_cols)
        self.data = self.data.drop(columns=cols_to_drop)
        self.Numerical_Columns , self.Categorical_Columns = getColumns(self.data)

    def transform(self):
        self.createNumericalFeatures()
        self.removeNonRelevantFeatures()

In [9]:
Numerical_Columns , Categorical_Columns =  getColumns(Training_Data)

In [10]:
data_preprocessing = customTransformer(Training_Data)
data_preprocessing.transform()

In [11]:
print("Numerical features : " , data_preprocessing.Numerical_Columns)
print("Categorical features : " , data_preprocessing.Categorical_Columns)
print("Target class : " , data_preprocessing.Target)

Numerical features :  ['amt_600_1200', 'hour', 'amt', 'age', 'amt_200_400', 'is_night_transaction']
Categorical features :  ['merchant', 'state', 'city', 'category', 'job']
Target class :  is_fraud


In [12]:
with open("dataset.pkl" , 'wb') as f:
    pickle.dump(data_preprocessing , f)

### Metrics

In [13]:
def metrics(y_test , y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")

    # Precision
    precision = precision_score(y_test, y_pred)
    print(f"Precision: {precision}")

    # Recall
    recall = recall_score(y_test, y_pred)
    print(f"Recall: {recall}")

    # F1 Score
    f1 = f1_score(y_test, y_pred)
    print(f"F1 Score: {f1}")

    # Confusion Matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(conf_matrix)

    # ROC-AUC Score
    roc_auc = roc_auc_score(y_test, y_pred)
    print(f"ROC-AUC Score: {roc_auc}")

    # Classification Report
    report = classification_report(y_test, y_pred)
    print("Classification Report:")
    print(report)

    return (precision, recall, f1)

### Custom Pipeline

In [None]:
with open("dataset.pkl" , 'rb') as f:
    data_preprocessing = pickle.load(f)

In [14]:
data_preprocessing.data.is_fraud.value_counts()

0    1289169
1       7506
Name: is_fraud, dtype: int64

In [15]:
data_preprocessing.data.head()

Unnamed: 0,merchant,category,amt,city,state,job,is_fraud,hour,age,is_night_transaction,amt_200_400,amt_600_1200
0,"fraud_Rippin, Kub and Mann",misc_net,4.97,Moravian Falls,NC,"Psychologist, counselling",0,0,24,1,0,0
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Orient,WA,Special educational needs teacher,0,0,34,1,0,0
2,fraud_Lind-Buckridge,entertainment,220.11,Malad City,ID,Nature conservation officer,0,0,50,1,1,0
3,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Boulder,MT,Patent attorney,0,0,45,1,0,0
4,fraud_Keeling-Crist,misc_pos,41.96,Doe Hill,VA,Dance movement psychotherapist,0,0,26,1,0,0


In [16]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
from category_encoders import BinaryEncoder
def createPreprocessingPipeline():
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('binary_encoder', BinaryEncoder(), data_preprocessing.Categorical_Columns), 
            ('numeric', numeric_transformer, data_preprocessing.Numerical_Columns)
        ],
        remainder='passthrough'  # Pass through the remaining numeric columns
    )
    
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
    ])

# Create preprocessing pipeline
preprocessing_pipeline = createPreprocessingPipeline()

In [17]:
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline as ImbPipeline

def createSMOTEPipeline():
    return ImbPipeline(steps=[
        ('adasyn', ADASYN(sampling_strategy='minority', random_state=42, n_neighbors=5))
    ])

# Create ADASYN pipeline fot Training Data only
smote_pipeline = createSMOTEPipeline()

In [18]:
X_train, X_val, y_train, y_val = train_test_split(data_preprocessing.data.drop('is_fraud', axis=1), data_preprocessing.data['is_fraud'], test_size=0.2, random_state=42)

In [19]:
X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train, y_train)

In [20]:
X_train_resampled, y_train_resampled = smote_pipeline.fit_resample(X_train_preprocessed, y_train)
X_test_preprocessed = preprocessing_pipeline.transform(X_val)

In [21]:
X_train_resampled[20]

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  1.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  1.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  1.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  1.        ,  1.        , -0.08306273,
        1.055153  ,  0.06074485, -0.00138173, -0.17917627,  1.22179084])

In [22]:
preprocessing_pipeline.get_feature_names_out()

array(['binary_encoder__merchant_0', 'binary_encoder__merchant_1',
       'binary_encoder__merchant_2', 'binary_encoder__merchant_3',
       'binary_encoder__merchant_4', 'binary_encoder__merchant_5',
       'binary_encoder__merchant_6', 'binary_encoder__merchant_7',
       'binary_encoder__merchant_8', 'binary_encoder__merchant_9',
       'binary_encoder__state_0', 'binary_encoder__state_1',
       'binary_encoder__state_2', 'binary_encoder__state_3',
       'binary_encoder__state_4', 'binary_encoder__state_5',
       'binary_encoder__city_0', 'binary_encoder__city_1',
       'binary_encoder__city_2', 'binary_encoder__city_3',
       'binary_encoder__city_4', 'binary_encoder__city_5',
       'binary_encoder__city_6', 'binary_encoder__city_7',
       'binary_encoder__city_8', 'binary_encoder__city_9',
       'binary_encoder__category_0', 'binary_encoder__category_1',
       'binary_encoder__category_2', 'binary_encoder__category_3',
       'binary_encoder__job_0', 'binary_encoder__job_

### Training different models

In [23]:
validation_results = dict()
testing_results = dict()

#### Random Forest Classifier

In [23]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_resampled, y_train_resampled)

y_pred_rf = rf_classifier.predict(X_test_preprocessed)
precision , recall , f1 = metrics(y_val , y_pred_rf)
validation_results["RandomForest"] = [precision, recall, f1]
joblib.dump(rf_classifier, 'Dataset/Models/rf_classifier.joblib')

Accuracy: 0.9973701968496347
Precision: 0.9028846153846154
Recall: 0.6177631578947368
F1 Score: 0.73359375
Confusion Matrix:
[[257714    101]
 [   581    939]]
ROC-AUC Score: 0.8086857020589019
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257815
           1       0.90      0.62      0.73      1520

    accuracy                           1.00    259335
   macro avg       0.95      0.81      0.87    259335
weighted avg       1.00      1.00      1.00    259335



['Dataset/Models/rf_classifier.joblib']

#### XGB 

In [48]:
from xgboost import XGBClassifier

xgb_classifier = XGBClassifier( use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_classifier.fit(X_train_resampled, y_train_resampled)

y_pred_xgb = xgb_classifier.predict(X_test_preprocessed)
precision , recall , f1  = metrics(y_val , y_pred_xgb)
validation_results["XGBoost"] = [precision, recall, f1]

Accuracy: 0.9984614494765458
Precision: 0.9006433166547534
Recall: 0.8289473684210527
F1 Score: 0.8633093525179856
Confusion Matrix:
[[257676    139]
 [   260   1260]]
ROC-AUC Score: 0.9142041110669932
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257815
           1       0.90      0.83      0.86      1520

    accuracy                           1.00    259335
   macro avg       0.95      0.91      0.93    259335
weighted avg       1.00      1.00      1.00    259335



Grid search on XGB 

In [49]:
from sklearn.model_selection import GridSearchCV
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
}

grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, scoring='accuracy', cv=2, verbose=1, n_jobs=-1)
grid_search.fit(X_train_resampled, y_train_resampled)

Fitting 2 folds for each of 9 candidates, totalling 18 fits


In [50]:
print("Best Parameters:", grid_search.best_params_)
best_xgb_clf = grid_search.best_estimator_
y_pred = best_xgb_clf.predict(X_test_preprocessed)
precision, recall, f1 = metrics(y_val, y_pred)
validation_results["XGBoost_Best"] = [precision, recall, f1]

Best Parameters: {'learning_rate': 0.2, 'n_estimators': 200}
Accuracy: 0.9985231457381379
Precision: 0.9034776437189496
Recall: 0.8375
F1 Score: 0.8692386480027313
Confusion Matrix:
[[257679    136]
 [   247   1273]]
ROC-AUC Score: 0.9184862449818668
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257815
           1       0.90      0.84      0.87      1520

    accuracy                           1.00    259335
   macro avg       0.95      0.92      0.93    259335
weighted avg       1.00      1.00      1.00    259335



#### Decision Tree

In [30]:
from sklearn.tree import DecisionTreeClassifier
from collections import Counter

weights = dict(Counter(Training_Data["is_fraud"]))

total_instances = weights[0]+weights[1]
weights[0] = (total_instances)/weights[0]
weights[1] = (total_instances)/weights[1]

dt_classifier = DecisionTreeClassifier(class_weight=weights , random_state=42)
dt_classifier.fit(X_train_resampled, y_train_resampled)
y_pred_dt = dt_classifier.predict(X_test_preprocessed)

precision , recall , f1  = metrics(y_val , y_pred_dt)
validation_results["DecisionTree"] = [precision, recall, f1]

Accuracy: 0.9970038752964313
Precision: 0.7217910447761194
Recall: 0.7953947368421053
F1 Score: 0.7568075117370892
Confusion Matrix:
[[257349    466]
 [   311   1209]]
ROC-AUC Score: 0.8967936196089199
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257815
           1       0.72      0.80      0.76      1520

    accuracy                           1.00    259335
   macro avg       0.86      0.90      0.88    259335
weighted avg       1.00      1.00      1.00    259335



Grid Search on decision tree

In [31]:
from sklearn.model_selection import GridSearchCV

dt_C = DecisionTreeClassifier(class_weight=weights , random_state=42)

# Set up parameter grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
}

grid_search = GridSearchCV(estimator=dt_C, param_grid=param_grid, cv=2, scoring='accuracy', verbose=1, n_jobs=-1)

grid_search.fit(X_train_resampled, y_train_resampled)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

Fitting 2 folds for each of 12 candidates, totalling 24 fits
Best Parameters: {'criterion': 'entropy', 'max_depth': 20}
Best Score: 0.9780152659914314


In [32]:
best_dt_clf = grid_search.best_estimator_
y_pred_dt_best = best_dt_clf.predict(X_test_preprocessed)
precision , recall , f1 =  metrics(y_val , y_pred_dt_best)
validation_results["DecisionTree_Best"] = [precision , recall , f1]

Accuracy: 0.9940925829525518
Precision: 0.4976190476190476
Recall: 0.825
F1 Score: 0.6207920792079208
Confusion Matrix:
[[256549   1266]
 [   266   1254]]
ROC-AUC Score: 0.9100447510812018
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257815
           1       0.50      0.82      0.62      1520

    accuracy                           0.99    259335
   macro avg       0.75      0.91      0.81    259335
weighted avg       1.00      0.99      0.99    259335



### Testing Data Preparation

In [33]:
Testing_Data = loadData(TEST_DATASET_PATH)
Testing_Data = Testing_Data.drop(columns=["Unnamed: 0"])

In [34]:
test_data = customTransformer(Testing_Data)
test_data.transform()

In [35]:
testing_data = test_data.data.drop("is_fraud" , axis=1)
test_preprocessed = preprocessing_pipeline.transform(testing_data)
test_solution = test_data.data["is_fraud"]

### Evaluation of testing data

In [51]:
def evaluateTestData(model , model_name):
    test_prediction = model.predict(test_preprocessed)
    precision, recall , f1 = metrics(test_solution , test_prediction)
    testing_results[model_name] = [precision, recall , f1]

In [52]:
evaluateTestData(rf_classifier , "RandomForest")

Accuracy: 0.996102346689604
Precision: 0.043478260869565216
Recall: 0.0004662004662004662
F1 Score: 0.0009225092250922509
Confusion Matrix:
[[553552     22]
 [  2144      1]]
ROC-AUC Score: 0.5002132293576617
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.04      0.00      0.00      2145

    accuracy                           1.00    555719
   macro avg       0.52      0.50      0.50    555719
weighted avg       0.99      1.00      0.99    555719



In [53]:
evaluateTestData(xgb_classifier , "XGBoost")

Accuracy: 0.9985226346408886
Precision: 0.843717549325026
Recall: 0.7575757575757576
F1 Score: 0.7983296487349546
Confusion Matrix:
[[553273    301]
 [   520   1625]]
ROC-AUC Score: 0.8785160090830154
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.84      0.76      0.80      2145

    accuracy                           1.00    555719
   macro avg       0.92      0.88      0.90    555719
weighted avg       1.00      1.00      1.00    555719



In [54]:
evaluateTestData(best_xgb_clf , "XGBoost_Best")

Accuracy: 0.9985496266998249
Precision: 0.8544203282159873
Recall: 0.7524475524475525
F1 Score: 0.8001983143282102
Confusion Matrix:
[[553299    275]
 [   531   1614]]
ROC-AUC Score: 0.8759753902807949
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.85      0.75      0.80      2145

    accuracy                           1.00    555719
   macro avg       0.93      0.88      0.90    555719
weighted avg       1.00      1.00      1.00    555719



In [55]:
evaluateTestData(dt_classifier, "DecisionTree")

Accuracy: 0.9972252163413524
Precision: 0.6187475384009452
Recall: 0.7324009324009324
F1 Score: 0.670794192997438
Confusion Matrix:
[[552606    968]
 [   574   1571]]
ROC-AUC Score: 0.8653261476811717
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.62      0.73      0.67      2145

    accuracy                           1.00    555719
   macro avg       0.81      0.87      0.83    555719
weighted avg       1.00      1.00      1.00    555719



In [56]:
evaluateTestData(best_dt_clf , "DecisionTree_Best")

Accuracy: 0.9941949078581082
Precision: 0.37873008750280457
Recall: 0.786946386946387
F1 Score: 0.5113601938806422
Confusion Matrix:
[[550805   2769]
 [   457   1688]]
ROC-AUC Score: 0.8909721728327732
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00    553574
           1       0.38      0.79      0.51      2145

    accuracy                           0.99    555719
   macro avg       0.69      0.89      0.75    555719
weighted avg       1.00      0.99      1.00    555719



### Comparing all models

In [61]:
val_results = pd.DataFrame.from_dict(validation_results , orient='index')
test_results = pd.DataFrame.from_dict(testing_results , orient='index')

In [62]:
columns = ['Precision', 'Recall', 'F1']
val_results.index = list(validation_results.keys())
val_results.columns = columns
test_results.index = list(testing_results.keys())
test_results.columns = columns

In [63]:
val_results

Unnamed: 0,Precision,Recall,F1
RandomForest,0.902885,0.617763,0.733594
XGBoost,0.900643,0.828947,0.863309
XGBoost_Best,0.903478,0.8375,0.869239
DecisionTree,0.721791,0.795395,0.756808
DecisionTree_Best,0.497619,0.825,0.620792


In [64]:
test_results

Unnamed: 0,Precision,Recall,F1
XGBoost,0.843718,0.757576,0.79833
XGBoost_Best,0.85442,0.752448,0.800198
RandomForest,0.043478,0.000466,0.000923
DecisionTree,0.618748,0.732401,0.670794
DecisionTree_Best,0.37873,0.786946,0.51136
