In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report,roc_auc_score,confusion_matrix
from sklearn.preprocessing import StandardScaler
import joblib
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
X_train=pd.read_csv('D:\Projects\DATA_ANALYST_PROJECTS\Financial_Fraud_Detection\data\X_train.csv')
X_test=pd.read_csv('D:\Projects\DATA_ANALYST_PROJECTS\Financial_Fraud_Detection\data\X_test.csv')
y_train=pd.read_csv('D:\Projects\DATA_ANALYST_PROJECTS\Financial_Fraud_Detection\data\y_train.csv')
y_test=pd.read_csv('D:\Projects\DATA_ANALYST_PROJECTS\Financial_Fraud_Detection\data\y_test.csv')

In [3]:
# Random Forest Hyperparameter Tuning
rf_params={
    'n_estimators':[100,200],
    'max_depth':[10,20],
    'min_samples_split':[2,5]
}

In [4]:
rf_grid=GridSearchCV(RandomForestClassifier(n_estimators=100,random_state=42),rf_params,cv=5,scoring='roc_auc',n_jobs=-1)
rf_grid.fit(X_train,y_train)

print(f'Best Random Forest Parameter :{rf_grid.best_params_}')
print(f'Best Random Forest Score :{rf_grid.best_score_}')

Best Random Forest Parameter :{'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
Best Random Forest Score :0.9999960680947065


In [5]:
xgb_params={
    'learning_rate':[0.01,0.1],
    'n_estimators':[100,200],
    'max_depth':[3,6]
}

In [6]:
xgb_grid=GridSearchCV(XGBClassifier(use_label_encoder=False,eval_metric="logloss"),xgb_params,cv=5,scoring='roc_auc',n_jobs=-1)
xgb_grid.fit(X_train,y_train)

print(f'Best XGBoost Parameter :{xgb_grid.best_params_}')
print(f'Best XGBoost Score :{xgb_grid.best_score_}')

Best XGBoost Parameter :{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200}
Best XGBoost Score :0.9999787749222048


In [7]:
lgbm_params={
    'learning_rate':[0.05,0.1],
    'n_estimators':[100,200],
    'max_depth':[2,6],
    'num_leaves': [3, 31, 63]
}

In [8]:
lgbm_grid=GridSearchCV(LGBMClassifier(),lgbm_params,cv=5,scoring='roc_auc',n_jobs=-1)
lgbm_grid.fit(X_train,y_train)

print(f'Best LightGBM Parameters : {lgbm_grid.best_params_}')
print(f'Best LightGBM Score : {lgbm_grid.best_score_}')

[LightGBM] [Info] Number of positive: 227452, number of negative: 227452
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045563 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 454904, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Best LightGBM Parameters : {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'num_leaves': 63}
Best LightGBM Score : 0.9999698378733395


In [9]:
# Since the best model is Random Forest as per score we will consider that model
best_model=rf_grid.best_estimator_
y_pred=best_model.predict(X_test)

In [10]:
# Classification Report
print(f'Final Model Classification Report : \n {classification_report(y_test,y_pred)}')

# ROC-AUC Score
print(f'Final Model ROC-AUC Score : \n {roc_auc_score(y_test,y_pred)}')

# Confusion Matrix
print(f'Confusion Matrix :\n{confusion_matrix(y_test,y_pred)}')

Final Model Classification Report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56863
           1       1.00      1.00      1.00     56863

    accuracy                           1.00    113726
   macro avg       1.00      1.00      1.00    113726
weighted avg       1.00      1.00      1.00    113726

Final Model ROC-AUC Score : 
 0.9997713803351915
Confusion Matrix :
[[56837    26]
 [    0 56863]]


In [11]:
# Creating a DataFrame with Results
df_results=X_test.copy()
df_results['Actual_Class']=y_test
df_results['Predicted_Class']=y_pred

In [12]:
# Since our data is scaled we cant use for visualization so we have to inverse it
# Load the previously saved scaler for 'Amount'
scaler_amount = joblib.load('D:/Projects/DATA_ANALYST_PROJECTS/Financial_Fraud_Detection/models/scaler_amount.pkl')

# Now inverse transform
df_results['Amount']=scaler_amount.inverse_transform(df_results[['Amount']])

In [13]:
# We have removed Time Column while splitting Input and Output Variable so we have to retrieve it from Original DataFrame by Index number
original_df=pd.read_csv('D:\Projects\DATA_ANALYST_PROJECTS\Financial_Fraud_Detection\data\creditcard.csv')

df_results['Time']=original_df.loc[X_test.index,'Time']

In [14]:
df_results.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V23,V24,V25,V26,V27,V28,Amount,Actual_Class,Predicted_Class,Time
0,-6.52221,5.008238,-4.368653,4.214256,0.803043,0.00473,-4.291936,-5.703768,0.40139,-0.745525,...,-0.081621,0.120518,0.35695,0.060909,0.254421,1.585409,1.389922,1,1,0.0
1,-2.609581,2.272322,-3.721237,1.289706,-1.293575,-1.152602,-1.751268,0.358727,-1.804721,-4.879523,...,-0.227636,-0.488322,0.15044,0.057362,0.111689,0.073212,164.375459,1,1,0.0
2,-2.227072,2.024517,1.202342,0.375396,-1.032798,0.752067,-0.889875,0.813497,1.066844,1.721723,...,-0.038849,-0.326955,-0.134761,-0.223043,-0.147126,0.291746,1.42,0,0,1.0
3,-3.791614,-3.063326,1.728037,0.827653,0.146182,-1.089036,-0.69824,0.838687,0.724689,-1.318997,...,0.486717,0.384004,0.412982,0.218452,0.11292,-0.3693,302.4,0,0,1.0
4,-11.002943,7.832543,-13.825106,4.324401,-8.35454,-3.494746,-8.848028,7.578921,-2.344203,-5.948035,...,-0.266381,-0.028522,0.628581,-0.263209,1.091451,0.21405,99.99,1,1,2.0


In [15]:
# Save as CSV for PowerBI
df_results.to_csv('D:\Projects\DATA_ANALYST_PROJECTS\Financial_Fraud_Detection\data/Fraud_Detection_result.csv',index=False)