In [34]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [35]:
df = pd.read_csv('raw_data.csv')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

df.sample(5)

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,velocity_6h,velocity_24h,velocity_4w,bank_branch_count_8w,date_of_birth_distinct_emails_4w,employment_status,credit_risk_score,email_is_free,housing_status,phone_home_valid,phone_mobile_valid,bank_months_count,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
3658,0,0.4,0.107253,121.0,21,30,0.025142,13.423145,AA,1503.0,4758.576721,6822.506156,6687.497576,39,11,CA,53,0.0,BC,0,0,28,,200.0,0.0,INTERNET,46.051438,other,1,1,0,0
806217,0,0.9,0.251269,-1.0,205,50,0.021359,-1.488786,AC,587.0,3422.95712,2558.529483,4324.110085,0,5,CA,201,0.0,BE,0,1,-1,1.0,500.0,0.0,INTERNET,50.182321,windows,1,1,0,6
911233,0,0.2,,-1.0,34,20,0.000178,-1.26574,AC,1176.0,4641.493717,2595.29107,3122.983299,0,4,CB,117,0.0,BE,0,1,-1,0.0,200.0,0.0,INTERNET,3.175939,,0,1,0,7
475157,0,,0.631089,94.0,19,30,0.021439,-0.072525,AC,2445.0,5914.122333,5188.25327,5101.730131,1,20,,152,0.0,BC,0,1,-1,0.0,1000.0,,INTERNET,4.737515,windows,1,1,0,3
514356,0,0.7,0.531417,,43,50,0.019497,-1.17415,AB,1500.0,5587.453063,5004.210703,4863.498248,1854,8,,324,,BB,0,1,11,1.0,1500.0,0.0,INTERNET,11.839913,other,0,1,0,3


In [36]:
df.shape

(1000000, 32)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 32 columns):
 #   Column                            Non-Null Count    Dtype  
---  ------                            --------------    -----  
 0   fraud_bool                        1000000 non-null  int64  
 1   income                            845429 non-null   float64
 2   name_email_similarity             878761 non-null   float64
 3   prev_address_months_count         537360 non-null   float64
 4   current_address_months_count      1000000 non-null  int64  
 5   customer_age                      1000000 non-null  int64  
 6   days_since_request                1000000 non-null  float64
 7   intended_balcon_amount            859130 non-null   float64
 8   payment_type                      1000000 non-null  object 
 9   zip_count_4w                      676304 non-null   float64
 10  velocity_6h                       1000000 non-null  float64
 11  velocity_24h                      1000

In [38]:
df['fraud_bool'].value_counts()

fraud_bool
0    988971
1     11029
Name: count, dtype: int64

In [39]:
missing_percent = df.isnull().mean() * 100

missing_data = missing_percent[missing_percent > 0]
missing_data = missing_data.sort_values(ascending=False)

In [40]:
missing_data

prev_address_months_count    46.2640
zip_count_4w                 32.3696
employment_status            30.0112
device_os                    29.3869
foreign_request              21.8423
email_is_free                17.5733
income                       15.4571
intended_balcon_amount       14.0870
name_email_similarity        12.1239
has_other_cards              10.9091
dtype: float64

In [41]:
obj_cols = [i for i in df.columns if df[i].dtypes == 'object']
int_cols = [i for i in df.columns if df[i].dtypes != 'object' and i!='fraud_bool']

In [42]:
int_cols

['income',
 'name_email_similarity',
 'prev_address_months_count',
 'current_address_months_count',
 'customer_age',
 'days_since_request',
 'intended_balcon_amount',
 'zip_count_4w',
 'velocity_6h',
 'velocity_24h',
 'velocity_4w',
 'bank_branch_count_8w',
 'date_of_birth_distinct_emails_4w',
 'credit_risk_score',
 'email_is_free',
 'phone_home_valid',
 'phone_mobile_valid',
 'bank_months_count',
 'has_other_cards',
 'proposed_credit_limit',
 'foreign_request',
 'session_length_in_minutes',
 'keep_alive_session',
 'device_distinct_emails_8w',
 'device_fraud_count',
 'month']

In [43]:
## Missing values in int

missing_percent_int = df[int_cols].isnull().mean() * 100

missing_data_int = missing_percent_int[missing_percent_int > 0]
missing_data_int = missing_data_int.sort_values(ascending=False)

In [44]:
missing_data_int

prev_address_months_count    46.2640
zip_count_4w                 32.3696
foreign_request              21.8423
email_is_free                17.5733
income                       15.4571
intended_balcon_amount       14.0870
name_email_similarity        12.1239
has_other_cards              10.9091
dtype: float64

In [45]:
## Missing values in categorical

missing_percent_obj = df[obj_cols].isnull().mean() * 100

missing_data_obj = missing_percent_obj[missing_percent_obj > 0]
missing_data_obj = missing_percent_obj.sort_values(ascending=False)

In [46]:
missing_data_obj

employment_status    30.0112
device_os            29.3869
payment_type          0.0000
housing_status        0.0000
source                0.0000
dtype: float64

In [47]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [48]:
numerical_pipeline=Pipeline(steps=[
    ('impute',SimpleImputer(strategy='mean'))
])

# categorical pipeline

categorical_pipeline=Pipeline(steps=[
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('ohe_impute', OneHotEncoder())
])

In [49]:
preprocessor =ColumnTransformer([
    ('numerical_pipeline',numerical_pipeline,int_cols),
    ('categorical_pipeline',categorical_pipeline,obj_cols)
])

In [50]:
from sklearn.model_selection import train_test_split

In [51]:
df_train, df_test = train_test_split(df, test_size=0.2)

In [52]:
y_train = df_train[['fraud_bool']]
X_train = df_train.drop(['fraud_bool'],axis=1)

y_test = df_test[['fraud_bool']]
X_test = df_test.drop(['fraud_bool'],axis=1)

In [53]:
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [55]:
models = {
                "Random Forest": RandomForestClassifier(n_jobs = -1,class_weight={0:1, 1:90}),
                "Decision Tree": DecisionTreeClassifier(class_weight={0:1, 1:90}),
                "Logistic": LogisticRegression(n_jobs = -1,class_weight={0:1, 1:90}),
                "Gradient Boosting": GradientBoostingClassifier(),
                "XGBoost Classifier": XGBClassifier(n_jobs = -1,class_weight={0:1, 1:90})
            }

In [60]:
from sklearn.metrics import recall_score, classification_report

In [57]:
## Dagshub setup
import dagshub
import mlflow
dagshub.init(repo_owner='SrijanDeo-DA-DS', repo_name='bank-fraud-detection-with-deployment', mlflow=True)
mlflow.set_tracking_uri('https://dagshub.com/SrijanDeo-DA-DS/bank-fraud-detection-with-deployment.mlflow')

mlflow.set_experiment("All models Baseline")

2024/10/24 23:15:59 INFO mlflow.tracking.fluent: Experiment with name 'All models Baseline' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/18dd15af40de49d1808ac5e0ec5d1663', creation_time=1729829789483, experiment_id='2', last_update_time=1729829789483, lifecycle_stage='active', name='All models Baseline', tags={}>

In [62]:
results_training = []
results_test = []

with mlflow.start_run(run_name="All experiments") as parent_run:
    for name, model in models.items():

        with mlflow.start_run(run_name=f"{name}", nested=True) as child_run:

            ## log preprocessing paramters
            mlflow.log_param("test_size",0.2)
    
            ## log model
            mlflow.log_param("model", name)
    
            if name == "RandomForestClassifier":
                mlflow.log_param("class_weight", model.class_weight)
            elif name == "DecisionTreeClassifier":
                mlflow.log_param("class_weight", model.class_weight)
            elif name == "LogisticRegression":
                mlflow.log_param("class_weight", model.class_weight)
            elif name == "XGBClassifier":
                mlflow.log_param("class_weight", model.class_weight)
            
            model.fit(X_train, y_train)
            y_pred_test = model.predict(X_test)
        
            ##for underfitting
            y_pred_train = model.predict(X_train)
        
            # recall score
            recall_test = recall_score(y_test, y_pred_test)
            recall_train = recall_score(y_train, y_pred_train)
        
            # logging recall (can log other metrics too)
            mlflow.log_metric("recall-train-data", recall_train)
            mlflow.log_metric("recall-test-data", recall_test)
            
            results_training.append({"Model": name, "Recall": recall_train})
            results_test.append({"Model": name, "Recall": recall_test})
        
            # classification report
            print("Train data classification report for- ",name,classification_report(y_train, y_pred_train))
            print("Test data classification report - ",name, classification_report(y_test, y_pred_test))
        
            # log model
            mlflow.sklearn.log_model(model, "model")
        
            # log notebook
            import os
            notebook_path = "exp2_baseline_all_model.ipynb"
            os.system(f"Jupyer nbconvert --to notebook --execute --inplace {notebook_path}")
            mlflow.log_artifact(notebook_path)

    
results_df_test = pd.DataFrame(results_test)

results_df_training = pd.DataFrame(results_training)

Train data classification report for-  Random Forest               precision    recall  f1-score   support

           0       1.00      1.00      1.00    791256
           1       1.00      1.00      1.00      8744

    accuracy                           1.00    800000
   macro avg       1.00      1.00      1.00    800000
weighted avg       1.00      1.00      1.00    800000

Test data classification report -  Random Forest               precision    recall  f1-score   support

           0       0.99      1.00      0.99    197715
           1       0.00      0.00      0.00      2285

    accuracy                           0.99    200000
   macro avg       0.49      0.50      0.50    200000
weighted avg       0.98      0.99      0.98    200000



2024/10/24 23:27:35 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest at: https://dagshub.com/SrijanDeo-DA-DS/bank-fraud-detection-with-deployment.mlflow/#/experiments/2/runs/39a38218566d449c8f7f6020516e1754.
2024/10/24 23:27:35 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/SrijanDeo-DA-DS/bank-fraud-detection-with-deployment.mlflow/#/experiments/2.


Train data classification report for-  Decision Tree               precision    recall  f1-score   support

           0       1.00      1.00      1.00    791256
           1       1.00      1.00      1.00      8744

    accuracy                           1.00    800000
   macro avg       1.00      1.00      1.00    800000
weighted avg       1.00      1.00      1.00    800000

Test data classification report -  Decision Tree               precision    recall  f1-score   support

           0       0.99      0.99      0.99    197715
           1       0.07      0.07      0.07      2285

    accuracy                           0.98    200000
   macro avg       0.53      0.53      0.53    200000
weighted avg       0.98      0.98      0.98    200000



2024/10/24 23:28:27 INFO mlflow.tracking._tracking_service.client: 🏃 View run Decision Tree at: https://dagshub.com/SrijanDeo-DA-DS/bank-fraud-detection-with-deployment.mlflow/#/experiments/2/runs/7c9adddd8bda4710a946a652d1313816.
2024/10/24 23:28:27 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/SrijanDeo-DA-DS/bank-fraud-detection-with-deployment.mlflow/#/experiments/2.


Train data classification report for-  Logistic               precision    recall  f1-score   support

           0       0.99      0.67      0.80    791256
           1       0.02      0.67      0.04      8744

    accuracy                           0.67    800000
   macro avg       0.51      0.67      0.42    800000
weighted avg       0.98      0.67      0.79    800000

Test data classification report -  Logistic               precision    recall  f1-score   support

           0       0.99      0.67      0.80    197715
           1       0.02      0.66      0.04      2285

    accuracy                           0.67    200000
   macro avg       0.51      0.67      0.42    200000
weighted avg       0.98      0.67      0.79    200000



2024/10/24 23:28:46 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic at: https://dagshub.com/SrijanDeo-DA-DS/bank-fraud-detection-with-deployment.mlflow/#/experiments/2/runs/e6a9c499d4584bc28dc7ca85a243a7be.
2024/10/24 23:28:46 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/SrijanDeo-DA-DS/bank-fraud-detection-with-deployment.mlflow/#/experiments/2.


Train data classification report for-  Gradient Boosting               precision    recall  f1-score   support

           0       0.99      1.00      0.99    791256
           1       0.58      0.03      0.05      8744

    accuracy                           0.99    800000
   macro avg       0.78      0.51      0.52    800000
weighted avg       0.98      0.99      0.98    800000

Test data classification report -  Gradient Boosting               precision    recall  f1-score   support

           0       0.99      1.00      0.99    197715
           1       0.62      0.03      0.05      2285

    accuracy                           0.99    200000
   macro avg       0.80      0.51      0.52    200000
weighted avg       0.98      0.99      0.98    200000



2024/10/24 23:41:54 INFO mlflow.tracking._tracking_service.client: 🏃 View run Gradient Boosting at: https://dagshub.com/SrijanDeo-DA-DS/bank-fraud-detection-with-deployment.mlflow/#/experiments/2/runs/e7edfb92d3cf4d589e0cf951af96a03b.
2024/10/24 23:41:54 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/SrijanDeo-DA-DS/bank-fraud-detection-with-deployment.mlflow/#/experiments/2.


Train data classification report for-  XGBoost Classifier               precision    recall  f1-score   support

           0       0.99      1.00      0.99    791256
           1       0.83      0.09      0.16      8744

    accuracy                           0.99    800000
   macro avg       0.91      0.55      0.58    800000
weighted avg       0.99      0.99      0.99    800000

Test data classification report -  XGBoost Classifier               precision    recall  f1-score   support

           0       0.99      1.00      0.99    197715
           1       0.51      0.04      0.08      2285

    accuracy                           0.99    200000
   macro avg       0.75      0.52      0.54    200000
weighted avg       0.98      0.99      0.98    200000



2024/10/24 23:42:44 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBoost Classifier at: https://dagshub.com/SrijanDeo-DA-DS/bank-fraud-detection-with-deployment.mlflow/#/experiments/2/runs/ad8f503d2f824eacbc3bc8159d901ee3.
2024/10/24 23:42:44 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/SrijanDeo-DA-DS/bank-fraud-detection-with-deployment.mlflow/#/experiments/2.
2024/10/24 23:42:44 INFO mlflow.tracking._tracking_service.client: 🏃 View run All experiments at: https://dagshub.com/SrijanDeo-DA-DS/bank-fraud-detection-with-deployment.mlflow/#/experiments/2/runs/724fd6b39a42440b943c7b44f9c87134.
2024/10/24 23:42:44 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/SrijanDeo-DA-DS/bank-fraud-detection-with-deployment.mlflow/#/experiments/2.


In [63]:
results_df_test

Unnamed: 0,Model,Recall
0,Random Forest,0.0
1,Decision Tree,0.073085
2,Logistic,0.663457
3,Gradient Boosting,0.026258
4,XGBoost Classifier,0.042013


In [64]:
results_df_training

Unnamed: 0,Model,Recall
0,Random Forest,0.997941
1,Decision Tree,1.0
2,Logistic,0.66903
3,Gradient Boosting,0.027219
4,XGBoost Classifier,0.090233
