In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('raw_data.csv')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

df.sample(5)

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,velocity_6h,velocity_24h,velocity_4w,bank_branch_count_8w,date_of_birth_distinct_emails_4w,employment_status,credit_risk_score,email_is_free,housing_status,phone_home_valid,phone_mobile_valid,bank_months_count,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
127883,0,0.6,0.782752,-1.0,64,40,0.012145,34.131559,AA,2197.0,8593.820696,7999.095085,6354.156485,14,5,CA,61,0.0,BE,1,1,20,0.0,1500.0,0.0,INTERNET,4.676838,,1,1,0,0
781058,0,0.9,0.280203,-1.0,109,30,0.015137,-1.076745,AB,,6010.504887,3025.726592,4235.403121,1811,6,CA,113,1.0,BC,1,0,21,0.0,200.0,0.0,INTERNET,1.95149,macintosh,1,1,0,5
788030,0,,0.90573,,53,40,0.030347,-0.713398,AB,,3133.246654,6903.735574,4234.28412,20,3,CA,166,1.0,BC,0,1,1,1.0,200.0,0.0,INTERNET,11.283453,windows,0,1,0,5
79777,0,0.7,0.861901,-1.0,366,50,0.008905,11.96735,AA,3367.0,8300.298054,6611.115092,5910.213222,2058,5,CA,316,0.0,BA,1,0,20,1.0,1500.0,0.0,INTERNET,4.291204,linux,0,1,0,0
943830,0,0.4,0.912287,52.0,23,30,0.018906,-0.714729,AB,731.0,6835.702075,6371.573444,5649.737663,12,9,CA,227,0.0,BC,0,1,6,0.0,1000.0,,INTERNET,9.274842,other,1,1,0,7


In [3]:
df.shape

(1000000, 32)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 32 columns):
 #   Column                            Non-Null Count    Dtype  
---  ------                            --------------    -----  
 0   fraud_bool                        1000000 non-null  int64  
 1   income                            845429 non-null   float64
 2   name_email_similarity             878761 non-null   float64
 3   prev_address_months_count         537360 non-null   float64
 4   current_address_months_count      1000000 non-null  int64  
 5   customer_age                      1000000 non-null  int64  
 6   days_since_request                1000000 non-null  float64
 7   intended_balcon_amount            859130 non-null   float64
 8   payment_type                      1000000 non-null  object 
 9   zip_count_4w                      676304 non-null   float64
 10  velocity_6h                       1000000 non-null  float64
 11  velocity_24h                      1000

In [5]:
df['fraud_bool'].value_counts()

fraud_bool
0    988971
1     11029
Name: count, dtype: int64

In [6]:
missing_percent = df.isnull().mean() * 100

missing_data = missing_percent[missing_percent > 0]
missing_data = missing_data.sort_values(ascending=False)

In [7]:
missing_data

prev_address_months_count    46.2640
zip_count_4w                 32.3696
employment_status            30.0112
device_os                    29.3869
foreign_request              21.8423
email_is_free                17.5733
income                       15.4571
intended_balcon_amount       14.0870
name_email_similarity        12.1239
has_other_cards              10.9091
dtype: float64

In [8]:
obj_cols = [i for i in df.columns if df[i].dtypes == 'object']
int_cols = [i for i in df.columns if df[i].dtypes != 'object' and i!='fraud_bool']

In [9]:
int_cols

['income',
 'name_email_similarity',
 'prev_address_months_count',
 'current_address_months_count',
 'customer_age',
 'days_since_request',
 'intended_balcon_amount',
 'zip_count_4w',
 'velocity_6h',
 'velocity_24h',
 'velocity_4w',
 'bank_branch_count_8w',
 'date_of_birth_distinct_emails_4w',
 'credit_risk_score',
 'email_is_free',
 'phone_home_valid',
 'phone_mobile_valid',
 'bank_months_count',
 'has_other_cards',
 'proposed_credit_limit',
 'foreign_request',
 'session_length_in_minutes',
 'keep_alive_session',
 'device_distinct_emails_8w',
 'device_fraud_count',
 'month']

In [10]:
## Missing values in int

missing_percent_int = df[int_cols].isnull().mean() * 100

missing_data_int = missing_percent_int[missing_percent_int > 0]
missing_data_int = missing_data_int.sort_values(ascending=False)

In [11]:
missing_data_int

prev_address_months_count    46.2640
zip_count_4w                 32.3696
foreign_request              21.8423
email_is_free                17.5733
income                       15.4571
intended_balcon_amount       14.0870
name_email_similarity        12.1239
has_other_cards              10.9091
dtype: float64

In [12]:
## Missing values in categorical

missing_percent_obj = df[obj_cols].isnull().mean() * 100

missing_data_obj = missing_percent_obj[missing_percent_obj > 0]
missing_data_obj = missing_percent_obj.sort_values(ascending=False)

In [13]:
missing_data_obj

employment_status    30.0112
device_os            29.3869
payment_type          0.0000
housing_status        0.0000
source                0.0000
dtype: float64

In [14]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [15]:
numerical_pipeline=Pipeline(steps=[
    ('impute',SimpleImputer(strategy='mean'))
])

# categorical pipeline

categorical_pipeline=Pipeline(steps=[
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('ohe_impute', OneHotEncoder())
])

In [16]:
preprocessor =ColumnTransformer([
    ('numerical_pipeline',numerical_pipeline,int_cols),
    ('categorical_pipeline',categorical_pipeline,obj_cols)
])

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
df_train, df_test = train_test_split(df, test_size=0.2)

In [19]:
y_train = df_train[['fraud_bool']]
X_train = df_train.drop(['fraud_bool'],axis=1)

y_test = df_test[['fraud_bool']]
X_test = df_test.drop(['fraud_bool'],axis=1)

In [20]:
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

In [21]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(n_jobs=-1, class_weight={0:1,1:90})

In [22]:
## Dagshub setup
import dagshub
import mlflow
dagshub.init(repo_owner='SrijanDeo-DA-DS', repo_name='bank-fraud-detection-with-deployment', mlflow=True)
mlflow.set_tracking_uri('https://dagshub.com/SrijanDeo-DA-DS/bank-fraud-detection-with-deployment.mlflow')

mlflow.set_experiment("Logistic Regression Baseline")

<Experiment: artifact_location='mlflow-artifacts:/042e9950f11b4e43884885fd6be1ad55', creation_time=1729827384549, experiment_id='1', last_update_time=1729827384549, lifecycle_stage='active', name='Logistic Regression Baseline', tags={}>

In [23]:
from sklearn.metrics import recall_score, classification_report

In [24]:
with mlflow.start_run():
    ## log preprocessing paramters
    mlflow.log_param("test_size",0.2)

    ## model building
    lr.fit(X_train, y_train)

    ## log model
    mlflow.log_param("model", "Logistic Regression")

    y_pred_train = lr.predict(X_train)
    y_pred_test = lr.predict(X_test)

    # logging recall (can log other metrics too)
    mlflow.log_metric("recall-train-data", recall_score(y_train, y_pred_train))
    mlflow.log_metric("recall-test-data", recall_score(y_test, y_pred_test))

    # logging classification report
    #mlflow.log_metric("classification_report-train-data", classification_report(y_train, y_pred_train))
    #mlflow.log_metric("classification-report-test-data", classification_report(y_test, y_pred_test))    

    # print recall
    print("Train data recall score - ",recall_score(y_train, y_pred_train))
    print("Test data recall score - ",recall_score(y_test, y_pred_test))

    # print classification report
    print("Train data classification report - ",classification_report(y_train, y_pred_train))
    print("Test data classification report - ",classification_report(y_test, y_pred_test))   

    # log model
    mlflow.sklearn.log_model(lr, "model")

    # log notebook
    import os
    notebook_path = "exp1_baseline_model.ipynb"
    os.system(f"Jupyer nbconvert --to notebook --execute --inplace {notebook_path}")
    mlflow.log_artifact(notebook_path)



Train data recall score -  0.6708399726838151
Test data recall score -  0.6740971912617031
Train data classification report -                precision    recall  f1-score   support

           0       0.99      0.67      0.80    791214
           1       0.02      0.67      0.04      8786

    accuracy                           0.67    800000
   macro avg       0.51      0.67      0.42    800000
weighted avg       0.98      0.67      0.79    800000

Test data classification report -                precision    recall  f1-score   support

           0       0.99      0.67      0.80    197757
           1       0.02      0.67      0.04      2243

    accuracy                           0.67    200000
   macro avg       0.51      0.67      0.42    200000
weighted avg       0.98      0.67      0.79    200000



2024/10/29 23:14:00 INFO mlflow.tracking._tracking_service.client: 🏃 View run respected-stoat-588 at: https://dagshub.com/SrijanDeo-DA-DS/bank-fraud-detection-with-deployment.mlflow/#/experiments/1/runs/3e0a5ddff27148c7af57201ee8624b2d.
2024/10/29 23:14:00 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/SrijanDeo-DA-DS/bank-fraud-detection-with-deployment.mlflow/#/experiments/1.
