# Pipeline Creation & Check for similarity between validation and test datasets.
This notebook covers the following steps:

1) Select cross-validation scheme
2) Run adversarial validation to check how similar the validation and test sets are
3) Create a pipeline that includes data processing, transformations, hyper-parameter search and evaluation on the validation set

In the next notebook, we will finally evaluate the model on the test dataset.

## Data importing & Pipeline creation

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Compatibility shim: re-create NumPy aliases that newer NumPy releases no
# longer provide, so code that still references them keeps working.
# NOTE(review): confirm which dependency actually needs these; on NumPy
# versions where `np.bool` already exists, the assignment below replaces it
# with the builtin `bool`.
np.NaN = np.nan
np.bool = bool
np.float = float
import scipy.stats
import scipy.stats
import numpy as np
from pathlib import Path
from pathlib import Path
# Directory that holds the raw fraud dataset files.
DATADIR = Path("/workspaces/fraud_detection_mlops/FraudDataset")
data_dir = DATADIR
if not data_dir.exists():
    raise FileNotFoundError(f"Data directory not found: {data_dir}")

extension = "csv"  # File extension of the dataset files to look for
# Sorted so that the file picked below is deterministic across runs.
data_paths = [str(p) for p in sorted(data_dir.glob(f"*.{extension}"))]
if not data_paths:
    # Fail with a clear message instead of an IndexError on data_paths[0].
    raise FileNotFoundError(f"No *.{extension} files found in: {data_dir}")

# Only the first matching file is loaded.
dataset = pd.read_csv(data_paths[0])

# Split on the `month` column: rows with month < 6 form the training split,
# rows with month >= 6 form the test split. Copies avoid chained-assignment
# issues when the splits are modified later.
train_dataset = dataset.loc[dataset["month"] < 6, :].copy()
test_dataset = dataset.loc[dataset["month"] >= 6, :].copy()

In [27]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [28]:
# List the column names available in the training split.
train_dataset.columns

Index(['fraud_bool', 'income', 'name_email_similarity',
       'prev_address_months_count', 'current_address_months_count',
       'customer_age', 'days_since_request', 'intended_balcon_amount',
       'payment_type', 'zip_count_4w', 'velocity_6h', 'velocity_24h',
       'velocity_4w', 'bank_branch_count_8w',
       'date_of_birth_distinct_emails_4w', 'employment_status',
       'credit_risk_score', 'email_is_free', 'housing_status',
       'phone_home_valid', 'phone_mobile_valid', 'bank_months_count',
       'has_other_cards', 'proposed_credit_limit', 'foreign_request', 'source',
       'session_length_in_minutes', 'device_os', 'keep_alive_session',
       'device_distinct_emails_8w', 'device_fraud_count', 'month'],
      dtype='object')

In [29]:
# Inspect the raw values of `bank_branch_count_8w` in the training split.
train_dataset["bank_branch_count_8w"]

0          5
1          3
2         15
3         11
4          1
          ..
794984     2
794985    13
794986    14
794987     0
794988    10
Name: bank_branch_count_8w, Length: 794989, dtype: int64

Create the custom transformer classes required by the pipeline.

In [30]:
class ImportantColumnsDropper:
    """Pipeline step that removes a fixed set of columns from a DataFrame.

    Labels in ``columns_to_drop`` that are not present in the input are
    skipped without raising.
    """

    def __init__(self, columns_to_drop):
        # Column label(s) to remove at transform time.
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a new DataFrame without the configured columns."""
        unwanted = self.columns_to_drop
        return X.drop(labels=unwanted, axis="columns", errors="ignore")

class LogitTransformer:
    """Replace ``name_email_similarity`` with its logit, ``log(p / (1 - p))``.

    The result is stored in ``name_email_similarity_logit`` and the source
    column is removed. The input DataFrame is left untouched.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op fit kept for pipeline compatibility; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a transformed copy of ``X``.

        Raises:
            KeyError: if ``name_email_similarity`` is missing from ``X``.
        """
        # Work on a copy: mutating the caller's frame in place silently
        # altered upstream data and made a second call on the same object
        # fail because the source column had already been dropped.
        X = X.copy()
        similarity = X['name_email_similarity']
        # Values of exactly 0 or 1 map to -inf / +inf respectively.
        X['name_email_similarity_logit'] = np.log(similarity / (1 - similarity))
        return X.drop(columns='name_email_similarity')
    
class CategoricalTransformer:
    """
    Custom transformer for all categorical/ordinal feature engineering
    as described in the exploratory analysis section.

    Every engineered feature is built only when its source column(s) are
    present in the input, so the transformer also works on frames from which
    some raw columns were already removed. The raw source columns are dropped
    from the result, and label-like columns are converted to ``category``.
    The input DataFrame is never modified.
    """

    def __init__(self):
        # Fixed rule parameters; nothing is learned from the data.
        self.income_threshold = 0.8
        self.age_threshold = 40
        self.payment_types = ['AA', 'AC']
        self.employment_types = ['CA', 'CB', 'CC']
        # Not read by transform(); kept so existing attribute access still works.
        self.intended_balcon_merge = ['52-92', '92+']
        self.bank_branch_count_8w_binned_merge = ['<=42']
        # Housing statuses that are collapsed into the 'Other' bucket.
        self.housing_other_types = ['BG', 'BF', 'BD']

    def fit(self, X, y=None):
        """No fitting needed; present for pipeline compatibility."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the engineered categorical features.

        Engineered columns are appended in a fixed order; remaining input
        columns keep their original relative order.
        """
        X = X.copy()
        # Snapshot of the input columns; engineered names never collide with
        # the raw names checked below.
        present = set(X.columns)
        # Engineered string-label columns, cast to category at the end.
        label_cols = []

        # 1. income_level
        if 'income' in present:
            X['income_level'] = np.where(X['income'] < self.income_threshold, '<0.8', '0.8')
            label_cols.append('income_level')

        # 2. customer_age_binned
        if 'customer_age' in present:
            X['customer_age_binned'] = np.where(
                X['customer_age'] > self.age_threshold, '>40 (More fraud)', '<=40 (less fraud)'
            )
            label_cols.append('customer_age_binned')

        # 3. payment_type_engineered: keep the listed types, bucket the rest
        if 'payment_type' in present:
            X['payment_type_engineered'] = np.where(
                ~X['payment_type'].isin(self.payment_types), 'Other', X['payment_type']
            )
            label_cols.append('payment_type_engineered')

        # 4. employment_status_engineered: keep the listed types, bucket the rest
        if 'employment_status' in present:
            X['employment_status_engineered'] = np.where(
                ~X['employment_status'].isin(self.employment_types), 'CD_CE_CF_CG', X['employment_status']
            )
            label_cols.append('employment_status_engineered')

        # 5. intended_balcon_amount_binned_engineered
        #    Bins are right-closed: (-inf, 0], (0, 52], (52, inf).
        #    pd.cut already returns a categorical column.
        if 'intended_balcon_amount' in present:
            X['intended_balcon_amount_binned_engineered'] = pd.cut(
                X['intended_balcon_amount'],
                bins=[-np.inf, 0, 52, np.inf],
                labels=['Missing/Negative', '0-52', '52+'],
            )

        # 6. phone_home_and_mobile_valid: "<home>_<mobile>" validity label
        if {'phone_home_valid', 'phone_mobile_valid'} <= present:
            home = pd.Series(np.where(X['phone_home_valid'] == 1, 'Valid', 'Invalid'), index=X.index)
            mobile = pd.Series(np.where(X['phone_mobile_valid'] == 1, 'Valid', 'Invalid'), index=X.index)
            X['phone_home_and_mobile_valid'] = home + '_' + mobile
            label_cols.append('phone_home_and_mobile_valid')

        # 7. banking_relationship_stability: "<bank months>_<other cards>" label
        if {'has_other_cards', 'bank_months_count'} <= present:
            cards = pd.Series(
                np.where(X['has_other_cards'] == 1, 'has_other_cards_valid', 'has_other_cards_invalid'),
                index=X.index,
            )
            # Negative month counts are treated as missing.
            months = pd.Series(
                np.where(X['bank_months_count'] < 0, 'bank_months_missing', 'bank_months_valid'),
                index=X.index,
            )
            X['banking_relationship_stability'] = months + '_' + cards
            label_cols.append('banking_relationship_stability')

        # 8. prev_address_months_count_binary: negative counts are treated as missing
        if 'prev_address_months_count' in present:
            X['prev_address_months_count_binary'] = np.where(
                X['prev_address_months_count'] < 0, 'prev_address_missing', 'prev_address_valid'
            )
            label_cols.append('prev_address_months_count_binary')

        # 9. housing_status_engineered
        if 'housing_status' in present:
            X['housing_status_engineered'] = np.where(
                X['housing_status'].isin(self.housing_other_types), 'Other', X['housing_status']
            )
            label_cols.append('housing_status_engineered')

        # 10. Drop the raw source columns in one pass (absent ones are ignored).
        #     The two *_binary helper names are dropped too in case the input
        #     carries columns with those names.
        drop_cols = ["income", "customer_age", "payment_type", "employment_status", "intended_balcon_amount",
                     "phone_home_valid", "phone_mobile_valid", "has_other_cards_binary", "bank_months_count",
                     "bank_months_count_binary", "has_other_cards",
                     "prev_address_months_count", "housing_status"]
        X = X.drop(columns=drop_cols, errors='ignore')

        # 11. Convert label-like columns to categorical: any remaining object
        #     columns, the engineered labels (listed explicitly so the cast does
        #     not depend on dtype inference), and the two binary flags.
        object_cols = list(X.select_dtypes(include='object').columns)
        to_category = dict.fromkeys(object_cols + label_cols + ["email_is_free", "keep_alive_session"])
        for col in to_category:
            if col in X.columns:
                X[col] = X[col].astype('category')
        return X
    

In [31]:
# Assemble the preprocessing pipeline; the steps run in the listed order.
pipeline = Pipeline(
    [
        ("logit_transformer", LogitTransformer()),
        ("categorical_transformer", CategoricalTransformer()),
        (
            "drop_unimportant_columns",
            # Labels that are absent from the data are ignored by the dropper.
            ImportantColumnsDropper(
                columns_to_drop=[
                    "proposed_credit_limit",
                    "valid_transferred_amount",
                    "foreign_request",
                    "days_since_request",
                    "bank_branch_count_8w",
                    "velocity_6h",
                    "velocity_24h",
                    "velocity_4w",
                    "zip_count_4w",
                    "session_length_in_minutes",
                    "source",
                    "current_address_months_count",
                    "device_fraud_count",
                ]
            ),
        ),
    ]
)

In [32]:
# Apply the pipeline to a deep copy of the test dataset, so the original
# `test_dataset` frame is not touched by the transformers.
test_dataset_transformed = pipeline.fit_transform(test_dataset.copy(deep=True))
test_dataset_transformed.columns

Index(['fraud_bool', 'date_of_birth_distinct_emails_4w', 'credit_risk_score',
       'email_is_free', 'device_os', 'keep_alive_session',
       'device_distinct_emails_8w', 'month', 'name_email_similarity_logit',
       'income_level', 'customer_age_binned', 'payment_type_engineered',
       'employment_status_engineered',
       'intended_balcon_amount_binned_engineered',
       'phone_home_and_mobile_valid', 'banking_relationship_stability',
       'prev_address_months_count_binary', 'housing_status_engineered'],
      dtype='object')

In [33]:
# Check the dtypes produced by the pipeline (label-like columns are expected to be `category`).
test_dataset_transformed.dtypes

fraud_bool                                     int64
date_of_birth_distinct_emails_4w               int64
credit_risk_score                              int64
email_is_free                               category
device_os                                   category
keep_alive_session                          category
device_distinct_emails_8w                      int64
month                                          int64
name_email_similarity_logit                  float64
income_level                                category
customer_age_binned                         category
payment_type_engineered                     category
employment_status_engineered                category
intended_balcon_amount_binned_engineered    category
phone_home_and_mobile_valid                 category
banking_relationship_stability              category
prev_address_months_count_binary            category
housing_status_engineered                   category
dtype: object

In [35]:
# Preview the first rows of the transformed test split.
test_dataset_transformed.head()

Unnamed: 0,fraud_bool,date_of_birth_distinct_emails_4w,credit_risk_score,email_is_free,device_os,keep_alive_session,device_distinct_emails_8w,month,name_email_similarity_logit,income_level,customer_age_binned,payment_type_engineered,employment_status_engineered,intended_balcon_amount_binned_engineered,phone_home_and_mobile_valid,banking_relationship_stability,prev_address_months_count_binary,housing_status_engineered
794989,0,5,250,0,other,0,1,6,0.145425,0.8,>40 (More fraud),AA,CA,0-52,Valid_Valid,bank_months_valid_has_other_cards_valid,prev_address_missing,BC
794990,0,9,-35,0,linux,1,1,6,0.374926,<0.8,<=40 (less fraud),AC,CD_CE_CF_CG,Missing/Negative,Invalid_Valid,bank_months_missing_has_other_cards_invalid,prev_address_missing,BB
794991,0,6,190,1,linux,1,1,6,-0.482763,<0.8,<=40 (less fraud),Other,CA,Missing/Negative,Invalid_Valid,bank_months_valid_has_other_cards_invalid,prev_address_missing,BB
794992,0,4,295,1,other,0,1,6,1.217168,<0.8,<=40 (less fraud),Other,CA,Missing/Negative,Valid_Valid,bank_months_valid_has_other_cards_invalid,prev_address_missing,BA
794993,0,8,-57,1,linux,1,1,6,2.376921,<0.8,<=40 (less fraud),AC,CD_CE_CF_CG,Missing/Negative,Valid_Valid,bank_months_missing_has_other_cards_invalid,prev_address_missing,BB
