# Pipeline Creation & Check for similarity between validation and test datasets.
This notebook covers the following steps:

1) Select a cross-validation scheme
2) Run adversarial validation to check how similar the validation and test sets are
3) Create a pipeline that includes data processing, transformations, hyper-parameter search and evaluation on the validation sets

In the next notebook, we will finally evaluate the model on the test dataset.

## Data importing & Pipeline creation

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Compatibility shim for third-party code that still uses legacy NumPy aliases.
# `np.NaN` was removed in NumPy 2.0; `np.bool` and `np.float` were removed in
# NumPy 1.24. Only names that are actually missing are filled in: NumPy 2.0
# re-introduced `np.bool` as its own boolean scalar type, and overwriting it
# with the builtin `bool` would silently change that type for every library
# imported afterwards.
for _alias, _fallback in (("NaN", np.nan), ("bool", bool), ("float", float)):
    if not hasattr(np, _alias):
        setattr(np, _alias, _fallback)
import scipy.stats
import scipy.stats
import numpy as np
from pathlib import Path

from pathlib import Path
# Folder holding the dataset CSV files.
# NOTE(review): this is an absolute, machine-specific path; adjust it when the
# notebook is run from a different checkout or environment.
DATADIR = Path("/workspaces/fraud-detection-BAF-Dataset-Suite-/FraudDataset")
data_dir = DATADIR
if not data_dir.exists():
    raise FileNotFoundError(f"Data directory not found: {data_dir}")

extension = "csv"  # file extension of the dataset files to pick up
# Sorted so that "the first file" is the same file on every run.
data_paths = [str(p) for p in sorted(data_dir.glob(f"*.{extension}"))]
if not data_paths:
    # Fail with a clear message instead of an IndexError on data_paths[0].
    raise FileNotFoundError(f"No .{extension} files found in: {data_dir}")

# Only the first file (in sorted order) is loaded.
dataset = pd.read_csv(data_paths[0])

# Split on the `month` column: rows with month < 6 form the training data,
# rows with month >= 6 are held out as the test data.
train_dataset = dataset.loc[dataset["month"] < 6, :].copy()
test_dataset = dataset.loc[dataset["month"] >= 6, :].copy()

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [6]:
# Show the column names available in the training split.
train_dataset.columns

Index(['fraud_bool', 'income', 'name_email_similarity',
       'prev_address_months_count', 'current_address_months_count',
       'customer_age', 'days_since_request', 'intended_balcon_amount',
       'payment_type', 'zip_count_4w', 'velocity_6h', 'velocity_24h',
       'velocity_4w', 'bank_branch_count_8w',
       'date_of_birth_distinct_emails_4w', 'employment_status',
       'credit_risk_score', 'email_is_free', 'housing_status',
       'phone_home_valid', 'phone_mobile_valid', 'bank_months_count',
       'has_other_cards', 'proposed_credit_limit', 'foreign_request', 'source',
       'session_length_in_minutes', 'device_os', 'keep_alive_session',
       'device_distinct_emails_8w', 'device_fraud_count', 'month'],
      dtype='object')

In [3]:
# Peek at `bank_branch_count_8w`, one of the columns listed for removal in the
# pipeline definition further down.
# NOTE(review): the saved output shows an index named `fraud_bool`, which a
# plain `pd.read_csv(...)` as used in the loading cell would not produce — the
# output looks stale; re-run after a kernel restart to confirm.
train_dataset["bank_branch_count_8w"]

fraud_bool
0     5
0     3
0    15
0    11
0     1
     ..
0     2
0    13
0    14
0     0
0    10
Name: bank_branch_count_8w, Length: 794989, dtype: int64

Create the classes needed for the pipeline

In [None]:
class ImportantColumnsDropper:
    """Pipeline step that removes a configured set of columns from a DataFrame.

    Parameters
    ----------
    columns_to_drop : column label or list of labels
        Columns to remove. Labels that are not present in the frame being
        transformed are skipped silently.
    """

    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        """Stateless step: nothing is learned, the instance is returned as-is."""
        return self

    def transform(self, X):
        """Return a new frame without the configured columns; ``X`` is not modified."""
        unwanted = self.columns_to_drop
        return X.drop(labels=unwanted, axis=1, errors="ignore")

class ContinuousTransformer:
    """Standardise a chosen set of columns with a ``StandardScaler``.

    Parameters
    ----------
    columns : list of column labels
        Columns to scale. All other columns pass through unchanged. An empty
        list (or ``None``) turns the step into a pass-through instead of
        handing the scaler a frame with zero features.

    Attributes
    ----------
    scaler : StandardScaler
        The scaler fitted on ``columns`` during :meth:`fit`.
    """

    def __init__(self, columns):
        self.columns = columns
        self.scaler = StandardScaler()

    def _has_columns(self):
        """Tell whether there is at least one column to scale."""
        return self.columns is not None and len(self.columns) > 0

    def fit(self, X, y=None):
        """Fit the scaler on the configured columns of ``X`` and return ``self``."""
        if self._has_columns():
            self.scaler.fit(X[self.columns])
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose configured columns are replaced by their scaled values."""
        X_transformed = X.copy()
        if self._has_columns():
            X_transformed[self.columns] = self.scaler.transform(X[self.columns])
        return X_transformed
    
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    """
    Rule-based re-coding of categorical/ordinal features, following the rules
    referred to in the exploratory analysis section.

    Each rule is applied only when its source column is present:

    * ``income``            -> ``income_level`` ('<0.8' below the threshold, '0.8' otherwise)
    * ``customer_age``      -> ``customer_age_binned`` (above / not above the age threshold)
    * ``payment_type``      -> ``payment_type_engineered`` (values outside 'AA'/'AC' become 'Other')
    * ``employment_status`` -> ``employment_status_engineered``
      (values outside 'CA'/'CB'/'CC' become 'CD_CE_CF_CG')
    * ``intended_balcon_amount_binned`` -> ``intended_balcon_amount_binned_engineered``
      ('52-92' and '92+' are merged into '52+')

    The source columns are removed from the result. The raw
    ``intended_balcon_amount`` column is removed as well, whether or not a
    binned version of it was present.
    """

    def __init__(self):
        # Fixed rule parameters; none of them is learned from data.
        self.income_threshold = 0.8
        self.age_threshold = 40
        self.payment_types = ['AA', 'AC']
        self.employment_types = ['CA', 'CB', 'CC']
        self.intended_balcon_merge = ['52-92', '92+']
        # Stored but not read by transform().
        self.bank_branch_count_8w_binned_merge = ['<=42']

    def fit(self, X, y=None):
        """Nothing to learn; present so the class can be used as a pipeline step."""
        return self

    def transform(self, X):
        """Return a re-coded copy of ``X``; the input frame is left untouched."""
        out = X.copy()

        # Income: two levels split at the income threshold.
        if 'income' in out.columns:
            below_threshold = out['income'] < self.income_threshold
            out['income_level'] = np.where(below_threshold, '<0.8', '0.8')

        # Age: two bins split at the age threshold.
        if 'customer_age' in out.columns:
            above_threshold = out['customer_age'] > self.age_threshold
            out['customer_age_binned'] = np.where(
                above_threshold, '>40 (More fraud)', '<=40 (less fraud)'
            )

        # Payment type: keep the listed codes, lump everything else together.
        if 'payment_type' in out.columns:
            payment = out['payment_type']
            is_listed = payment.isin(self.payment_types)
            out['payment_type_engineered'] = np.where(is_listed, payment, 'Other')

        # Employment status: keep the listed codes, lump everything else together.
        if 'employment_status' in out.columns:
            employment = out['employment_status']
            is_listed = employment.isin(self.employment_types)
            out['employment_status_engineered'] = np.where(is_listed, employment, 'CD_CE_CF_CG')

        # Binned intended balance: merge the two upper bins into one.
        if 'intended_balcon_amount_binned' in out.columns:
            binned = out['intended_balcon_amount_binned']
            is_upper_bin = binned.isin(self.intended_balcon_merge)
            out['intended_balcon_amount_binned_engineered'] = np.where(is_upper_bin, '52+', binned)

        # Remove every source column in one go; absent ones are skipped.
        consumed = [
            'income',
            'customer_age',
            'payment_type',
            'employment_status',
            'intended_balcon_amount_binned',
            'intended_balcon_amount',
        ]
        return out.drop(columns=consumed, errors='ignore')
    

In [None]:
# Define the pipeline
pipeline = Pipeline([
    ('drop_unimportant_columns', ImportantColumnsDropper(columns_to_drop=['proposed_credit_limit', 'valid_transferred_amount', 'foreign_request', 
                                                                        'days_since_request','bank_branch_count_8w','velocity_6h',
                                                                        'velocity_24h', 'velocity_4w','zip_count_4w',
                                                                        'session_length_in_minutes','source','current_address_months_count',
                                                                        'device_fraud_count'
                                                                        ]),
    ('transformation_categoricals', CategoricalTransformer(columns=['merchant_id', 'customer_id'])),
    ('imputation', SimpleImputer(strategy='median')),
    ('logarithmic_transformation', LogTransformer(columns=['amount'])),