In [1]:
import pandas as pd
import numpy as np
import os
import json
import time
from pathlib import Path

# from pandas_profiling import ProfileReport

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTENC, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.compose import ColumnTransformer

import warnings
from urllib.parse import urlparse

import mlflow
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import itertools

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier





# Show all columns: never truncate the column list when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
# Suppress every Python warning for the rest of the kernel session (no category filter is given).
warnings.filterwarnings("ignore")

In [2]:
# Read the merged dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('./data/merged_data.csv')

# Work on a copy so the in-place transformations applied to merged_df leave `data` untouched.
merged_df=data.copy()

In [3]:
def convert_dtypes(df):
    """Cast the order columns to compact dtypes, modifying ``df`` in place.

    Casts applied, in this order:
      * ``order_value``, ``refund_value`` -> float32
      * ``num_items_ordered`` -> float, rounded, then uint8
      * ``order_date``, ``first_order_datetime`` -> datetime via ``pd.to_datetime``
      * ``country_code``, ``collect_type``, ``payment_method`` -> category
      * ``num_orders_last_50days``, ``num_cancelled_orders_last_50days``,
        ``num_refund_orders_last_50days`` -> uint16
      * ``num_associated_customers`` -> uint8
      * ``total_payment_last_50days`` -> float32

    ``mobile_verified`` and ``is_fraud`` are not touched by this function.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Monetary amounts are stored in single precision (float32).
    for money_col in ('order_value', 'refund_value'):
        df[money_col] = df[money_col].astype('float32')

    # Item count: cast to float, round, then store as an unsigned 8-bit integer.
    item_counts = df['num_items_ordered'].astype(float)
    df['num_items_ordered'] = item_counts.round().astype('uint8')

    # Timestamps are parsed into pandas datetimes.
    for stamp_col in ('order_date', 'first_order_datetime'):
        df[stamp_col] = pd.to_datetime(df[stamp_col])

    # Label-like columns become pandas categoricals.
    label_cols = ['country_code', 'collect_type', 'payment_method']
    df[label_cols] = df[label_cols].astype('category')

    # 50-day window counters become unsigned 16-bit integers.
    window_count_cols = [
        'num_orders_last_50days',
        'num_cancelled_orders_last_50days',
        'num_refund_orders_last_50days',
    ]
    df[window_count_cols] = df[window_count_cols].astype('uint16')

    df['num_associated_customers'] = df['num_associated_customers'].astype('uint8')
    df['total_payment_last_50days'] = df['total_payment_last_50days'].astype('float32')

    return df

In [4]:
def group_payment_methods(payment_method):
    """Collapse a raw payment-method name into a coarse payment category.

    Args:
        payment_method: The raw payment-method value to classify.

    Returns:
        'CreditCard', 'DigitalWallet', 'BankTransfer' or 'PaymentOnDelivery'
        when the value is one of the names listed below, otherwise 'Others'.
        Matching is exact (case-sensitive).
    """
    # (category, raw names) pairs; the raw-name groups do not overlap.
    category_members = (
        ('CreditCard', (
            'GenericCreditCard', 'CybersourceCreditCard', 'CybersourceApplePay', 'CreditCard',
        )),
        ('DigitalWallet', (
            'GCash', 'AFbKash', 'JazzCashWallet', 'AFTrueMoney', 'AdyenBoost', 'AdyenMolpay',
            'AFTNG', 'AdyenHPPBoost', 'AdyenHPPMolpay', 'PayPal', 'AFGCash', 'AccountBalance',
        )),
        ('BankTransfer', ('XenditDirectDebit', 'RazerOnlineBanking')),
        ('PaymentOnDelivery', ('Invoice', 'PayOnDelivery')),
    )

    for category, raw_names in category_members:
        if payment_method in raw_names:
            return category

    # Anything not listed above lands in the catch-all bucket.
    return 'Others'

In [5]:
def days_since_first_order(data, order_date_column, first_order_column):
    # Create a feature for the number of days since the first order
    data['days_since_first_order'] = (data[order_date_column] - data[first_order_column]).dt.days
    data.drop([first_order_column], axis=1, inplace=True)
    return data


def transform_datetime(data, column):
    """Replace a datetime column with day-of-week, day, month and year columns.

    The new columns are named ``<column>_day_of_week``, ``<column>_day``,
    ``<column>_month`` and ``<column>_year`` and are added in that order.

    Args:
        data: DataFrame containing ``column`` as a datetime column; modified in place.
        column: Name of the datetime column to expand and then drop.

    Returns:
        The same DataFrame object that was passed in.
    """
    # (new-column suffix, name of the `.dt` accessor attribute to read)
    calendar_parts = (
        ('_day_of_week', 'dayofweek'),
        ('_day', 'day'),
        ('_month', 'month'),
        ('_year', 'year'),
    )
    for suffix, dt_attr in calendar_parts:
        data[column + suffix] = getattr(data[column].dt, dt_attr)

    # The source datetime column is dropped after its parts are extracted.
    data.drop(columns=[column], inplace=True)
    return data

def date_transformations(data):
    """Run both date feature steps on ``data`` and return the result.

    First calls ``days_since_first_order`` on the ``order_date`` and
    ``first_order_datetime`` columns, then calls ``transform_datetime`` on
    ``order_date``.
    """
    with_day_count = days_since_first_order(data, 'order_date', 'first_order_datetime')
    return transform_datetime(with_day_count, 'order_date')

In [6]:
# Cast columns to compact dtypes (convert_dtypes mutates merged_df and returns it).
merged_df = convert_dtypes(merged_df)
# Collapse raw payment-method names into the coarse groups returned by group_payment_methods.
merged_df['payment_method'] = merged_df['payment_method'].apply(group_payment_methods)
# Add days_since_first_order and the order_date_* calendar columns; both source datetime columns are dropped.
merged_df = date_transformations(merged_df)
# Remove the identifier columns before building the feature matrix.
merged_df.drop(columns=['order_id', 'customer_id'], inplace=True)
# Features are every remaining column except the target column is_fraud.
X=merged_df.drop(columns=['is_fraud'])
y=merged_df['is_fraud']
# 80/20 split, stratified on y, with a fixed random_state so the split is repeatable.
# NOTE(review): X_train / X_test / y_train / y_test are reassigned from CSV files in a later cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [7]:
from ExperimentTrackers import PhaseOneExperimentTracker

In [8]:
# Define the search space: each key maps to the list of options to try for that setting.
search_space = {
    # None means "no scaler" as one of the options.
    'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler(), None],
    # NOTE(review): 'categorical_col' looks like a placeholder; the real column names
    # live in categorical_cols below — confirm how the tracker uses this 'columns' entry.
    'encode': [{'apply': True, 'columns': ['categorical_col']}, {'apply': False}],
    # Each model entry pairs a display name with an estimator built with default arguments.
    'models': [
        {'name': 'LogisticRegression', 'instance': LogisticRegression()},
        {'name': 'RandomForest', 'instance': RandomForestClassifier()},
        {'name': 'LightGBM', 'instance': LGBMClassifier()},
        {'name': 'GaussianNB', 'instance': GaussianNB()},
        {'name': 'DecisionTree', 'instance': DecisionTreeClassifier()},
        {'name': 'GradientBoosting', 'instance': GradientBoostingClassifier()},
    ],
}

# Generate all combinations of the search space: one dict per experiment, keyed like
# search_space, holding a single choice for each setting (4 x 2 x 6 = 48 dicts here).
keys, values = zip(*search_space.items())
experiment_combinations = [dict(zip(keys, combo)) for combo in itertools.product(*values)]

# Column groups defined alongside the experiment grid.
categorical_cols = ['payment_method', 'country_code', 'collect_type']
numeric_columns = [
    'order_value',
    'refund_value',
    'num_items_ordered',
    'num_orders_last_50days',
    'num_cancelled_orders_last_50days',
    'num_refund_orders_last_50days',
    'num_associated_customers',
    'total_payment_last_50days',
    'days_since_first_order',
    'order_date_day_of_week',
    'order_date_day',
    'order_date_month',
    'order_date_year',
]

In [9]:
# # Initialize the tracker
# tracker = PhaseOneExperimentTracker("Phase1.2")

# # Load checkpoint file
# tracker.completed_runs

# # Run experiments with checkpointing
# tracker.run_experiments(
#     experiment_combinations=experiment_combinations,
#     X_train=X_train,
#     y_train=y_train,
#     X_test=X_test,
#     y_test=y_test,
#     numeric_columns=numeric_columns,
#     categorical_cols=categorical_cols
# )

In [10]:
# Load datasets
# Every file is read with default pd.read_csv settings, so each y_* is a DataFrame, not a Series.
# NOTE(review): this reassigns X_train / y_train / X_test / y_test, replacing the in-memory
# split produced by train_test_split earlier in the notebook.
# NOTE(review): presumably ISO / LOF mark outlier-filtered variants and SMOTE / ROS / RUS the
# imblearn resamplers imported at the top — confirm against whatever wrote these files.
X_train = pd.read_csv('./data/X_train.csv')
y_train = pd.read_csv('./data/y_train.csv')

# Test set: passed unchanged to the tracker for every training variant.
X_test = pd.read_csv('./data/X_test.csv')
y_test = pd.read_csv('./data/y_test.csv')

# ISO training variants (plain, then smote / ros / rus files).
X_train_ISO = pd.read_csv('./data/X_train_ISO.csv')
y_train_ISO = pd.read_csv('./data/y_train_ISO.csv')

X_train_ISO_SMOTE = pd.read_csv('./data/X_train_ISO_smote.csv')
y_train_ISO_SMOTE = pd.read_csv('./data/y_train_ISO_smote.csv')

X_train_ISO_ROS = pd.read_csv('./data/X_train_ISO_ros.csv')
y_train_ISO_ROS = pd.read_csv('./data/y_train_ISO_ros.csv')

X_train_ISO_RUS = pd.read_csv('./data/X_train_ISO_rus.csv')
y_train_ISO_RUS = pd.read_csv('./data/y_train_ISO_rus.csv')

# LOF training variants (plain, then smote / ros / rus files).
X_train_LOF = pd.read_csv('./data/X_train_LOF.csv')
y_train_LOF = pd.read_csv('./data/y_train_LOF.csv')

X_train_LOF_SMOTE = pd.read_csv('./data/X_train_LOF_smote.csv')
y_train_LOF_SMOTE = pd.read_csv('./data/y_train_LOF_smote.csv')

X_train_LOF_ROS = pd.read_csv('./data/X_train_LOF_ros.csv')
y_train_LOF_ROS = pd.read_csv('./data/y_train_LOF_ros.csv')

X_train_LOF_RUS = pd.read_csv('./data/X_train_LOF_rus.csv')
y_train_LOF_RUS = pd.read_csv('./data/y_train_LOF_rus.csv')

# smote / ros / rus variants without an ISO or LOF prefix.
X_train_smote = pd.read_csv('./data/X_train_smote.csv')
y_train_smote = pd.read_csv('./data/y_train_smote.csv')

X_train_ros = pd.read_csv('./data/X_train_ros.csv')
y_train_ros = pd.read_csv('./data/y_train_ros.csv')

X_train_rus = pd.read_csv('./data/X_train_rus.csv')
y_train_rus = pd.read_csv('./data/y_train_rus.csv')

In [11]:
# (label, training features, training target) triples, one per training-set variant loaded above.
# The list is handed to the phase-two tracker through its `datasets` argument.
datasets = [
    ("dataset_default", X_train, y_train),
    ("dataset_ISO", X_train_ISO, y_train_ISO),
    ("dataset_ISO_SMOTE", X_train_ISO_SMOTE, y_train_ISO_SMOTE),
    ("dataset_ISO_ROS", X_train_ISO_ROS, y_train_ISO_ROS),
    ("dataset_ISO_RUS", X_train_ISO_RUS, y_train_ISO_RUS),
    ("dataset_LOF", X_train_LOF, y_train_LOF),
    ("dataset_LOF_SMOTE", X_train_LOF_SMOTE, y_train_LOF_SMOTE),
    ("dataset_LOF_ROS", X_train_LOF_ROS, y_train_LOF_ROS),
    ("dataset_LOF_RUS", X_train_LOF_RUS, y_train_LOF_RUS),
    ("dataset_SMOTE", X_train_smote, y_train_smote),
    ("dataset_ROS", X_train_ros, y_train_ros),
    ("dataset_RUS", X_train_rus, y_train_rus)
]

In [12]:
# Search space for this phase: no scaler, encoding switched on, six candidate models.
search_space = {
    'scaler': [None],
    'encode': [{'apply': True, 'columns': ['categorical_col']}],
    # Each model entry pairs a display name with an estimator built with default arguments.
    'models': [
        {'name': 'LogisticRegression', 'instance': LogisticRegression()},
        {'name': 'RandomForest', 'instance': RandomForestClassifier()},
        {'name': 'LightGBM', 'instance': LGBMClassifier()},
        {'name': 'GaussianNB', 'instance': GaussianNB()},
        {'name': 'DecisionTree', 'instance': DecisionTreeClassifier()},
        {'name': 'GradientBoosting', 'instance': GradientBoostingClassifier()},
    ],
}

# Expand the option lists into one dict per experiment (1 x 1 x 6 = 6 dicts here).
keys, values = zip(*search_space.items())
experiment_combinations = [
    dict(zip(keys, combo))
    for combo in itertools.product(*values)
]

# Column groups forwarded to the tracker together with the experiment grid.
categorical_cols = ['payment_method', 'country_code', 'collect_type']
numeric_columns = [
    'order_value',
    'refund_value',
    'num_items_ordered',
    'num_orders_last_50days',
    'num_cancelled_orders_last_50days',
    'num_refund_orders_last_50days',
    'num_associated_customers',
    'total_payment_last_50days',
    'days_since_first_order',
    'order_date_day_of_week',
    'order_date_day',
    'order_date_month',
    'order_date_year',
]

In [None]:
# Initialize the tracker
# The string passed to the constructor is "Phase2.4"; what the tracker does with it is
# defined in the project-local ExperimentTrackers module, not in this notebook.
from ExperimentTrackers import PhaseTwoExperimentTracker
tracker = PhaseTwoExperimentTracker("Phase2.4")

# Load checkpoint file
# NOTE(review): this is a bare attribute access whose value is discarded (it is not the
# cell's last expression, so nothing is displayed). Whether it loads anything depends on
# how PhaseTwoExperimentTracker defines completed_runs — confirm, or remove the line.
tracker.completed_runs

# Run experiments with checkpointing
# Every (label, X, y) entry in `datasets` and every dict in `experiment_combinations` is
# passed in, together with the single X_test / y_test pair and the two column-name lists.
tracker.run_experiments(
    datasets=datasets,
    experiment_combinations=experiment_combinations,
    X_test=X_test,
    y_test=y_test,
    numeric_columns=numeric_columns,
    categorical_cols=categorical_cols
)


















































































































































































Starting run: LO_NoneType_Enc_20250209_0257
Completed run: LO_NoneType_Enc_20250209_0257
🏃 View run LO_NoneType_Enc_20250209_0257 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/12/runs/c075b9b709b945c594cf73e4b6339162
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/12
Starting run: RA_NoneType_Enc_20250209_0301
Completed run: RA_NoneType_Enc_20250209_0301
🏃 View run RA_NoneType_Enc_20250209_0301 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/12/runs/0d65766b4aa140bca86fae383da0e89d
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/12
Starting run: LI_NoneType_Enc_20250209_0327
[LightGBM] [Info] Number of positive: 199486, number of negative: 1611458
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063638 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1744
[L