In [1]:
import pandas as pd
import numpy as np
import os
import json
import time
from pathlib import Path

# from pandas_profiling import ProfileReport

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTENC, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.compose import ColumnTransformer

import warnings
from urllib.parse import urlparse

import mlflow
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import itertools

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier





# Notebook-wide settings: render every DataFrame column (no truncation) and
# suppress all warnings for the remainder of the session.
pd.options.display.max_columns = None
warnings.simplefilter("ignore")

In [2]:
# Read ./data/merged_data.csv. `data` is kept as the raw frame exactly as read;
# `merged_df` is an independent (deep) copy for the later processing cells.
data = pd.read_csv('./data/merged_data.csv')
merged_df = data.copy(deep=True)

In [3]:
def convert_dtypes(df):
    """Cast the order columns of ``df`` to compact dtypes.

    The frame is modified in place and the same object is returned.

    Conversions applied:
      * ``order_value``, ``refund_value``, ``total_payment_last_50days`` -> float32
      * ``num_items_ordered`` -> float, rounded, then uint8
      * ``order_date``, ``first_order_datetime`` -> datetime (``pd.to_datetime``)
      * ``country_code``, ``collect_type``, ``payment_method`` -> category
      * the three ``num_*_orders_last_50days`` counters -> uint16
      * ``num_associated_customers`` -> uint8

    Any other column (e.g. ``mobile_verified``, ``is_fraud``) is left untouched.

    NOTE(review): the unsigned-integer casts are not range-checked here;
    confirm the data fits in uint8 / uint16 and contains no missing values.
    """
    # Monetary amounts on the order itself.
    for amount_col in ('order_value', 'refund_value'):
        df[amount_col] = df[amount_col].astype('float32')

    # Item counts may arrive as non-integers, so go through float and round first.
    item_counts = df['num_items_ordered'].astype(float)
    df['num_items_ordered'] = item_counts.round().astype('uint8')

    # Parse the two timestamp columns.
    for stamp_col in ('order_date', 'first_order_datetime'):
        df[stamp_col] = pd.to_datetime(df[stamp_col])

    # Low-cardinality string columns become categoricals.
    label_cols = ['country_code', 'collect_type', 'payment_method']
    df[label_cols] = df[label_cols].astype('category')

    # 50-day history counters.
    history_counts = [
        'num_orders_last_50days',
        'num_cancelled_orders_last_50days',
        'num_refund_orders_last_50days',
    ]
    df[history_counts] = df[history_counts].astype('uint16')

    df['num_associated_customers'] = df['num_associated_customers'].astype('uint8')
    df['total_payment_last_50days'] = df['total_payment_last_50days'].astype('float32')

    return df

In [4]:
def group_payment_methods(payment_method):
    """Collapse a raw payment-method label into one of five coarse groups.

    Returns 'CreditCard', 'DigitalWallet', 'BankTransfer' or
    'PaymentOnDelivery' when ``payment_method`` is listed under that group,
    and 'Others' for anything else.
    """
    # Ordered (group label, member labels) pairs; the first match wins.
    grouping = (
        ('CreditCard', (
            'GenericCreditCard', 'CybersourceCreditCard', 'CybersourceApplePay', 'CreditCard',
        )),
        ('DigitalWallet', (
            'GCash', 'AFbKash', 'JazzCashWallet', 'AFTrueMoney', 'AdyenBoost', 'AdyenMolpay',
            'AFTNG', 'AdyenHPPBoost', 'AdyenHPPMolpay', 'PayPal', 'AFGCash', 'AccountBalance',
        )),
        ('BankTransfer', ('XenditDirectDebit', 'RazerOnlineBanking')),
        ('PaymentOnDelivery', ('Invoice', 'PayOnDelivery')),
    )

    for group_label, members in grouping:
        if payment_method in members:
            return group_label

    # Unrecognised label.
    return 'Others'

In [5]:
def days_since_first_order(data, order_date_column, first_order_column):
    """Add a ``days_since_first_order`` column and drop ``first_order_column``.

    The new column holds the whole-day part of
    ``data[order_date_column] - data[first_order_column]``.
    ``data`` is modified in place and the same object is returned.
    """
    elapsed = data[order_date_column] - data[first_order_column]
    data['days_since_first_order'] = elapsed.dt.days
    # The first-order timestamp is removed once the difference has been taken.
    data.drop(columns=[first_order_column], inplace=True)
    return data


def transform_datetime(data, column):
    """Expand the datetime ``column`` into calendar parts, then drop it.

    Adds ``<column>_day_of_week``, ``<column>_day``, ``<column>_month`` and
    ``<column>_year`` (in that order), taken from the ``.dt`` accessor.
    ``data`` is modified in place and the same object is returned.
    """
    # (new-column suffix, ``.dt`` attribute) pairs, in output-column order.
    calendar_parts = (
        ('_day_of_week', 'dayofweek'),
        ('_day', 'day'),
        ('_month', 'month'),
        ('_year', 'year'),
    )
    for suffix, accessor_attr in calendar_parts:
        data[column + suffix] = getattr(data[column].dt, accessor_attr)

    data.drop(columns=[column], inplace=True)
    return data

def date_transformations(data):
    """Run both date feature steps on ``data`` and return the result.

    First derives ``days_since_first_order`` from ``order_date`` and
    ``first_order_datetime``, then expands ``order_date`` into calendar parts.
    Both helpers modify the frame they are given.
    """
    with_order_age = days_since_first_order(data, 'order_date', 'first_order_datetime')
    return transform_datetime(with_order_age, 'order_date')

In [6]:
# Build the modelling frame from a fresh copy of the raw `data` frame rather
# than from whatever `merged_df` currently holds. convert_dtypes and
# date_transformations modify their argument and drop the date columns, so
# feeding an already-processed `merged_df` back in (i.e. re-running this cell)
# would raise KeyError. Starting from `data.copy()` makes the cell re-runnable.
merged_df = convert_dtypes(data.copy())

# Collapse raw payment methods into coarse groups.
merged_df['payment_method'] = merged_df['payment_method'].apply(group_payment_methods)

# Derive days_since_first_order and the order_date calendar parts.
merged_df = date_transformations(merged_df)

# Identifier columns are removed before modelling (non-inplace so the cell has
# no side effects on previously bound frames).
merged_df = merged_df.drop(columns=['order_id', 'customer_id'])

# Features / target.
X = merged_df.drop(columns=['is_fraud'])
y = merged_df['is_fraud']

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [7]:
from ExperimentTrackers import PhaseOneExperimentTracker

In [8]:
# Seed passed to the estimators below that are seeded here; same value as the
# random_state used for train_test_split, so repeated runs are comparable.
RANDOM_STATE = 42

# Define the search space: every scaler x encode option x model is one experiment.
search_space = {
    'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler(), None],
    # NOTE(review): 'categorical_col' looks like a placeholder; the real column
    # names are passed to the tracker separately via `categorical_cols` —
    # confirm the tracker does not read this 'columns' entry.
    'encode': [{'apply': True, 'columns': ['categorical_col']}, {'apply': False}],
    'models': [
        {'name': 'LogisticRegression', 'instance': LogisticRegression()},
        {'name': 'RandomForest', 'instance': RandomForestClassifier(random_state=RANDOM_STATE)},
        {'name': 'LightGBM', 'instance': LGBMClassifier(random_state=RANDOM_STATE)},
        {'name': 'GaussianNB', 'instance': GaussianNB()},
        {'name': 'DecisionTree', 'instance': DecisionTreeClassifier(random_state=RANDOM_STATE)},
        {'name': 'GradientBoosting', 'instance': GradientBoostingClassifier(random_state=RANDOM_STATE)},
    ]
}

# Generate all combinations of the search space (Cartesian product of the lists
# above); each combination is a dict with the keys 'scaler', 'encode', 'models'.
keys, values = zip(*search_space.items())
experiment_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

# Column groups handed to the experiment trackers.
categorical_cols = ['payment_method', 'country_code', 'collect_type']
numeric_columns = [
    'order_value',
    'refund_value',
    'num_items_ordered',
    'num_orders_last_50days',
    'num_cancelled_orders_last_50days',
    'num_refund_orders_last_50days',
    'num_associated_customers',
    'total_payment_last_50days',
    'days_since_first_order',
    'order_date_day_of_week',
    'order_date_day',
    'order_date_month',
    'order_date_year',
]

In [None]:
# Create the tracker for the "Phase1.1" experiment.
tracker = PhaseOneExperimentTracker("Phase1.1")

# NOTE(review): bare attribute access. It is not the last expression of the
# cell, so nothing is displayed; presumably kept to touch the checkpoint state
# — confirm against PhaseOneExperimentTracker, otherwise it has no effect.
tracker.completed_runs

# Run every scaler/encode/model combination on the Phase 1 train/test split.
phase_one_inputs = dict(
    experiment_combinations=experiment_combinations,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    numeric_columns=numeric_columns,
    categorical_cols=categorical_cols,
)
tracker.run_experiments(**phase_one_inputs)

Starting run: LO_Standard_Enc_20250207_2236




Completed run: LO_Standard_Enc_20250207_2236
🏃 View run LO_Standard_Enc_20250207_2236 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/a89b4464b73d4e9b851ac680b39b4ab0
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: RA_Standard_Enc_20250207_2238




Completed run: RA_Standard_Enc_20250207_2238
🏃 View run RA_Standard_Enc_20250207_2238 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/e13f138e8d174462830fa827f43c717b
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: LI_Standard_Enc_20250207_2306
[LightGBM] [Info] Number of positive: 199486, number of negative: 1611458
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.074307 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1723
[LightGBM] [Info] Number of data points in the train set: 1810944, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.110156 -> initscore=-2.089151
[LightGBM] [Info] Start training from score -2.089151




Completed run: LI_Standard_Enc_20250207_2306
🏃 View run LI_Standard_Enc_20250207_2306 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/78576872098641f1af8e90b7ff9a4640
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: GA_Standard_Enc_20250207_2308




Completed run: GA_Standard_Enc_20250207_2308
🏃 View run GA_Standard_Enc_20250207_2308 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/1034a64df4af4a6fa89b712101d08ef2
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: DE_Standard_Enc_20250207_2309




Completed run: DE_Standard_Enc_20250207_2309
🏃 View run DE_Standard_Enc_20250207_2309 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/396d460e017147cb9ec6dc95e4e23402
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: GR_Standard_Enc_20250207_2311




Completed run: GR_Standard_Enc_20250207_2311
🏃 View run GR_Standard_Enc_20250207_2311 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/25aff16f62104e3784a67753616b7db3
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: LO_Standard_NoEnc_20250207_2349




Completed run: LO_Standard_NoEnc_20250207_2349
🏃 View run LO_Standard_NoEnc_20250207_2349 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/cf8106ecd11b42e996646e716d59a6e0
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: RA_Standard_NoEnc_20250207_2350




Completed run: RA_Standard_NoEnc_20250207_2350
🏃 View run RA_Standard_NoEnc_20250207_2350 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/86531b940f7947d6889f38c9252264a0
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: LI_Standard_NoEnc_20250208_0014
[LightGBM] [Info] Number of positive: 199486, number of negative: 1611458
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.052147 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1699
[LightGBM] [Info] Number of data points in the train set: 1810944, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.110156 -> initscore=-2.089151
[LightGBM] [Info] Start training from score -2.089151




Completed run: LI_Standard_NoEnc_20250208_0014
🏃 View run LI_Standard_NoEnc_20250208_0014 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/7017b70dc9eb47d6970597374d779fb5
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: GA_Standard_NoEnc_20250208_0015




Completed run: GA_Standard_NoEnc_20250208_0015
🏃 View run GA_Standard_NoEnc_20250208_0015 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/f7c63ba88dd44a579309db5b7d1855b6
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: DE_Standard_NoEnc_20250208_0016




Completed run: DE_Standard_NoEnc_20250208_0016
🏃 View run DE_Standard_NoEnc_20250208_0016 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/d1b111920ea04ba5bd39ed0010d00957
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: GR_Standard_NoEnc_20250208_0017




Completed run: GR_Standard_NoEnc_20250208_0017
🏃 View run GR_Standard_NoEnc_20250208_0017 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/2310f39507f14201b3b4f0bd3894684c
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: LO_MinMax_Enc_20250208_0042




Completed run: LO_MinMax_Enc_20250208_0042
🏃 View run LO_MinMax_Enc_20250208_0042 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/7bd2942dfce8477ab19c81d0d9ba206f
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: RA_MinMax_Enc_20250208_0044




Completed run: RA_MinMax_Enc_20250208_0044
🏃 View run RA_MinMax_Enc_20250208_0044 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/79332447eaad4e3fbbc2a438791b8a3a
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: LI_MinMax_Enc_20250208_0107
[LightGBM] [Info] Number of positive: 199486, number of negative: 1611458
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061349 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1715
[LightGBM] [Info] Number of data points in the train set: 1810944, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.110156 -> initscore=-2.089151
[LightGBM] [Info] Start training from score -2.089151




Completed run: LI_MinMax_Enc_20250208_0107
🏃 View run LI_MinMax_Enc_20250208_0107 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/33fd37cd81d44d18b13034edfc552d8d
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: GA_MinMax_Enc_20250208_0109




Completed run: GA_MinMax_Enc_20250208_0109
🏃 View run GA_MinMax_Enc_20250208_0109 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/ba2cf35c3fcd4f0e99a5a7bb40977d30
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: DE_MinMax_Enc_20250208_0110




Completed run: DE_MinMax_Enc_20250208_0110
🏃 View run DE_MinMax_Enc_20250208_0110 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/e3267cb445744a11be58e270020825c0
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: GR_MinMax_Enc_20250208_0111




Completed run: GR_MinMax_Enc_20250208_0111
🏃 View run GR_MinMax_Enc_20250208_0111 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/32c2f899aff04b11932b75b4e4ca683f
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: LO_MinMax_NoEnc_20250208_0147




Completed run: LO_MinMax_NoEnc_20250208_0147
🏃 View run LO_MinMax_NoEnc_20250208_0147 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/4082f3f7635642d1a6c9a1b71109565e
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: RA_MinMax_NoEnc_20250208_0148




Completed run: RA_MinMax_NoEnc_20250208_0148
🏃 View run RA_MinMax_NoEnc_20250208_0148 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/7ace0e2a6dc345c3becf46fed315e43a
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: LI_MinMax_NoEnc_20250208_0210
[LightGBM] [Info] Number of positive: 199486, number of negative: 1611458
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061202 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1691
[LightGBM] [Info] Number of data points in the train set: 1810944, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.110156 -> initscore=-2.089151
[LightGBM] [Info] Start training from score -2.089151




Completed run: LI_MinMax_NoEnc_20250208_0210
🏃 View run LI_MinMax_NoEnc_20250208_0210 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/4d51deadc5484cb2ba5a456c9a5e6df2
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: GA_MinMax_NoEnc_20250208_0211




Completed run: GA_MinMax_NoEnc_20250208_0211
🏃 View run GA_MinMax_NoEnc_20250208_0211 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/b568098587bc4176adcf89de6299a3af
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: DE_MinMax_NoEnc_20250208_0212




Completed run: DE_MinMax_NoEnc_20250208_0212
🏃 View run DE_MinMax_NoEnc_20250208_0212 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/923b29b4a9014b2e8907f1abc89c9001
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: GR_MinMax_NoEnc_20250208_0213




Completed run: GR_MinMax_NoEnc_20250208_0213
🏃 View run GR_MinMax_NoEnc_20250208_0213 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/c6279af3d0124d89b5aee61dbcd0f897
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: LO_Robust_Enc_20250208_0237




Completed run: LO_Robust_Enc_20250208_0237
🏃 View run LO_Robust_Enc_20250208_0237 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/ca4091e1a7524d9aa8e21680dcaf6541
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: RA_Robust_Enc_20250208_0239




Completed run: RA_Robust_Enc_20250208_0239
🏃 View run RA_Robust_Enc_20250208_0239 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/e5cd2fc9fa464d3ea672ff1828e18a3f
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: LI_Robust_Enc_20250208_0302
[LightGBM] [Info] Number of positive: 199486, number of negative: 1611458
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.064115 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1715
[LightGBM] [Info] Number of data points in the train set: 1810944, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.110156 -> initscore=-2.089151
[LightGBM] [Info] Start training from score -2.089151




Completed run: LI_Robust_Enc_20250208_0302
🏃 View run LI_Robust_Enc_20250208_0302 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/216b9cabc727492fa3a044db9de3d185
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: GA_Robust_Enc_20250208_0303




Completed run: GA_Robust_Enc_20250208_0303
🏃 View run GA_Robust_Enc_20250208_0303 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/348990422ed24503b9e4ca3e6f230b83
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: DE_Robust_Enc_20250208_0304




Completed run: DE_Robust_Enc_20250208_0304
🏃 View run DE_Robust_Enc_20250208_0304 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/d365006601154380b2241016e66782c9
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: GR_Robust_Enc_20250208_0306




Completed run: GR_Robust_Enc_20250208_0306
🏃 View run GR_Robust_Enc_20250208_0306 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/ed2d154bb2574bfd85c176780816a61d
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: LO_Robust_NoEnc_20250208_0340




Completed run: LO_Robust_NoEnc_20250208_0340
🏃 View run LO_Robust_NoEnc_20250208_0340 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/cbcc06e977044153bfd6f2a85bdf7afe
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: RA_Robust_NoEnc_20250208_0342




Completed run: RA_Robust_NoEnc_20250208_0342
🏃 View run RA_Robust_NoEnc_20250208_0342 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/f851e8388f44463d9a65dab687d8d017
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: LI_Robust_NoEnc_20250208_0406
[LightGBM] [Info] Number of positive: 199486, number of negative: 1611458
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.064500 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1691
[LightGBM] [Info] Number of data points in the train set: 1810944, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.110156 -> initscore=-2.089151
[LightGBM] [Info] Start training from score -2.089151




Completed run: LI_Robust_NoEnc_20250208_0406
🏃 View run LI_Robust_NoEnc_20250208_0406 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/e18cd40e0d6240e4b1eff57737a0ae63
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: GA_Robust_NoEnc_20250208_0408




Completed run: GA_Robust_NoEnc_20250208_0408
🏃 View run GA_Robust_NoEnc_20250208_0408 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/5184ff256f9e4d3d9a9c63a3c50e7b17
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: DE_Robust_NoEnc_20250208_0408




Completed run: DE_Robust_NoEnc_20250208_0408
🏃 View run DE_Robust_NoEnc_20250208_0408 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/95956a1c69c643e094040b4235634c65
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: GR_Robust_NoEnc_20250208_0409




Completed run: GR_Robust_NoEnc_20250208_0409
🏃 View run GR_Robust_NoEnc_20250208_0409 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/126f08423a1140379bffeea396af79eb
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: LO_NoneType_Enc_20250208_0433




Completed run: LO_NoneType_Enc_20250208_0433
🏃 View run LO_NoneType_Enc_20250208_0433 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/38800f115fbe400784891305847377cd
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: RA_NoneType_Enc_20250208_0435




Completed run: RA_NoneType_Enc_20250208_0435
🏃 View run RA_NoneType_Enc_20250208_0435 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/3cf6554e7f634341a4e34951902f0f04
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: LI_NoneType_Enc_20250208_0459
[LightGBM] [Info] Number of positive: 199486, number of negative: 1611458
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.067377 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1720
[LightGBM] [Info] Number of data points in the train set: 1810944, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.110156 -> initscore=-2.089151
[LightGBM] [Info] Start training from score -2.089151




Completed run: LI_NoneType_Enc_20250208_0459
🏃 View run LI_NoneType_Enc_20250208_0459 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/53d08c3a269b4fd6b448d506ce36a793
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: GA_NoneType_Enc_20250208_0501




Completed run: GA_NoneType_Enc_20250208_0501
🏃 View run GA_NoneType_Enc_20250208_0501 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/25c6a7192962416983a8d30c0bc94d5a
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: DE_NoneType_Enc_20250208_0502




Completed run: DE_NoneType_Enc_20250208_0502
🏃 View run DE_NoneType_Enc_20250208_0502 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/712691e07c634b8b80c4dfba031ce2d5
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: GR_NoneType_Enc_20250208_0504




Completed run: GR_NoneType_Enc_20250208_0504
🏃 View run GR_NoneType_Enc_20250208_0504 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/19fcc46f0cf542c4892fa616e6751230
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: LO_NoneType_NoEnc_20250208_0539




Completed run: LO_NoneType_NoEnc_20250208_0539
🏃 View run LO_NoneType_NoEnc_20250208_0539 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/1b3dceb24e1d409cb7b2fda218144bce
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: RA_NoneType_NoEnc_20250208_0541




Completed run: RA_NoneType_NoEnc_20250208_0541
🏃 View run RA_NoneType_NoEnc_20250208_0541 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/516cd11c2504490db67e40d1dd400e10
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: LI_NoneType_NoEnc_20250208_0603
[LightGBM] [Info] Number of positive: 199486, number of negative: 1611458
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061816 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1696
[LightGBM] [Info] Number of data points in the train set: 1810944, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.110156 -> initscore=-2.089151
[LightGBM] [Info] Start training from score -2.089151




Completed run: LI_NoneType_NoEnc_20250208_0603
🏃 View run LI_NoneType_NoEnc_20250208_0603 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/2e3b0091029d4d2285a3821a95b66a9f
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: GA_NoneType_NoEnc_20250208_0605




Completed run: GA_NoneType_NoEnc_20250208_0605
🏃 View run GA_NoneType_NoEnc_20250208_0605 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/61bc19b23b8146ada753fe7d5a6f437d
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: DE_NoneType_NoEnc_20250208_0606




Completed run: DE_NoneType_NoEnc_20250208_0606
🏃 View run DE_NoneType_NoEnc_20250208_0606 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/7b2d42de258943bd8073bce2394b5daa
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7
Starting run: GR_NoneType_NoEnc_20250208_0607




Completed run: GR_NoneType_NoEnc_20250208_0607
🏃 View run GR_NoneType_NoEnc_20250208_0607 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7/runs/12268145beac42fd9a60203500cfbf3a
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/7


In [11]:
# Load datasets
def load_xy(stem, data_dir='./data'):
    """Read one feature/target CSV pair and return it as ``(X, y)``.

    Reads ``<data_dir>/X_<stem>.csv`` first, then ``<data_dir>/y_<stem>.csv``,
    each with ``pd.read_csv`` and default options, so both items are DataFrames.
    """
    features = pd.read_csv(f'{data_dir}/X_{stem}.csv')
    target = pd.read_csv(f'{data_dir}/y_{stem}.csv')
    return features, target


# NOTE: the first two lines rebind X_train / y_train / X_test / y_test that the
# train_test_split cell created earlier in this notebook; cells below this one
# see the CSV-backed versions.
# NOTE(review): every y_* here is a DataFrame as returned by read_csv, not a
# Series — confirm the Phase 2 tracker expects that.
X_train, y_train = load_xy('train')
X_test, y_test = load_xy('test')

# "ISO" variants (file stems use lower-case smote/ros/rus suffixes).
X_train_ISO, y_train_ISO = load_xy('train_ISO')
X_train_ISO_SMOTE, y_train_ISO_SMOTE = load_xy('train_ISO_smote')
X_train_ISO_ROS, y_train_ISO_ROS = load_xy('train_ISO_ros')
X_train_ISO_RUS, y_train_ISO_RUS = load_xy('train_ISO_rus')

# "LOF" variants.
X_train_LOF, y_train_LOF = load_xy('train_LOF')
X_train_LOF_SMOTE, y_train_LOF_SMOTE = load_xy('train_LOF_smote')
X_train_LOF_ROS, y_train_LOF_ROS = load_xy('train_LOF_ros')
X_train_LOF_RUS, y_train_LOF_RUS = load_xy('train_LOF_rus')

# Resampled variants of the plain training set.
X_train_smote, y_train_smote = load_xy('train_smote')
X_train_ros, y_train_ros = load_xy('train_ros')
X_train_rus, y_train_rus = load_xy('train_rus')

In [12]:
# (name, X, y) triples for every training-set variant; names are the variant
# tag prefixed with "dataset_".
datasets = [
    (f"dataset_{variant}", features, target)
    for variant, features, target in (
        ("default", X_train, y_train),
        ("ISO", X_train_ISO, y_train_ISO),
        ("ISO_SMOTE", X_train_ISO_SMOTE, y_train_ISO_SMOTE),
        ("ISO_ROS", X_train_ISO_ROS, y_train_ISO_ROS),
        ("ISO_RUS", X_train_ISO_RUS, y_train_ISO_RUS),
        ("LOF", X_train_LOF, y_train_LOF),
        ("LOF_SMOTE", X_train_LOF_SMOTE, y_train_LOF_SMOTE),
        ("LOF_ROS", X_train_LOF_ROS, y_train_LOF_ROS),
        ("LOF_RUS", X_train_LOF_RUS, y_train_LOF_RUS),
        ("SMOTE", X_train_smote, y_train_smote),
        ("ROS", X_train_ros, y_train_ros),
        ("RUS", X_train_rus, y_train_rus),
    )
]

In [14]:
# Create the tracker for the "Phase2_V2" experiment. This rebinds `tracker`,
# replacing the Phase 1 tracker object created earlier in the notebook.
from ExperimentTrackers import PhaseTwoExperimentTracker

tracker = PhaseTwoExperimentTracker("Phase2_V2")

# NOTE(review): bare attribute access. It is not the last expression of the
# cell, so nothing is displayed; presumably kept to touch the checkpoint state
# — confirm against PhaseTwoExperimentTracker, otherwise it has no effect.
tracker.completed_runs

# Run every scaler/encode/model combination against each training-set variant,
# evaluating on the shared test split.
phase_two_inputs = dict(
    datasets=datasets,
    experiment_combinations=experiment_combinations,
    X_test=X_test,
    y_test=y_test,
    numeric_columns=numeric_columns,
    categorical_cols=categorical_cols,
)
tracker.run_experiments(**phase_two_inputs)

Starting run: LO_NoneType_Enc_20250129_0328
Completed run: LO_NoneType_Enc_20250129_0328
🏃 View run LO_NoneType_Enc_20250129_0328 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/5/runs/c538c919fe7548818f95338fbfdca19e
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/5
Starting run: RA_NoneType_Enc_20250129_0328
Completed run: RA_NoneType_Enc_20250129_0328
🏃 View run RA_NoneType_Enc_20250129_0328 at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/5/runs/e55d6062ccb04270811e019fe2d3af50
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/5
Starting run: LI_NoneType_Enc_20250129_0337
[LightGBM] [Info] Number of positive: 199486, number of negative: 1611458
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.071599 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1744
[Light