In [3]:
import itertools
import os

import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask.diagnostics import ProgressBar
from dask.distributed import Client
from dask_ml.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

from ExperimentTracker import (
    PhaseOneExperimentTracker,
    PhaseTwoExperimentTracker,
    PhaseFourExperimentTracker,
    PhaseFiveExperimentTracker,
    PhaseSixExperimentTracker,
    PhaseSevenExperimentTracker,
)

In [2]:
# Load the merged order dataset from CSV as a Dask DataFrame (`dd` is dask.dataframe).
df = dd.read_csv("merged_data.csv")

In [3]:
# Convert data types for memory efficiency
def convert_dtypes(df):
    """Cast the order columns to compact dtypes and return the same frame.

    The columns are reassigned on ``df`` itself, in this order:

    - ``order_value`` and ``refund_value`` become ``float32``.
    - ``num_items_ordered`` is cast to float, rounded, then stored as ``uint8``.
    - ``order_date`` and ``first_order_datetime`` are parsed with ``dd.to_datetime``.
    - ``country_code``, ``collect_type`` and ``payment_method`` become ``category``.

    Returns:
        The frame that was passed in, with the columns above replaced.
    """
    for amount_col in ('order_value', 'refund_value'):
        df[amount_col] = df[amount_col].astype('float32')

    # NOTE(review): uint8 tops out at 255 — confirm no order has more items than that.
    rounded_items = df['num_items_ordered'].astype(float).round()
    df['num_items_ordered'] = rounded_items.astype('uint8')

    for date_col in ('order_date', 'first_order_datetime'):
        df[date_col] = dd.to_datetime(df[date_col])

    label_cols = ['country_code', 'collect_type', 'payment_method']
    df[label_cols] = df[label_cols].astype('category')
    return df

# Rebind df to the frame returned by convert_dtypes.
df = convert_dtypes(df)

In [4]:
# Payment method grouping
# Payment-method groups and the raw method names that belong to each.
_PAYMENT_METHOD_GROUPS = {
    'CreditCard': ('GenericCreditCard', 'CybersourceCreditCard', 'CybersourceApplePay', 'CreditCard'),
    'DigitalWallet': ('GCash', 'AFbKash', 'JazzCashWallet', 'AdyenBoost', 'PayPal'),
    'BankTransfer': ('XenditDirectDebit', 'RazerOnlineBanking'),
    'PaymentOnDelivery': ('Invoice', 'PayOnDelivery'),
}

# Reverse lookup (raw method name -> group), built once so that each per-row
# call is a single dict lookup instead of rebuilding and scanning four lists.
_PAYMENT_METHOD_TO_GROUP = {
    method: group
    for group, methods in _PAYMENT_METHOD_GROUPS.items()
    for method in methods
}


def group_payment_methods(payment_method):
    """Map a raw payment-method name to its coarse group.

    Args:
        payment_method: Raw payment-method value (normally a string).

    Returns:
        One of ``'CreditCard'``, ``'DigitalWallet'``, ``'BankTransfer'`` or
        ``'PaymentOnDelivery'`` when the value is a listed method name,
        otherwise ``'Others'`` (this includes NaN, ``None`` and any other
        unlisted value).
    """
    try:
        return _PAYMENT_METHOD_TO_GROUP.get(payment_method, 'Others')
    except TypeError:
        # Unhashable input (e.g. a list) can never be a listed method name.
        return 'Others'

# Replace every raw payment_method value with the group label returned by group_payment_methods.
df['payment_method'] = df['payment_method'].map(group_payment_methods)

In [5]:
# Date transformations
def date_transformations(df):
    """Derive numeric date features and drop the raw datetime columns.

    The input frame is left untouched; a new frame is returned.

    Args:
        df: Frame with datetime columns ``order_date`` and
            ``first_order_datetime``.

    Returns:
        A frame without ``order_date`` / ``first_order_datetime`` and with
        these columns appended, in this order:

        - ``days_since_first_order``: whole days (``.dt.days``) between
          ``first_order_datetime`` and ``order_date``.
        - ``order_date_day_of_week``, ``order_date_day``,
          ``order_date_month``, ``order_date_year``: the matching ``.dt``
          fields of ``order_date``.
    """
    order_date = df['order_date']
    # assign() builds a new frame, so the caller's frame does not pick up a
    # stray days_since_first_order column as it did with item assignment.
    with_features = df.assign(
        days_since_first_order=(order_date - df['first_order_datetime']).dt.days,
        order_date_day_of_week=order_date.dt.dayofweek,
        order_date_day=order_date.dt.day,
        order_date_month=order_date.dt.month,
        order_date_year=order_date.dt.year,
    )
    return with_features.drop(columns=['first_order_datetime', 'order_date'])

# Add the date features, then discard the identifier columns.
df = date_transformations(df).drop(columns=['order_id', 'customer_id'])

In [6]:
# Split data: every column except is_fraud is a feature; is_fraud is the target.
X = df.drop(columns=['is_fraud'])
y = df['is_fraud']
# shuffle is passed explicitly. For Dask DataFrames, dask-ml warns when it is
# left unset and plans to change the default; shuffle=False keeps the split
# this call has produced so far, so it stays reproducible across upgrades.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=False
)




In [None]:
# Phase-1 search space: one entry is drawn from each key for every experiment.
search_space = {
    'scaler': [None, StandardScaler(), MinMaxScaler(), RobustScaler()],
    'encode': [{'apply': True, 'columns': ['categorical_col']}, {'apply': False}],
    'models': [
        {'name': model_name, 'instance': estimator}
        for model_name, estimator in (
            ('LogisticRegression', LogisticRegression()),
            ('RandomForest', RandomForestClassifier()),
            ('LightGBM', LGBMClassifier()),
            ('GaussianNB', GaussianNB()),
            ('DecisionTree', DecisionTreeClassifier()),
            ('GradientBoosting', GradientBoostingClassifier()),
        )
    ],
}

# Cartesian product of the option lists, one dict per experiment.
keys = tuple(search_space)
values = tuple(search_space.values())
experiment_combinations = [
    dict(zip(keys, combo)) for combo in itertools.product(*values)
]

# Column groups handed to the tracker.
categorical_cols = ['payment_method', 'country_code', 'collect_type']
numeric_columns = [
    'order_value', 'refund_value', 'num_items_ordered', 'days_since_first_order',
    'order_date_day_of_week', 'order_date_day', 'order_date_month', 'order_date_year',
]

In [8]:
# Sanity check on the grid size: 4 scalers x 2 encode options x 6 models = 48.
# NOTE(review): a recorded count of 24 matches a grid with a single 'encode' option,
# i.e. output saved before {'apply': False} was added — re-run to refresh it.
print(len(experiment_combinations))

24


In [9]:
# Debug dump: the whole experiment grid, the four split shapes and both column lists.
# NOTE(review): assuming the splits are Dask collections, .shape may contain a lazy
# (uncomputed) row count — confirm before relying on the printed values.
print(experiment_combinations,X_train.shape,y_train.shape,X_test.shape,y_test.shape,numeric_columns,categorical_cols)

[{'scaler': None, 'encode': {'apply': True, 'columns': ['categorical_col']}, 'models': {'name': 'LogisticRegression', 'instance': LogisticRegression()}}, {'scaler': None, 'encode': {'apply': True, 'columns': ['categorical_col']}, 'models': {'name': 'RandomForest', 'instance': RandomForestClassifier()}}, {'scaler': None, 'encode': {'apply': True, 'columns': ['categorical_col']}, 'models': {'name': 'LightGBM', 'instance': LGBMClassifier()}}, {'scaler': None, 'encode': {'apply': True, 'columns': ['categorical_col']}, 'models': {'name': 'GaussianNB', 'instance': GaussianNB()}}, {'scaler': None, 'encode': {'apply': True, 'columns': ['categorical_col']}, 'models': {'name': 'DecisionTree', 'instance': DecisionTreeClassifier()}}, {'scaler': None, 'encode': {'apply': True, 'columns': ['categorical_col']}, 'models': {'name': 'GradientBoosting', 'instance': GradientBoostingClassifier()}}, {'scaler': StandardScaler(), 'encode': {'apply': True, 'columns': ['categorical_col']}, 'models': {'name': 'L

In [None]:
# Initialize tracker and run experiments
# "Phase-1 (Final)" is the name string handed to the tracker constructor.
tracker = PhaseOneExperimentTracker("Phase-1 (Final)")
# All arguments are positional: the experiment grid, the train split, the test split,
# then the numeric and categorical column lists.
tracker.run_experiments(experiment_combinations, X_train, y_train, X_test, y_test, numeric_columns, categorical_cols)

---

In [None]:
# Load datasets
def _read_xy(stem):
    """Read one feature/target CSV pair from ./data.

    Args:
        stem: File-name part shared by the pair, e.g. ``'train_LOF_ros'`` for
            ``./data/X_train_LOF_ros.csv`` and ``./data/y_train_LOF_ros.csv``.

    Returns:
        ``(X, y)`` as returned by ``dd.read_csv``, with X read before y.
    """
    return dd.read_csv(f'./data/X_{stem}.csv'), dd.read_csv(f'./data/y_{stem}.csv')


X_train, y_train = _read_xy('train')

X_test, y_test = _read_xy('test')

X_train_LOF_ROS, y_train_LOF_ROS = _read_xy('train_LOF_ros')

X_train_LOF, y_train_LOF = _read_xy('train_LOF')

# Three-entry (label, X, y) list; the cell that follows rebinds `datasets` to the full roster.
datasets = [
    ("dataset_default", X_train, y_train),
    ("dataset_LOF", X_train_LOF, y_train_LOF),
    ("dataset_LOF_ROS", X_train_LOF_ROS, y_train_LOF_ROS),
]

X_train_ISO, y_train_ISO = _read_xy('train_ISO')

X_train_ISO_SMOTE, y_train_ISO_SMOTE = _read_xy('train_ISO_smote')

X_train_ISO_ROS, y_train_ISO_ROS = _read_xy('train_ISO_ros')

X_train_ISO_RUS, y_train_ISO_RUS = _read_xy('train_ISO_rus')

X_train_LOF_SMOTE, y_train_LOF_SMOTE = _read_xy('train_LOF_smote')

X_train_LOF_RUS, y_train_LOF_RUS = _read_xy('train_LOF_rus')

X_train_smote, y_train_smote = _read_xy('train_smote')

X_train_ros, y_train_ros = _read_xy('train_ros')

X_train_rus, y_train_rus = _read_xy('train_rus')

In [None]:
# Full roster of training sets as (label, X, y) triples: the default split plus each
# ISO / LOF / SMOTE / ROS / RUS variant read from ./data.
# Rebinds `datasets`, replacing the three-entry list from the loading cell.
datasets = [
    ("dataset_default", X_train, y_train),
    ("dataset_ISO", X_train_ISO, y_train_ISO),
    ("dataset_ISO_SMOTE", X_train_ISO_SMOTE, y_train_ISO_SMOTE),
    ("dataset_ISO_ROS", X_train_ISO_ROS, y_train_ISO_ROS),
    ("dataset_ISO_RUS", X_train_ISO_RUS, y_train_ISO_RUS),
    ("dataset_LOF", X_train_LOF, y_train_LOF),
    ("dataset_LOF_SMOTE", X_train_LOF_SMOTE, y_train_LOF_SMOTE),
    ("dataset_LOF_ROS", X_train_LOF_ROS, y_train_LOF_ROS),
    ("dataset_LOF_RUS", X_train_LOF_RUS, y_train_LOF_RUS),
    ("dataset_SMOTE", X_train_smote, y_train_smote),
    ("dataset_ROS", X_train_ros, y_train_ros),
    ("dataset_RUS", X_train_rus, y_train_rus)
]

In [15]:
# Phase-2 search space: no scaling, encoding always on, six candidate models.
search_space = {
    'scaler': [None],
    'encode': [{'apply': True, 'columns': ['categorical_col']}],
    'models': [
        {'name': model_name, 'instance': estimator}
        for model_name, estimator in (
            ('LogisticRegression', LogisticRegression()),
            ('RandomForest', RandomForestClassifier()),
            ('LightGBM', LGBMClassifier()),
            ('GaussianNB', GaussianNB()),
            ('DecisionTree', DecisionTreeClassifier()),
            ('GradientBoosting', GradientBoostingClassifier()),
        )
    ],
}

# Cartesian product of the option lists, one dict per experiment.
keys = tuple(search_space)
values = tuple(search_space.values())
experiment_combinations = [
    dict(zip(keys, combo)) for combo in itertools.product(*values)
]

# Column groups handed to the tracker.
categorical_cols = ['payment_method', 'country_code', 'collect_type']
numeric_columns = [
    'order_value', 'refund_value', 'num_items_ordered', 'days_since_first_order',
    'order_date_day_of_week', 'order_date_day', 'order_date_month', 'order_date_year',
]

In [16]:
# Grid size check: 1 scaler x 1 encode option x 6 models = 6.
len(experiment_combinations)

6

In [None]:
# Initialize the tracker
tracker = PhaseTwoExperimentTracker("Phase-2 (Final)")

# Load checkpoint file
# NOTE(review): this is a bare attribute access whose value is discarded (it is not the
# cell's last expression). It only loads anything if completed_runs is a property with
# side effects — confirm against the tracker class.
tracker.completed_runs

# Run experiments with checkpointing
# Every (label, X, y) entry in `datasets` is passed together with the six-model grid;
# the test split and column lists are shared by all runs.
tracker.run_experiments(
    datasets=datasets,
    experiment_combinations=experiment_combinations,
    X_test=X_test,
    y_test=y_test,
    numeric_columns=numeric_columns,
    categorical_cols=categorical_cols
)

---

In [18]:
# Hand-picked configurations used instead of the generated grid:
# MinMaxScaler + RandomForest and StandardScaler + LightGBM, both with encoding on.
defined_experiment_combinations = [
    {
        "scaler": MinMaxScaler(),
        "encode": {"apply": True, "columns": ["categorical_col"]},
        "models": {"name": "RandomForest", "instance": RandomForestClassifier()}
    },
    {
        "scaler": StandardScaler(),
        "encode": {"apply": True, "columns": ["categorical_col"]},
        "models": {"name": "LightGBM", "instance": LGBMClassifier()}
    },
]

In [19]:
# Displays the grid generated from search_space — not defined_experiment_combinations.
experiment_combinations

[{'scaler': None,
  'encode': {'apply': True, 'columns': ['categorical_col']},
  'models': {'name': 'LogisticRegression', 'instance': LogisticRegression()}},
 {'scaler': None,
  'encode': {'apply': True, 'columns': ['categorical_col']},
  'models': {'name': 'RandomForest', 'instance': RandomForestClassifier()}},
 {'scaler': None,
  'encode': {'apply': True, 'columns': ['categorical_col']},
  'models': {'name': 'LightGBM', 'instance': LGBMClassifier()}},
 {'scaler': None,
  'encode': {'apply': True, 'columns': ['categorical_col']},
  'models': {'name': 'GaussianNB', 'instance': GaussianNB()}},
 {'scaler': None,
  'encode': {'apply': True, 'columns': ['categorical_col']},
  'models': {'name': 'DecisionTree', 'instance': DecisionTreeClassifier()}},
 {'scaler': None,
  'encode': {'apply': True, 'columns': ['categorical_col']},
  'models': {'name': 'GradientBoosting',
   'instance': GradientBoostingClassifier()}}]

In [20]:
# Reduced column lists: country_code is the only categorical column kept;
# the numeric list has the same entries as numeric_columns.
categorical_cols_reduced = ['country_code']
numeric_columns_reduced = ['order_value', 'refund_value', 'num_items_ordered', 'days_since_first_order',
                   'order_date_day_of_week', 'order_date_day', 'order_date_month', 'order_date_year']

In [None]:
# Phase 4: run the hand-picked configurations on the reduced column set.
tracker = PhaseFourExperimentTracker("Final Experiment")

# Load checkpoint file
# NOTE(review): bare attribute access with the value discarded; it only has an effect if
# completed_runs is a property with side effects — confirm against the tracker class.
tracker.completed_runs

# Run experiments with checkpointing
# drop_columns names the two categorical columns left out of categorical_cols_reduced,
# plus mobile_verified.
# NOTE(review): mobile_verified is in neither column list in this notebook — presumably
# it exists in the CSVs; verify.
tracker.run_experiments(
    datasets=datasets,
    experiment_combinations=defined_experiment_combinations,
    X_test=X_test,
    y_test=y_test,
    numeric_columns=numeric_columns_reduced,
    categorical_cols=categorical_cols_reduced,
    drop_columns=['payment_method', 'collect_type', 'mobile_verified']
)

---

In [22]:
# The same two scaler/model pairs, each with an added "pca" entry.
# NOTE(review): n_components=0.95 is presumably a variance fraction handed to a PCA step
# inside the tracker — confirm how the tracker consumes it.
defined_experiment_combinations = [
    {
        "scaler": MinMaxScaler(),
        "encode": {"apply": True, "columns": ["categorical_col"]},
        "models": {"name": "RandomForest", "instance": RandomForestClassifier()},
        "pca":{"apply":True, "n_components":0.95}
    },
    {
        "scaler": StandardScaler(),
        "encode": {"apply": True, "columns": ["categorical_col"]},
        "models": {"name": "LightGBM", "instance": LGBMClassifier()},
        "pca":{"apply":True, "n_components":0.95}
    },
]

In [23]:
# Reduced column lists for the PCA phase: country_code as the only categorical column,
# plus the eight numeric columns.
categorical_cols_reduced = ['country_code']
numeric_columns_reduced = ['order_value', 'refund_value', 'num_items_ordered', 'days_since_first_order',
                   'order_date_day_of_week', 'order_date_day', 'order_date_month', 'order_date_year']

In [None]:
# Phase 5: run the PCA-enabled configurations on the reduced column set.
# NOTE(review): the experiment name "Final Experiment" is also used by the other
# "Final Experiment" trackers in this notebook — confirm the runs are meant to share it.
tracker = PhaseFiveExperimentTracker("Final Experiment")

# Load checkpoint file
# NOTE(review): bare attribute access with the value discarded; it only has an effect if
# completed_runs is a property with side effects — confirm against the tracker class.
tracker.completed_runs

# Run experiments with checkpointing
tracker.run_experiments(
    datasets=datasets,
    experiment_combinations=defined_experiment_combinations,
    X_test=X_test,
    y_test=y_test,
    numeric_columns=numeric_columns_reduced,
    categorical_cols=categorical_cols_reduced,
    drop_columns=['payment_method', 'collect_type', 'mobile_verified']
)

---

In [25]:
# Single configuration for hyper-parameter tuning: MinMaxScaler + RandomForest.
# Grid keys carry a "model__" prefix and list the candidate values to sample from.
defined_experiment_combinations = [
    {
        "scaler": MinMaxScaler(),
        "encode": {"apply": True, "columns": ["categorical_col"]},
        "models": {
            "name": "RandomForest",
            "instance": RandomForestClassifier(),
        },
        "params": {
            "model__n_estimators": [100, 200, 300],
            "model__max_depth": [10, 20, None],
            "model__min_samples_split": [2, 5, 10],
            "model__min_samples_leaf": [1, 2, 4],
        },
    },
]

In [None]:
# Reduced column lists for the tuning phase: country_code as the only categorical column,
# plus the eight numeric columns.
categorical_cols_reduced = ['country_code']
numeric_columns_reduced = ['order_value', 'refund_value', 'num_items_ordered', 'days_since_first_order',
                   'order_date_day_of_week', 'order_date_day', 'order_date_month', 'order_date_year']

In [None]:
# Narrow `datasets` to three training sets for tuning: default, LOF and LOF + ROS.
datasets = [
    ("dataset_default", X_train, y_train),
    ("dataset_LOF", X_train_LOF, y_train_LOF),
    ("dataset_LOF_ROS", X_train_LOF_ROS, y_train_LOF_ROS),
]

tracker = PhaseSixExperimentTracker("Final Experiment")

# NOTE(review): bare attribute access with the value discarded; it only has an effect if
# completed_runs is a property with side effects — confirm against the tracker class.
tracker.completed_runs

# Run the tuning experiments on the three datasets above with n_iter=10.
# NOTE(review): an earlier comment here mentioned "scattered" datasets, but no scatter
# call is visible in this notebook — the lists hold the frames from dd.read_csv.
tracker.run_experiments(
    datasets=datasets,
    experiment_combinations=defined_experiment_combinations,
    X_test=X_test,
    y_test=y_test,
    numeric_columns=numeric_columns_reduced,
    categorical_cols=categorical_cols_reduced,
    drop_columns=['payment_method', 'collect_type', 'mobile_verified'],
    n_iter=10
)

In [None]:
# NOTE(review): this import rebinds PhaseSixExperimentTracker to the ExperimentTracker2
# version, shadowing the class of the same name imported from ExperimentTracker at the
# top of the notebook. Every later use of the name gets this one.
from ExperimentTracker2 import PhaseSixExperimentTracker
tracker = PhaseSixExperimentTracker("Final Experiment")

# NOTE(review): bare attribute access with the value discarded; it only has an effect if
# completed_runs is a property with side effects — confirm against the tracker class.
tracker.completed_runs

# Run experiments with checkpointing
# Same arguments as the previous Phase-6 run; only the tracker implementation differs.
tracker.run_experiments(
    datasets=datasets,
    experiment_combinations=defined_experiment_combinations,
    X_test=X_test,
    y_test=y_test,
    numeric_columns=numeric_columns_reduced,
    categorical_cols=categorical_cols_reduced,
    drop_columns=['payment_method', 'collect_type', 'mobile_verified'],
    n_iter=10
)

---

In [2]:
# Single configuration for LightGBM tuning: StandardScaler + LGBMClassifier with a
# "model__"-prefixed grid of candidate values.
defined_experiment_combinations = [
    {
        "scaler": StandardScaler(),
        "encode": {"apply": True, "columns": ["categorical_col"]},
        "models": {"name": "LightGBM", "instance": LGBMClassifier()},
        # NOTE(review): the last learning-rate candidate is 1, ten or more times the
        # others — confirm it is intended rather than 0.1.
        "params": {"model__learning_rate": [0.01, 0.03, 0.05, 1], 
                    "model__max_depth": [3, 5, 7, 10, -1],
                    # NOTE(review): min_samples_split / min_samples_leaf are scikit-learn
                    # tree parameter names; LGBMClassifier's own constructor names are
                    # min_split_gain / min_child_samples. Confirm these two keys actually
                    # change the fitted model, otherwise they only widen the search.
                    "model__min_samples_split": [2, 5, 10, 20],
                    "model__min_samples_leaf": [1, 5, 10, 20]}
    },
]

In [3]:
# Full column lists (all three categorical columns), unlike the *_reduced lists.
categorical_cols = ['payment_method', 'country_code', 'collect_type']
numeric_columns = ['order_value', 'refund_value', 'num_items_ordered', 'days_since_first_order',
                   'order_date_day_of_week', 'order_date_day', 'order_date_month', 'order_date_year']

In [4]:
# Read the test split and the three training variants used in this phase,
# one feature/target pair per line.
X_train, y_train = dd.read_csv('./data/X_train.csv'), dd.read_csv('./data/y_train.csv')
X_test, y_test = dd.read_csv('./data/X_test.csv'), dd.read_csv('./data/y_test.csv')
X_train_LOF_ROS, y_train_LOF_ROS = (
    dd.read_csv('./data/X_train_LOF_ros.csv'),
    dd.read_csv('./data/y_train_LOF_ros.csv'),
)
X_train_LOF, y_train_LOF = (
    dd.read_csv('./data/X_train_LOF.csv'),
    dd.read_csv('./data/y_train_LOF.csv'),
)

# (label, features, target) triples handed to the tracker.
datasets = [
    ("dataset_default", X_train, y_train),
    ("dataset_LOF", X_train_LOF, y_train_LOF),
    ("dataset_LOF_ROS", X_train_LOF_ROS, y_train_LOF_ROS),
]

In [None]:
# NOTE(review): this import rebinds PhaseSevenExperimentTracker to the ExperimentTracker2
# version; the top-of-notebook import takes the same name from ExperimentTracker.
from ExperimentTracker2 import PhaseSevenExperimentTracker
tracker = PhaseSevenExperimentTracker("Final Experiment")

# NOTE(review): bare attribute access with the value discarded; it only has an effect if
# completed_runs is a property with side effects — confirm against the tracker class.
tracker.completed_runs

# Run experiments with checkpointing
# Uses the full column lists, passes no drop_columns, and sets n_iter=100.
tracker.run_experiments(
    datasets=datasets,
    experiment_combinations=defined_experiment_combinations,
    X_test=X_test,
    y_test=y_test,
    numeric_columns=numeric_columns,
    categorical_cols=categorical_cols,
    n_iter=100
)

Starting run: LI_Standard_Enc_202502140301_hypertuned




















Completed run: LI_Standard_Enc_202502140301_hypertuned
🏃 View run LI_Standard_Enc_202502140301_hypertuned at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/20/runs/32d32f553e6d46e28b8bd0898a359397
🧪 View experiment at: https://dagshub.com/REHXZ/PAI_CA2.mlflow/#/experiments/20
end run
