In [1]:
import pandas as pd
import numpy as np
import os
import json
import time
from pathlib import Path

from pandas_profiling import ProfileReport

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTENC, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.compose import ColumnTransformer

import warnings
from urllib.parse import urlparse

import mlflow
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import itertools

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier


# Show all columns when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)
# Blanket filter: ignores every warning category raised after this point,
# including deprecation notices from the libraries imported above.
warnings.filterwarnings("ignore")

  from pandas_profiling import ProfileReport


In [2]:
# Read the merged dataset; the path is relative to the notebook's working directory
data = pd.read_csv('./data/merged_data.csv')

# Work on a copy so `data` keeps the frame exactly as it was loaded
merged_df=data.copy()

In [3]:
def convert_dtypes(df):
    """Cast the order columns to compact dtypes, modifying ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named below; a missing column raises
        ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (not a copy).

    Notes
    -----
    'mobile_verified' and 'is_fraud' are not touched by this function.
    The unsigned-integer casts are not range-checked here.
    NOTE(review): values outside the target range are presumably wrapped
    silently by the cast - confirm the data stays within uint8/uint16 bounds.
    """
    # Monetary amounts are stored as float32 (not float16)
    df['order_value'] = df['order_value'].astype('float32')
    df['refund_value'] = df['refund_value'].astype('float32')

    # Item count: go through float first, round to the nearest integer, then uint8
    item_counts = df['num_items_ordered'].astype(float)
    df['num_items_ordered'] = item_counts.round().astype('uint8')

    # Parse both timestamp columns
    df['order_date'] = pd.to_datetime(df['order_date'])
    df['first_order_datetime'] = pd.to_datetime(df['first_order_datetime'])

    # Low-cardinality string columns become pandas categoricals
    category_cols = ['country_code', 'collect_type', 'payment_method']
    df[category_cols] = df[category_cols].astype('category')

    # 50-day rolling counters fit in uint16
    counter_cols = [
        'num_orders_last_50days',
        'num_cancelled_orders_last_50days',
        'num_refund_orders_last_50days',
    ]
    df[counter_cols] = df[counter_cols].astype('uint16')

    # Associated-customer count is stored as uint8
    df['num_associated_customers'] = df['num_associated_customers'].astype('uint8')

    # Rolling payment total is stored as float32 (not float16)
    df['total_payment_last_50days'] = df['total_payment_last_50days'].astype('float32')

    return df

In [4]:
def group_payment_methods(payment_method):
    """Collapse a raw payment-method name into a coarse payment group.

    Parameters
    ----------
    payment_method : object
        Raw method name, compared by equality against the known names.

    Returns
    -------
    str
        'CreditCard', 'DigitalWallet', 'BankTransfer', 'PaymentOnDelivery',
        or 'Others' when the name matches none of the listed methods.
    """
    # Ordered (group label, member names) table; the first group containing the name wins
    grouped_methods = (
        # Credit cards and related gateways
        ('CreditCard', ('GenericCreditCard', 'CybersourceCreditCard',
                        'CybersourceApplePay', 'CreditCard')),
        # Digital wallets
        ('DigitalWallet', ('GCash', 'AFbKash', 'JazzCashWallet', 'AFTrueMoney',
                           'AdyenBoost', 'AdyenMolpay', 'AFTNG', 'AdyenHPPBoost',
                           'AdyenHPPMolpay', 'PayPal', 'AFGCash', 'AccountBalance')),
        # Bank transfers and direct debit
        ('BankTransfer', ('XenditDirectDebit', 'RazerOnlineBanking')),
        # Invoice / pay on delivery
        ('PaymentOnDelivery', ('Invoice', 'PayOnDelivery')),
    )

    for group_label, member_names in grouped_methods:
        if payment_method in member_names:
            return group_label

    # Anything unrecognized falls into the catch-all group
    return 'Others'

In [5]:
def days_since_first_order(data, order_date_column, first_order_column):
    """Add a 'days_since_first_order' column and drop the first-order column.

    The new column holds the whole-day part of
    ``data[order_date_column] - data[first_order_column]``.
    ``data`` is modified in place and the same object is returned.
    """
    elapsed = data[order_date_column] - data[first_order_column]
    data['days_since_first_order'] = elapsed.dt.days

    # The first-order timestamp is removed once the difference has been stored
    data.drop(columns=[first_order_column], inplace=True)
    return data


def transform_datetime(data, column):
    """Replace a datetime column with day-of-week, day, month and year columns.

    New columns are named ``<column>_day_of_week``, ``<column>_day``,
    ``<column>_month`` and ``<column>_year`` and are appended in that order.
    The source column is then dropped. ``data`` is modified in place and the
    same object is returned.
    """
    stamps = data[column].dt

    # (new-column suffix, datetime accessor attribute), in output order
    calendar_parts = (
        ('_day_of_week', 'dayofweek'),
        ('_day', 'day'),
        ('_month', 'month'),
        ('_year', 'year'),
    )
    for suffix, accessor_attr in calendar_parts:
        data[column + suffix] = getattr(stamps, accessor_attr)

    data.drop(columns=[column], inplace=True)
    return data

def date_transformations(data):
    """Apply both date feature steps to ``data`` and return the result.

    First derives 'days_since_first_order' from 'order_date' and
    'first_order_datetime', then expands 'order_date' into its calendar
    parts. Both helpers modify the frame they receive.
    """
    with_tenure = days_since_first_order(data, 'order_date', 'first_order_datetime')
    return transform_datetime(with_tenure, 'order_date')

In [6]:
# Rebuild the modelling frame from the untouched `data` frame so this cell can
# be re-run: the helpers below mutate their input and drop source columns, so
# running them twice on the same `merged_df` would raise KeyError.
merged_df = convert_dtypes(data.copy())
merged_df['payment_method'] = merged_df['payment_method'].apply(group_payment_methods)
merged_df = date_transformations(merged_df)

# Identifier columns are removed before modelling
merged_df = merged_df.drop(columns=['order_id', 'customer_id'])

# Features / target
X = merged_df.drop(columns=['is_fraud'])
y = merged_df['is_fraud']

# 80/20 split, stratified on the target, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [8]:
from ExperimentTrackers import PhaseOneExperimentTracker

In [8]:
# class PhaseOneExperimentTracker(BaseExperimentTracker):
#     def run_experiments(self, experiment_combinations, X_train, y_train, X_test, y_test, numeric_columns, categorical_cols):
#         """Run experiments for Phase 1."""
        
#         for config in experiment_combinations:
#             run_id = self._generate_run_id(config)
            
#             # Skip if this configuration has already been run
#             if run_id in self.completed_runs:
#                 print(f"Skipping completed run: {run_id}")
#                 continue
            
#             print(f"Starting run: {run_id}")
            
#             try:
#                 with mlflow.start_run(run_name=run_id):
#                     # Add descriptive run tags
#                     mlflow.set_tag("model_type", config['models']['name'])
#                     mlflow.set_tag("scaler_type", config['scaler'].__class__.__name__)
#                     mlflow.set_tag("encoding_applied", str(config['encode']['apply']))
#                     mlflow.set_tag("dataset", "X_train")


#                     # Build preprocessing steps
#                     transformers = []
#                     if config['scaler']:
#                         transformers.append(('scaler', config['scaler'], numeric_columns))
                    
#                     if config['encode']['apply']:
#                         transformers.append(('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=True), categorical_cols))
#                     else:
#                         transformers.append(('drop_categorical', 'drop', categorical_cols))
                    
#                     preprocessor = ColumnTransformer(transformers=transformers, remainder='passthrough')
#                     pipeline = Pipeline(steps=[
#                         ('preprocessor', preprocessor),
#                         ('model', config['models']['instance']),
#                     ])
                    
#                     # Train the pipeline
#                     pipeline.fit(X_train, y_train.to_numpy().ravel())
#                     predictions = pipeline.predict(X_test)
#                     probabilities = pipeline.predict_proba(X_test)[:, 1]  # For metrics requiring probabilities

#                     # Evaluate metrics
#                     metrics = self.evaluate_metrics(y_test, predictions, probabilities)
                    
#                     # Log parameters, metrics, and model
#                     mlflow.log_params(config)
#                     mlflow.log_metrics(metrics)
#                     mlflow.sklearn.log_model(pipeline, "model", input_example=X_train.iloc[0:1])
                    
#                     # Mark this run as completed
#                     self.completed_runs.add(run_id)
#                     self._save_checkpoint()
                    
#                     print(f"Completed run: {run_id}")
            
#             except Exception as e:
#                 print(f"Error in run {run_id}: {str(e)}")
# #               continue

In [9]:
# Import necessary models
# Import XGBoost and LightGBM Classifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier


# Define the search space: one list of options per pipeline knob.
# Commented-out model entries are kept as toggles for other runs.
search_space = {
    'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler(), None],
    'encode': [{'apply': True, 'columns': ['categorical_col']}, {'apply': False}],
    'models': [
        # {'name': 'LogisticRegression', 'instance': LogisticRegression()},
        # {'name': 'RandomForest', 'instance': RandomForestClassifier()},
        # {'name': 'XGBoost', 'instance': XGBClassifier()},
        # {'name': 'LightGBM', 'instance': LGBMClassifier()},
        {'name': 'KNeighbors', 'instance': KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree')},
        # {'name': 'GaussianNB', 'instance': GaussianNB()},
        # {'name': 'SVM_Linear', 'instance': LinearSVC()},
        # {'name': 'DecisionTree', 'instance': DecisionTreeClassifier()},
    ]
}

# Generate all combinations of the search space (Cartesian product of the option lists).
# Note: the product reuses the same scaler/model objects in every configuration
# they appear in; no copies are made here.
keys, values = zip(*search_space.items())
experiment_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

# Column groups passed to the experiment tracker
categorical_cols = ['payment_method', 'country_code', 'collect_type']
numeric_columns = [
    'order_value',
    'refund_value',
    'num_items_ordered',
    'num_orders_last_50days',
    'num_cancelled_orders_last_50days',
    'num_refund_orders_last_50days',
    'num_associated_customers',
    'total_payment_last_50days',
    'days_since_first_order',
    'order_date_day_of_week',
    'order_date_day',
    'order_date_month',
    'order_date_year',
]

In [None]:
# Initialize the tracker
# NOTE(review): the string is presumably the experiment name - confirm against
# PhaseOneExperimentTracker, which is defined outside this notebook.
tracker = PhaseOneExperimentTracker("Testing_Preprocessing_Methods")

# NOTE(review): this bare attribute access is not the cell's last expression, so
# it displays nothing. Any checkpoint loading presumably happens inside the
# tracker itself - confirm, and move this line to its own cell to inspect it.
tracker.completed_runs

# Run every configuration in `experiment_combinations` on the train/test split
tracker.run_experiments(
    experiment_combinations=experiment_combinations,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    numeric_columns=numeric_columns,
    categorical_cols=categorical_cols
)

Starting run: KN_Standard_Enc
