# Imports

In [1]:
import os 

import dagshub
import mlflow
import joblib

from mlflow.models.signature import infer_signature
from tqdm import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Set some options...
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

cwd = os.getcwd()

for dirname, _, filenames in os.walk(f'{cwd}/input'):
  for filename in filenames:
    print(os.path.join(dirname, filename))

train_df = pd.read_csv(f'{cwd}/input/train.csv')
test_df = pd.read_csv(f'{cwd}/input/test.csv')

/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/test_transaction.csv
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/.DS_Store
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/train_identity.csv
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/test.csv
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/train.csv
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/test_identity.csv
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/transformed_test_df_LogisticRegression.csv
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/sample_submission.csv
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/train_transaction.csv


In [2]:
dagshub.init(repo_owner='TomC333', repo_name='IEEE-CIS-Fraud-Detection-ML', mlflow=True)

# Cleaning

In [3]:
with mlflow.start_run(run_name="XGBoost_Cleaning"):
    nunique = train_df.nunique()
    constant_cols = nunique[nunique <= 1].index.tolist()
    train_df.drop(columns=constant_cols, inplace=True)
    test_df.drop(columns=constant_cols, inplace=True)

    mlflow.log_metric("const_cols_dropped", len(constant_cols))

🏃 View run XGBoost_Cleaning at: https://dagshub.com/TomC333/IEEE-CIS-Fraud-Detection-ML.mlflow/#/experiments/0/runs/17a89d369631477e83b569f5e2564fda
🧪 View experiment at: https://dagshub.com/TomC333/IEEE-CIS-Fraud-Detection-ML.mlflow/#/experiments/0


# Feature Engineering

In [4]:
with mlflow.start_run(run_name="XGBoost_Feature_Engineering"):
    dropped_columns = []
    for col in train_df.columns:
        if col not in test_df.columns and col != 'isFraud':
            dropped_columns.append(col)
            train_df.drop(columns=[col], inplace=True)

    for col in test_df.columns:
        if col not in train_df.columns:
            dropped_columns.append(col)
            test_df.drop(columns=[col], inplace=True)

    mlflow.log_param("columns_dropped", dropped_columns)

    cat_cols = train_df.select_dtypes(include='object').columns

    for col in cat_cols:
        le = LabelEncoder()
        combined_values = pd.concat([
            train_df[col].astype(str),
            test_df[col].astype(str)
        ]).unique()
        le.fit(combined_values)

        train_df[col] = le.transform(train_df[col].astype(str))
        test_df[col] = le.transform(test_df[col].astype(str))

        mlflow.log_param(f"{col}_label_encoded", True)

    new_features_train = pd.DataFrame()
    new_features_test = pd.DataFrame()

    new_features_train['Transaction_hour'] = (train_df['TransactionDT'] / 3600).astype(int) % 24
    new_features_test['Transaction_hour'] = (test_df['TransactionDT'] / 3600).astype(int) % 24

    new_features_train['Transaction_dayofweek'] = ((train_df['TransactionDT'] / 86400).astype(int)) % 7
    new_features_test['Transaction_dayofweek'] = ((test_df['TransactionDT'] / 86400).astype(int)) % 7

    new_features_train['Log_TransactionAmt'] = np.log1p(train_df['TransactionAmt'])
    new_features_test['Log_TransactionAmt'] = np.log1p(test_df['TransactionAmt'])

    for col in ['card1', 'card2', 'addr1', 'addr2']:
        if col in train_df.columns:
            train_count = train_df[col].value_counts()
            test_count = test_df[col].value_counts()
            new_features_train[f'{col}_count'] = train_df[col].map(train_count)
            new_features_test[f'{col}_count'] = test_df[col].map(test_count)

    train_df = pd.concat([train_df, new_features_train], axis=1)
    test_df = pd.concat([test_df, new_features_test], axis=1)

    additional_features = list(new_features_train.columns)
    mlflow.log_param("created_features", additional_features)

🏃 View run XGBoost_Feature_Engineering at: https://dagshub.com/TomC333/IEEE-CIS-Fraud-Detection-ML.mlflow/#/experiments/0/runs/17bcb5593caf476ca4fd7c3d2d9267fc
🧪 View experiment at: https://dagshub.com/TomC333/IEEE-CIS-Fraud-Detection-ML.mlflow/#/experiments/0


# Feature Selection

In [10]:
import xgboost as xgb

from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectFromModel


with mlflow.start_run(run_name="XGBoost_Feature_Selection"):
    X = train_df.drop(columns=['TransactionID', 'isFraud'])
    y = train_df['isFraud']

    X_test = test_df.drop(columns=['TransactionID'])

    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(X)
    X_test_scaled = scaler.transform(X_test)

    mlflow.log_param("scaler", "RobustScaler")

    xgb_selector = xgb.XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42, n_jobs=-1)
    xgb_selector.fit(X_scaled, y)

    selection = SelectFromModel(xgb_selector, threshold="median", prefit=True)
    X_selected = selection.transform(X_scaled)
    X_test_selected = selection.transform(X_test_scaled)

    mlflow.log_param("features_after_selection", X_selected.shape[1])

    transformed_test_df = pd.DataFrame(X_test_selected)
    transformed_test_df['TransactionID'] = test_df['TransactionID']
    transformed_test_df.to_csv(f'{cwd}/input/transformed_test_df_XGBoost.csv', index=False)

🏃 View run XGBoost_Feature_Selection at: https://dagshub.com/TomC333/IEEE-CIS-Fraud-Detection-ML.mlflow/#/experiments/0/runs/02609b18e6884d5d867946ca5066948c
🧪 View experiment at: https://dagshub.com/TomC333/IEEE-CIS-Fraud-Detection-ML.mlflow/#/experiments/0


# Optimization & Model Training

In [14]:
import optuna

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 5),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 5),
        "random_state": 42,
        "n_jobs": -1
    }
    
    clf = xgb.XGBClassifier(**params)
    clf.fit(X_selected, y)
    preds = clf.predict_proba(X_selected)[:, 1]
    auc = roc_auc_score(y, preds)
    return auc

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

best_params = study.best_params

with mlflow.start_run(run_name="XGBoost_Training"):
    clf = xgb.XGBClassifier(**best_params, random_state=42, n_jobs=-1)
    clf.fit(X_selected, y)

    y_pred = clf.predict(X_selected)
    y_prob = clf.predict_proba(X_selected)[:, 1]

    acc = accuracy_score(y, y_pred)
    auc = roc_auc_score(y, y_prob)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)

    mlflow.log_metric("train_accuracy", acc)
    mlflow.log_metric("train_auc", auc)
    mlflow.log_metric("train_precision", precision)
    mlflow.log_metric("train_recall", recall)
    mlflow.log_metric("train_f1_score", f1)

    mlflow.log_params(best_params)

    model_path = f"{cwd}/models/xgboost_model.pkl"
    joblib.dump(clf, model_path)
    mlflow.log_artifact(model_path)

    input_example = X_selected[:5]
    signature = infer_signature(X_selected, y_pred)

    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="XGBoost_model",
        signature=signature,
        input_example=input_example
    )

    print(f"XGBoost model saved to {model_path}")

[I 2025-04-27 15:04:07,511] A new study created in memory with name: no-name-a85acf6c-463a-46ce-8288-9432806e667b
[I 2025-04-27 15:04:13,788] Trial 0 finished with value: 0.9562061409674016 and parameters: {'n_estimators': 448, 'max_depth': 4, 'learning_rate': 0.20989765731395552, 'subsample': 0.5506898469643743, 'colsample_bytree': 0.8133545450179274, 'gamma': 4.891862349076971, 'reg_alpha': 0.5704796768415071, 'reg_lambda': 4.269164315116835}. Best is trial 0 with value: 0.9562061409674016.
[I 2025-04-27 15:04:19,107] Trial 1 finished with value: 0.971426243824122 and parameters: {'n_estimators': 297, 'max_depth': 5, 'learning_rate': 0.2973212071853808, 'subsample': 0.6670893392354104, 'colsample_bytree': 0.5346874919778014, 'gamma': 0.068087265542493, 'reg_alpha': 4.385667486720849, 'reg_lambda': 1.687017988438924}. Best is trial 1 with value: 0.971426243824122.
[I 2025-04-27 15:04:30,096] Trial 2 finished with value: 0.9742013369888054 and parameters: {'n_estimators': 823, 'max_dep

XGBoost model saved to /Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/models/xgboost_model.pkl
🏃 View run XGBoost_Training at: https://dagshub.com/TomC333/IEEE-CIS-Fraud-Detection-ML.mlflow/#/experiments/0/runs/6e1bbd52166f44c98a4d340b51bc3a42
🧪 View experiment at: https://dagshub.com/TomC333/IEEE-CIS-Fraud-Detection-ML.mlflow/#/experiments/0
