# Imports

In [9]:
import os

import dagshub
import mlflow
import joblib

from mlflow.models.signature import infer_signature
from tqdm import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Lift pandas' display limits so wide frames and long cell values are shown in full.
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

# Every data path below is built relative to the kernel's working directory.
cwd = os.getcwd()

# Print the path of every file found under the input directory.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk(f'{cwd}/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Load the train and test tables.
train_df = pd.read_csv(f'{cwd}/input/train.csv')
test_df = pd.read_csv(f'{cwd}/input/test.csv')

/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/test_transaction.csv
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/.DS_Store
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/train_identity.csv
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/test.csv
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/transformed_test_df_XGBoost.csv
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/train.csv
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/test_identity.csv
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/transformed_test_df_LogisticRegression.csv
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/sample_submission.csv
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/train_transaction.csv


In [2]:
# Connect this session to the DagsHub repository with MLflow enabled
# (presumably this is what routes the mlflow.* calls below to DagsHub — confirm).
dagshub.init(
    repo_owner='TomC333',
    repo_name='IEEE-CIS-Fraud-Detection-ML',
    mlflow=True,
)

# Cleaning

In [10]:
with mlflow.start_run(run_name="RandomForest_Cleaning"):
    # Normalize '-' to '_' in the test column names before any train/test column
    # comparison (presumably to match the spelling used in train — confirm).
    test_df.columns = test_df.columns.str.replace('-', '_')

    # Columns with at most one distinct value carry no signal. nunique() ignores
    # NaN by default, so entirely-missing columns are caught here as well.
    nunique = train_df.nunique()
    constant_cols = nunique[nunique <= 1].index.tolist()

    train_df.drop(columns=constant_cols, inplace=True)
    test_df.drop(columns=[col for col in constant_cols if col in test_df.columns], inplace=True)
    mlflow.log_metric("const_cols_dropped", len(constant_cols))

    target_col = 'isFraud'
    feature_cols = [col for col in train_df.columns if col != target_col]
    numeric_cols = train_df[feature_cols].select_dtypes(include='number').columns.tolist()
    cat_cols = train_df[feature_cols].select_dtypes(include='object').columns.tolist()

    # Restrict imputation to columns that exist in BOTH frames. A fitted
    # SimpleImputer rejects input whose columns differ from the ones it was
    # fitted on, so fitting on every train column and then transforming a
    # filtered subset of test columns would raise as soon as one is missing.
    # Train-only columns are left untouched; they cannot be used on test anyway.
    numeric_cols_test = [col for col in numeric_cols if col in test_df.columns]
    cat_cols_test = [col for col in cat_cols if col in test_df.columns]

    num_imputer = SimpleImputer(strategy='median')
    cat_imputer = SimpleImputer(strategy='constant', fill_value="Unknown")

    # Statistics are learned from train only and then re-used on test.
    # The emptiness guards avoid fitting an imputer on a zero-column frame.
    if numeric_cols_test:
        train_df[numeric_cols_test] = num_imputer.fit_transform(train_df[numeric_cols_test])
        test_df[numeric_cols_test] = num_imputer.transform(test_df[numeric_cols_test])
    if cat_cols_test:
        train_df[cat_cols_test] = cat_imputer.fit_transform(train_df[cat_cols_test])
        test_df[cat_cols_test] = cat_imputer.transform(test_df[cat_cols_test])

    mlflow.log_param("num_imputer", "median")
    mlflow.log_param("cat_imputer", "constant_unknown")
    # Log the number of columns that were actually passed through each imputer.
    mlflow.log_metric("num_features_filled", len(numeric_cols_test))
    mlflow.log_metric("cat_features_filled", len(cat_cols_test))


🏃 View run RandomForest_Cleaning at: https://dagshub.com/TomC333/IEEE-CIS-Fraud-Detection-ML.mlflow/#/experiments/0/runs/9cdc8762b64246949cf7e4ef1b3b1fa4
🧪 View experiment at: https://dagshub.com/TomC333/IEEE-CIS-Fraud-Detection-ML.mlflow/#/experiments/0


# Feature Engineering

In [11]:
def _build_engineered_features(df):
    """Derive the engineered feature columns from a transactions frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'TransactionDT', 'TransactionAmt', 'card1' and 'card2'.
        'TransactionDT' is divided by 3600 and 86400 below, i.e. it is treated
        as a count of seconds (assumption — confirm against the data source).

    Returns
    -------
    pandas.DataFrame
        Frame indexed like ``df`` with columns 'Transaction_hour',
        'Transaction_dayofweek', 'TransactionAmt_log' and 'card12'
        ('card12' is still the raw "<card1>_<card2>" string at this point).
    """
    return pd.DataFrame({
        'Transaction_hour': (df['TransactionDT'] / 3600).astype(int) % 24,
        'Transaction_dayofweek': (df['TransactionDT'] / 86400).astype(int) % 7,
        'TransactionAmt_log': np.log1p(df['TransactionAmt']),
        'card12': df['card1'].astype(str) + '_' + df['card2'].astype(str),
    })


with mlflow.start_run(run_name="RandomForest_Feature_Engineering"):
    created_features = ['Transaction_hour', 'Transaction_dayofweek', 'TransactionAmt_log', 'card12']

    # Make the cell safe to re-run: without this, a second execution would
    # concatenate another copy of the engineered columns and leave the frames
    # with duplicate column names.
    stale_train = [col for col in created_features if col in train_df.columns]
    stale_test = [col for col in created_features if col in test_df.columns]
    if stale_train:
        train_df.drop(columns=stale_train, inplace=True)
    if stale_test:
        test_df.drop(columns=stale_test, inplace=True)

    # Keep only columns shared by both frames (plus the target in train).
    dropped_train_columns = [col for col in train_df.columns if col not in test_df.columns and col != 'isFraud']
    dropped_test_columns = [col for col in test_df.columns if col not in train_df.columns]

    train_df.drop(columns=dropped_train_columns, inplace=True)
    test_df.drop(columns=dropped_test_columns, inplace=True)

    mlflow.log_param("train_dropped_columns", dropped_train_columns)
    mlflow.log_param("test_dropped_columns", dropped_test_columns)

    cat_cols = train_df.select_dtypes(include='object').columns

    for col in cat_cols:
        le = LabelEncoder()
        # Cast once and re-use; the encoder is fitted on the union of train and
        # test values so that transform() never meets an unseen label.
        train_values = train_df[col].astype(str)
        test_values = test_df[col].astype(str)
        combined_values = pd.concat([train_values, test_values]).unique()

        le.fit(combined_values)
        train_df[col] = le.transform(train_values)
        test_df[col] = le.transform(test_values)

        mlflow.log_param(f"{col}_label_encoded", True)

    new_features_train = _build_engineered_features(train_df)
    new_features_test = _build_engineered_features(test_df)

    # Encode the card1/card2 pair with one encoder fitted on both frames so the
    # integer codes agree between train and test.
    card12_le = LabelEncoder()
    combined_card12 = pd.concat([
        new_features_train['card12'],
        new_features_test['card12']
    ]).astype(str).unique()
    card12_le.fit(combined_card12)

    new_features_train['card12'] = card12_le.transform(new_features_train['card12'].astype(str))
    new_features_test['card12'] = card12_le.transform(new_features_test['card12'].astype(str))

    # Attach all new columns with a single concat per frame.
    train_df = pd.concat([train_df, new_features_train], axis=1)
    test_df = pd.concat([test_df, new_features_test], axis=1)

    mlflow.log_param("created_features", created_features)

🏃 View run RandomForest_Feature_Engineering at: https://dagshub.com/TomC333/IEEE-CIS-Fraud-Detection-ML.mlflow/#/experiments/0/runs/bcb6516f6e73443f92493dbe64554cdf
🧪 View experiment at: https://dagshub.com/TomC333/IEEE-CIS-Fraud-Detection-ML.mlflow/#/experiments/0


# Feature Selection

In [12]:
with mlflow.start_run(run_name="RandomForest_Feature_Selection"):
    # Separate the target from the inputs; the row identifier is not a feature.
    y = train_df['isFraud']
    X = train_df.drop(columns=['TransactionID', 'isFraud'])
    X_test = test_df.drop(columns=['TransactionID'])

    # A small forest fitted on all features is used only to rank them.
    temp_clf = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1).fit(X, y)
    importances = pd.Series(temp_clf.feature_importances_, index=X.columns)

    # Keep features whose importance is strictly above the 30th percentile,
    # i.e. roughly the top 70% of features.
    importance_cutoff = np.percentile(importances, 30)
    selected_features = [
        feature_name
        for feature_name, importance in importances.items()
        if importance > importance_cutoff
    ]

    X_sel = X[selected_features]
    X_test_sel = X_test[selected_features]
    mlflow.log_param("features_selected", selected_features)

    # Write the reduced test matrix, with its identifier re-attached, to disk.
    transformed_test_df = X_test_sel.assign(TransactionID=test_df['TransactionID'])
    transformed_test_df.to_csv(f'{cwd}/input/transformed_test_df_RandomForest.csv', index=False)

🏃 View run RandomForest_Feature_Selection at: https://dagshub.com/TomC333/IEEE-CIS-Fraud-Detection-ML.mlflow/#/experiments/0/runs/26bb2527d96f4c7d9a82997051a90907
🧪 View experiment at: https://dagshub.com/TomC333/IEEE-CIS-Fraud-Detection-ML.mlflow/#/experiments/0


# Optimization & Model Training

In [None]:
with mlflow.start_run(run_name="RandomForest_Training"):
    clf = RandomForestClassifier(random_state=42, n_jobs=-1)

    # Search space sampled by the randomized search below.
    param_dist = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'max_features': ['sqrt', 'log2']
    }

    # 10 sampled candidates x 3 folds, ranked by ROC AUC.
    search = RandomizedSearchCV(
        clf,
        param_distributions=param_dist,
        n_iter=10,
        cv=3,
        scoring='roc_auc',
        random_state=42,
        n_jobs=-1,
        verbose=3
    )

    search.fit(X_sel, y)

    best_clf = search.best_estimator_

    mlflow.log_params(search.best_params_)
    # Mean cross-validated ROC AUC of the winning candidate. Unlike the train_*
    # metrics below, this score comes from held-out folds.
    mlflow.log_metric("cv_best_auc", search.best_score_)

    # NOTE: these predictions are made on the same rows the model was fitted on,
    # so the metrics are logged under a "train_" prefix and are optimistic.
    y_pred = best_clf.predict(X_sel)
    y_prob = best_clf.predict_proba(X_sel)[:, 1]

    acc = accuracy_score(y, y_pred)
    auc = roc_auc_score(y, y_prob)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)

    mlflow.log_metric("train_accuracy", acc)
    mlflow.log_metric("train_auc", auc)
    mlflow.log_metric("train_precision", precision)
    mlflow.log_metric("train_recall", recall)
    mlflow.log_metric("train_f1_score", f1)

    mlflow.log_param("model_type", "RandomForestClassifier")

    model_path = f"{cwd}/models/random_forest.pkl"
    # Create the target directory first: joblib.dump does not create parent
    # directories, and failing here would throw away the finished search.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    joblib.dump(best_clf, model_path)
    mlflow.log_artifact(model_path)

    print(f"Model saved to {model_path}")

    # The first five rows serve as the example input stored with the model.
    input_example = X_sel[:5]
    signature = infer_signature(X_sel, y_pred)

    mlflow.sklearn.log_model(
        sk_model=best_clf,
        artifact_path="RandomForest_model",
        signature=signature,
        input_example=input_example
    )


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 3/3] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.556 total time= 1.0min
[CV 1/3] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=0.876 total time= 6.4min
[CV 2/3] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.867 total time= 1.1min
[CV 2/3] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=0.905 total time= 6.5min
[CV 1/3] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.844 total time= 1.1min
[CV 3/3] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=0.477 total time= 6.5min
