# Imports

In [1]:
import os 

import dagshub
import mlflow
import joblib

from mlflow.models.signature import infer_signature
from tqdm import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Pandas display: never truncate columns, rows or cell contents.
for option_name in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option_name, None)

# All data files are expected under "<current working directory>/input".
cwd = os.getcwd()
input_dir = f'{cwd}/input'

# Print the full path of every file found below the input directory.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk(input_dir)
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

# Load the train and test tables.
train_df = pd.read_csv(f'{input_dir}/train.csv')
test_df = pd.read_csv(f'{input_dir}/test.csv')

/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/test_transaction.csv
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/.DS_Store
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/train_identity.csv
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/test.csv
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/train.csv
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/test_identity.csv
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/transformed_test_df_LogisticRegression.csv
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/sample_submission.csv
/Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/input/train_transaction.csv


In [2]:
# Initialise the DagsHub integration for this repository with MLflow enabled
# (the mlflow runs started in the cells below are reported under this repo).
dagshub.init(
    repo_owner='TomC333',
    repo_name='IEEE-CIS-Fraud-Detection-ML',
    mlflow=True,
)

# Cleaning

In [3]:
with mlflow.start_run(run_name="LogisticRegression_Cleaning"):
    # Cleaning stage: drop constant columns, then impute missing values.
    # Everything that is "learned" here (the constant-column list, the means,
    # the most frequent values) comes from train_df only and is then applied
    # to test_df, so both frames go through the same transformation.

    # remove constant cols (at most one distinct non-null value in train)
    nunique = train_df.nunique()
    constant_cols = nunique[nunique <= 1].index.tolist()
    train_df.drop(columns=constant_cols, inplace=True)
    # Drop them from test *before* any test column list is built, otherwise the
    # lists would still name the dropped columns. errors='ignore' because a
    # train column is not guaranteed to exist in test.
    test_df.drop(columns=constant_cols, inplace=True, errors='ignore')

    mlflow.log_metric("const_cols_dropped", len(constant_cols))

    # fill NaNs with SimpleImputer (mean for numeric, most_frequent for categorical)
    numeric_cols = train_df.select_dtypes(include='number').columns
    cat_cols = train_df.select_dtypes(include='object').columns

    num_imputer = SimpleImputer(strategy='mean')
    cat_imputer = SimpleImputer(strategy='most_frequent')

    train_df[numeric_cols] = num_imputer.fit_transform(train_df[numeric_cols])
    train_df[cat_cols] = cat_imputer.fit_transform(train_df[cat_cols])

    mlflow.log_metric("num_features_filled", len(numeric_cols))
    mlflow.log_metric("cat_features_filled", len(cat_cols))

    mlflow.log_param("num_imputer", "mean")
    mlflow.log_param("cat_imputer", "most_frequent")

    # Fill the test dataframe with the statistics fitted on train (no re-fit on
    # test). statistics_ holds one fill value per fitted column, in column
    # order. Only columns that also exist in test are used; train-only columns
    # such as the target are skipped, and test-only columns are left untouched.
    train_fill_values = {
        col: value
        for cols, imputer in ((numeric_cols, num_imputer), (cat_cols, cat_imputer))
        for col, value in zip(cols, imputer.statistics_)
        if col in test_df.columns and not pd.isna(value)
    }
    if train_fill_values:
        test_df = test_df.fillna(value=train_fill_values)

🏃 View run LogisticRegression_Cleaning at: https://dagshub.com/TomC333/IEEE-CIS-Fraud-Detection-ML.mlflow/#/experiments/0/runs/a2c36967c0754da2a61c9fc9ecca4951
🧪 View experiment at: https://dagshub.com/TomC333/IEEE-CIS-Fraud-Detection-ML.mlflow/#/experiments/0


# Feature Engineering

In [4]:
def _add_engineered_features(df, product_cd_count):
    """Return a copy of ``df`` with the engineered feature columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the 'TransactionDT' and 'ProductCD' columns.
    product_cd_count : pandas.Series
        Number of rows per 'ProductCD' value, indexed by that value. The same
        series must be used for train and test so a category maps to the same
        number in both frames.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus 'Transaction_hour', 'Transaction_dayofweek' and
        'ProductCD_count'. Existing columns with these names are overwritten,
        so applying the function twice does not create duplicate columns.
    """
    # assumes TransactionDT is an elapsed-time offset in seconds — TODO confirm
    transaction_dt = df['TransactionDT']
    return df.assign(
        Transaction_hour=(transaction_dt / 3600).astype(int) % 24,
        Transaction_dayofweek=(transaction_dt / 86400).astype(int) % 7,
        # values that never occur in the count series get a count of 0
        ProductCD_count=df['ProductCD'].map(product_cd_count).fillna(0).astype('int64'),
    )


with mlflow.start_run(run_name="LogisticRegression_Feature_Engineering"):
    # Drop columns from train_df that are not in test_df, we can't use them for prediction :D
    # ('isFraud' is the target, so it has to stay in train_df.)
    dropped_columns = [
        col for col in train_df.columns
        if col not in test_df.columns and col != 'isFraud'
    ]
    train_df = train_df.drop(columns=dropped_columns)  # one drop instead of one per column
    mlflow.log_param("train_dropped_columns", dropped_columns)

    # Drop columns from test_df that are not in train_df
    dropped_columns = [col for col in test_df.columns if col not in train_df.columns]
    test_df = test_df.drop(columns=dropped_columns)
    mlflow.log_param("test_dropped_columns", dropped_columns)

    cat_cols = train_df.select_dtypes(include='object').columns

    # for each categorical column, fit on combined train+test data to capture all categories
    for col in cat_cols:
        le = LabelEncoder()

        # combine train and test so transform() never meets an unseen label
        combined_values = pd.concat([
            train_df[col].astype(str),
            test_df[col].astype(str)
        ]).unique()

        le.fit(combined_values)

        # Transform original train and test columns
        train_df[col] = le.transform(train_df[col].astype(str))
        test_df[col] = le.transform(test_df[col].astype(str))

        mlflow.log_param(f"{col}_unique_values", len(combined_values))
        mlflow.log_param(f"{col}_label_encoded", True)

    # create some new features
    # The ProductCD frequencies are counted on train only and reused for test:
    # counting on test separately would give the same category a different
    # value than the one the model is trained on.
    product_cd_count = train_df['ProductCD'].value_counts()
    train_df = _add_engineered_features(train_df, product_cd_count)
    test_df = _add_engineered_features(test_df, product_cd_count)

    additional_features = ['Transaction_hour', 'Transaction_dayofweek', 'ProductCD_count']

    mlflow.log_param("created_features", additional_features)
    # Describes what this cell actually does (no log-transformed feature is created here).
    mlflow.log_param("feature_creation", "Time-based and count features + Label encoding")

🏃 View run LogisticRegression_Feature_Engineering at: https://dagshub.com/TomC333/IEEE-CIS-Fraud-Detection-ML.mlflow/#/experiments/0/runs/ce26c8ac657947329064ff549d3723af
🧪 View experiment at: https://dagshub.com/TomC333/IEEE-CIS-Fraud-Detection-ML.mlflow/#/experiments/0


# Feature Selection

In [5]:
with mlflow.start_run(run_name="LogisticRegression_Feature_Selection"):
    # Feature matrix and target: the row identifier and the label are not features.
    X = train_df.drop(columns=['TransactionID', 'isFraud'])
    y = train_df['isFraud']

    # remove low-variance features (variance threshold 0.01, measured on the
    # unscaled training values)
    sel = VarianceThreshold(threshold=0.01)
    X_sel = sel.fit_transform(X)
    mlflow.log_param("features_after_variance_threshold", X_sel.shape[1])

    # apply the same to test dataframe; selecting by X.columns guarantees the
    # test features are the same columns, in the same order, the selector was
    # fitted on (and fails loudly if one is missing)
    X_test = test_df[X.columns]
    X_test_sel = sel.transform(X_test)
    mlflow.log_param("features_after_variance_threshold_test", X_test_sel.shape[1])

    # Rebuild a labelled frame on test_df's own index so the TransactionID
    # assignment below lines up row by row whatever that index is.
    transformed_test_df = pd.DataFrame(
        X_test_sel,
        columns=X.columns[sel.get_support()],
        index=test_df.index,
    )
    transformed_test_df['TransactionID'] = test_df['TransactionID']  # keep transaction id
    transformed_test_df.to_csv(f'{cwd}/input/transformed_test_df_LogisticRegression.csv', index=False)

🏃 View run LogisticRegression_Feature_Selection at: https://dagshub.com/TomC333/IEEE-CIS-Fraud-Detection-ML.mlflow/#/experiments/0/runs/49e91f02fc1d41e5a14106f2653a591e
🧪 View experiment at: https://dagshub.com/TomC333/IEEE-CIS-Fraud-Detection-ML.mlflow/#/experiments/0


# Model Training

In [6]:
with mlflow.start_run(run_name="LogisticRegression_Training"):
    # Single source of truth for the hyperparameters: the same dict configures
    # the estimator and is logged to MLflow, so the two cannot drift apart.
    model_params = {"solver": "liblinear", "max_iter": 1000}
    clf = LogisticRegression(random_state=42, **model_params)

    clf.fit(X_sel, y)

    # Predict on the training set (there are no labels for test). These are
    # in-sample scores, hence the "train_" prefix on every metric name.
    y_pred = clf.predict(X_sel)
    y_prob = clf.predict_proba(X_sel)[:, 1]  # probability of the positive class

    acc = accuracy_score(y, y_pred)
    auc = roc_auc_score(y, y_prob)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)

    mlflow.log_metric("train_accuracy", acc)
    mlflow.log_metric("train_auc", auc)
    mlflow.log_metric("train_precision", precision)
    mlflow.log_metric("train_recall", recall)
    mlflow.log_metric("train_f1_score", f1)

    mlflow.log_param("model_type", "LogisticRegression")
    for param_name, param_value in model_params.items():
        mlflow.log_param(param_name, param_value)

    # Persist the fitted model locally and attach the file to the run.
    model_path = f"{cwd}/models/logistic_regression.pkl"
    # joblib.dump does not create missing directories
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    joblib.dump(clf, model_path)
    mlflow.log_artifact(model_path)

    print(f"Model saved to {model_path}")

    # Also log the model in MLflow's sklearn format, with an input/output
    # signature inferred from the training data and a small input example.
    input_example = X_sel[:5]
    signature = infer_signature(X_sel, y_pred)

    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="LogisticRegression_model",
        signature=signature,
        input_example=input_example
    )

Model saved to /Users/davitdadiani/Desktop/git/IEEE-CIS-Fraud-Detection-ML/models/logistic_regression.pkl
🏃 View run LogisticRegression_Training at: https://dagshub.com/TomC333/IEEE-CIS-Fraud-Detection-ML.mlflow/#/experiments/0/runs/69a0abeee58c49829536bf32f95f4c7c
🧪 View experiment at: https://dagshub.com/TomC333/IEEE-CIS-Fraud-Detection-ML.mlflow/#/experiments/0
