In [None]:
import os

import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# === SETTINGS ===
# Everything a reader is likely to tune lives here; the model cells only read these names.

# True  -> hyperparameters are searched with GridSearchCV
# False -> each model is fitted once with its hand-picked *_PARAMS dict
USE_TUNING = True

# Number of cross-validation folds used by every grid search.
CV_FOLDS = 5

# Metrics computed for every candidate during the search.
SCORING = [
    "precision",
    "recall",
    "f1",
    "roc_auc",
    "average_precision",
]

# Metric (one of SCORING) that selects the best candidate and drives the final refit.
REFIT = "f1"

# Input data. Other files that have been used with this notebook:
#   "provider_features_final.csv"
#   "../data/data_smote_resampled.csv"
# NOTE(review): the file name suggests the rows were resampled (SMOTE-ENN) before the
# train/test split performed in this notebook; if so, held-out scores may be optimistic
# -- confirm how the file was produced.
csv_path = "../data/data_smoteenn_cleaned.csv"

# low_memory=False makes pandas read the file in one pass instead of in chunks.
df = pd.read_csv(csv_path, low_memory=False)
print(f"Loaded: {csv_path}, Shape: {df.shape}")

Loaded: ../data/data_smoteenn_cleaned.csv, Shape: (9128, 16)


In [11]:
# --- Locate the target column --------------------------------------------------
# Candidate names are tried in this order; the first one present in df wins.
possible_labels = ["_label", "PotentialFraud", "_label_raw", "label", "potentialfraud"]
label_col = next((c for c in possible_labels if c in df.columns), None)
if label_col is None:
    raise ValueError("Could not find a label column in the dataframe. Expected one of: " + ",".join(possible_labels))

print(f"Using label column: {label_col}")
print(f"Unique values: {df[label_col].unique()}")

# --- Encode the target as 0/1 --------------------------------------------------
# A single vocabulary covers every spelling this notebook accepts (legit/fraud,
# no/yes, and labels that are already numeric or boolean), so the encoding no
# longer depends on which column name happened to be found.
label_vocab = {
    "legit": 0, "fraud": 1,
    "no": 0, "yes": 1,
    "0": 0, "1": 1,
    "0.0": 0, "1.0": 1,
    "false": 0, "true": 1,
}
normalized_labels = df[label_col].astype(str).str.strip().str.lower()
df["_label_encoded"] = normalized_labels.map(label_vocab)

# Fail loudly on labels that could not be encoded instead of letting the integer
# cast further down trip over NaN with an unrelated-looking message.
unmapped_mask = df["_label_encoded"].isna()
if unmapped_mask.any():
    bad_values = df.loc[unmapped_mask, label_col].unique()[:10].tolist()
    raise ValueError(
        f"Label column '{label_col}' has {int(unmapped_mask.sum())} row(s) that cannot be "
        f"encoded as 0/1. Offending values (up to 10): {bad_values}"
    )

print(f"Label distribution:\n{df['_label_encoded'].value_counts()}")

# --- Build the feature matrix --------------------------------------------------
# Every candidate label spelling is removed, not only the ones listed in drop_cols:
# leaving e.g. "label" or "potentialfraud" among the features would leak the target.
drop_cols = ["PotentialFraud", "_label_raw", "_label", "Provider"]
non_feature_cols = set(drop_cols) | set(possible_labels)

# X_full still carries "_label_encoded"; X is the pure feature matrix.
X_full = df.drop(columns=[c for c in df.columns if c in non_feature_cols])

y = df["_label_encoded"].astype(int)
X = X_full.drop(columns=["_label_encoded"])

# --- Stratified hold-out split -------------------------------------------------
# stratify=y keeps the class ratio identical in both parts; the fixed seed makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print(f"Train: {X_train.shape}, Test: {X_test.shape}")
print(f"Train label distribution:\n{y_train.value_counts()}")

Using label column: _label
Unique values: ['legit' 'fraud']
Label distribution:
_label_encoded
0    4904
1    4224
Name: count, dtype: int64
Train: (7302, 15), Test: (1826, 15)
Train label distribution:
_label_encoded
0    3923
1    3379
Name: count, dtype: int64


In [15]:
# Share of distinct values above which a non-float column is treated as a row identifier.
ID_UNIQUE_RATIO = 0.99


def _has_id_like_name(col_name):
    """Return True when the column name carries "id" as a separate token.

    Matches "id", "id_*", "*_id" and camel-case suffixes such as "ClaimID" or
    "ProviderId". A plain substring test is deliberately avoided because it
    also hits ordinary feature names like "DeductibleAmtPaid" or "width".
    """
    name = str(col_name)
    lowered = name.lower()
    return (
        lowered == "id"
        or lowered.startswith("id_")
        or lowered.endswith("_id")
        or name.endswith(("ID", "Id"))
    )


def _is_near_unique_key(values, threshold=ID_UNIQUE_RATIO):
    """Return True when a non-float column has an (almost) distinct value per row.

    Float columns are skipped: continuous measurements are expected to be
    nearly all distinct, so high cardinality says nothing about them being keys.
    """
    if len(values) == 0 or pd.api.types.is_float_dtype(values):
        return False
    return values.nunique() / len(values) > threshold


# Identifier-like columns are detected on the training split only and then
# removed from both splits so the two frames keep the same columns.
id_cols = [c for c in X_train.columns if _has_id_like_name(c) or _is_near_unique_key(X_train[c])]
X_train_clean = X_train.drop(columns=id_cols, errors="ignore")
X_test_clean = X_test.drop(columns=id_cols, errors="ignore")

# Route columns to the numeric or categorical branch by dtype.
num_cols = X_train_clean.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X_train_clean.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

# Numeric: median imputation followed by standardisation.
# Categorical: fill gaps with the literal "missing", then one-hot encode;
# categories unseen during fit are ignored at transform time.
# Columns belonging to neither list are dropped (remainder="drop").
preprocessor = ColumnTransformer([
    ("num", Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]), num_cols),
    ("cat", Pipeline([("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
                      ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))]), cat_cols)
], remainder="drop")

print(f"Numeric: {len(num_cols)}, Categorical: {len(cat_cols)}")

Numeric: 15, Categorical: 0


In [None]:
# Logistic Regression

# Hand-picked settings, used only when USE_TUNING is False.
# C is the inverse regularization strength: smaller values regularize more.
LR_PARAMS = dict(C=0.5, solver="saga", max_iter=5000)

# Pipeline handed to the grid search: preprocessing followed by the classifier.
lr_pipe = Pipeline(steps=[
    ("preproc", preprocessor),
    ("clf", LogisticRegression(solver="saga", max_iter=5000, random_state=42)),
])

if not USE_TUNING:
    # Single fit with the manual parameters.
    log_reg = Pipeline(steps=[
        ("preproc", preprocessor),
        ("clf", LogisticRegression(random_state=42, **LR_PARAMS)),
    ])
    log_reg.fit(X_train_clean, y_train)
    print("Logistic Regression trained with:", LR_PARAMS)
else:
    # Cross-validated search over C; the winner under REFIT is refitted on the
    # whole training split and exposed as best_estimator_.
    lr_search = GridSearchCV(
        estimator=lr_pipe,
        param_grid={"clf__C": [0.01, 0.1, 0.5, 1.0]},
        scoring=SCORING,
        refit=REFIT,
        cv=CV_FOLDS,
        n_jobs=-1,
    )
    lr_search.fit(X_train_clean, y_train)
    log_reg = lr_search.best_estimator_
    print(f"Best params: {lr_search.best_params_}")

Best params: {'clf__C': 0.5}


In [None]:
# Decision Tree

# Hand-picked settings, used only when USE_TUNING is False.
#   max_depth         - maximum depth of the tree
#   min_samples_split - minimum number of samples required to split a node
#   min_samples_leaf  - minimum number of samples required in a leaf
DT_PARAMS = dict(max_depth=10, min_samples_split=20, min_samples_leaf=10)

# Pipeline handed to the grid search: preprocessing followed by the classifier.
dt_pipe = Pipeline(steps=[
    ("preproc", preprocessor),
    ("clf", DecisionTreeClassifier(random_state=42)),
])

if not USE_TUNING:
    # Single fit with the manual parameters.
    dt = Pipeline(steps=[
        ("preproc", preprocessor),
        ("clf", DecisionTreeClassifier(random_state=42, **DT_PARAMS)),
    ])
    dt.fit(X_train_clean, y_train)
    print("Decision Tree trained with:", DT_PARAMS)
else:
    # Cross-validated search; the winner under REFIT is refitted on the whole
    # training split and exposed as best_estimator_.
    dt_search = GridSearchCV(
        estimator=dt_pipe,
        param_grid={
            "clf__max_depth": [5, 10, 15],
            "clf__min_samples_split": [10, 20],
            "clf__min_samples_leaf": [5, 10],
        },
        scoring=SCORING,
        refit=REFIT,
        cv=CV_FOLDS,
        n_jobs=-1,
    )
    dt_search.fit(X_train_clean, y_train)
    dt = dt_search.best_estimator_
    print(f"Best params: {dt_search.best_params_}")

Best params: {'clf__max_depth': 10, 'clf__min_samples_leaf': 5, 'clf__min_samples_split': 10}


In [None]:
# Random Forest

# Hand-picked settings, used only when USE_TUNING is False.
#   n_estimators      - number of trees in the forest
#   max_depth         - maximum depth of each tree
#   min_samples_split - minimum number of samples required to split a node
RF_PARAMS = dict(n_estimators=200, max_depth=15, min_samples_split=5)

# Pipeline handed to the grid search: preprocessing followed by the classifier.
rf_pipe = Pipeline(steps=[
    ("preproc", preprocessor),
    ("clf", RandomForestClassifier(random_state=42, n_jobs=-1)),
])

if not USE_TUNING:
    # Single fit with the manual parameters.
    rf = Pipeline(steps=[
        ("preproc", preprocessor),
        ("clf", RandomForestClassifier(random_state=42, n_jobs=-1, **RF_PARAMS)),
    ])
    rf.fit(X_train_clean, y_train)
    print("Random Forest trained with:", RF_PARAMS)
else:
    # Cross-validated search; the winner under REFIT is refitted on the whole
    # training split and exposed as best_estimator_.
    rf_search = GridSearchCV(
        estimator=rf_pipe,
        param_grid={
            "clf__n_estimators": [100, 200],
            "clf__max_depth": [10, 15],
            "clf__min_samples_split": [5, 10],
        },
        scoring=SCORING,
        refit=REFIT,
        cv=CV_FOLDS,
        n_jobs=-1,
    )
    rf_search.fit(X_train_clean, y_train)
    rf = rf_search.best_estimator_
    print(f"Best params: {rf_search.best_params_}")

Best params: {'clf__max_depth': 15, 'clf__min_samples_split': 5, 'clf__n_estimators': 200}


In [None]:
# SVM

# Hand-picked settings, used only when USE_TUNING is False.
#   C      - regularization parameter (higher = less regularization)
#   kernel - "rbf" or "linear"
SVM_PARAMS = dict(C=10.0, kernel="rbf")

# Pipeline handed to the grid search. probability=True makes the fitted SVC
# expose predict_proba.
svm_pipe = Pipeline(steps=[
    ("preproc", preprocessor),
    ("clf", SVC(probability=True, random_state=42)),
])

if not USE_TUNING:
    # Single fit with the manual parameters.
    svm = Pipeline(steps=[
        ("preproc", preprocessor),
        ("clf", SVC(probability=True, random_state=42, **SVM_PARAMS)),
    ])
    svm.fit(X_train_clean, y_train)
    print("SVM trained with:", SVM_PARAMS)
else:
    # Cross-validated search; the winner under REFIT is refitted on the whole
    # training split and exposed as best_estimator_.
    svm_search = GridSearchCV(
        estimator=svm_pipe,
        param_grid={
            "clf__C": [1.0, 10.0],
            "clf__kernel": ["rbf", "linear"],
        },
        scoring=SCORING,
        refit=REFIT,
        cv=CV_FOLDS,
        n_jobs=-1,
    )
    svm_search.fit(X_train_clean, y_train)
    svm = svm_search.best_estimator_
    print(f"Best params: {svm_search.best_params_}")

Best params: {'clf__C': 10.0, 'clf__kernel': 'rbf'}


In [None]:
# Gradient Boosting

# Hand-picked settings, used only when USE_TUNING is False.
#   n_estimators  - number of boosting stages
#   learning_rate - shrinks the contribution of each tree
#   max_depth     - maximum depth of the individual trees
GB_PARAMS = dict(n_estimators=200, learning_rate=0.05, max_depth=5)

# Pipeline handed to the grid search: preprocessing followed by the classifier.
gb_pipe = Pipeline(steps=[
    ("preproc", preprocessor),
    ("clf", GradientBoostingClassifier(random_state=42)),
])

if not USE_TUNING:
    # Single fit with the manual parameters.
    gb = Pipeline(steps=[
        ("preproc", preprocessor),
        ("clf", GradientBoostingClassifier(random_state=42, **GB_PARAMS)),
    ])
    gb.fit(X_train_clean, y_train)
    print("Gradient Boosting trained with:", GB_PARAMS)
else:
    # Cross-validated search; the winner under REFIT is refitted on the whole
    # training split and exposed as best_estimator_.
    gb_search = GridSearchCV(
        estimator=gb_pipe,
        param_grid={
            "clf__n_estimators": [100, 200],
            "clf__learning_rate": [0.05, 0.1],
            "clf__max_depth": [3, 5],
        },
        scoring=SCORING,
        refit=REFIT,
        cv=CV_FOLDS,
        n_jobs=-1,
    )
    gb_search.fit(X_train_clean, y_train)
    gb = gb_search.best_estimator_
    print(f"Best params: {gb_search.best_params_}")

Best params: {'clf__learning_rate': 0.1, 'clf__max_depth': 5, 'clf__n_estimators': 200}


In [21]:
# Export models and data for the evaluation notebook.
# (os and joblib are imported in the notebook's top import cell.)

# Single place that decides where artifacts are written.
MODELS_DIR = "../models"
os.makedirs(MODELS_DIR, exist_ok=True)

# File stem -> object. Each entry is written to "<MODELS_DIR>/<stem>.pkl".
artifacts = {
    # Fitted model pipelines
    "log_reg": log_reg,
    "dt": dt,
    "rf": rf,
    "svm": svm,
    "gb": gb,
    # Hold-out split, plus the training split so the evaluation notebook can run CV
    "X_test_clean": X_test_clean,
    "y_test": y_test,
    "X_train_clean": X_train_clean,
    "y_train": y_train,
}

for artifact_name, artifact in artifacts.items():
    joblib.dump(artifact, os.path.join(MODELS_DIR, f"{artifact_name}.pkl"))

print(f"Models and data exported to {MODELS_DIR}/")

Models and data exported to ../models/
