In [147]:
import json
import os

import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.base import clone
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (precision_score, recall_score, f1_score, roc_auc_score,
                             average_precision_score, precision_recall_curve, roc_curve,
                             confusion_matrix, classification_report)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline


# Input dataset. Alternatives used earlier, kept for reference:
#   "provider_features_final.csv"
#   "../data/data_smote_resampled.csv"
csv_path = "../data/data_smoteenn_cleaned.csv"

# Feature table; low_memory=False makes pandas read the file in one pass
# instead of inferring dtypes chunk by chunk.
df = pd.read_csv(csv_path, low_memory=False)

# Class weights are stored as JSON next to the data; they are parsed into a
# plain Python object here and interpreted in a later cell.
with open("../data/class_weights.json") as f:
    class_weights = json.loads(f.read())

print(f"Loaded: {csv_path}")
print(f"Shape: {df.shape}")

Loaded: ../data/data_smoteenn_cleaned.csv
Shape: (9128, 16)


In [148]:
# Candidate label columns in priority order: the first one present in df wins.
possible_labels = ["_label", "PotentialFraud", "_label_raw", "label", "potentialfraud"]
label_col = next((c for c in possible_labels if c in df.columns), None)
if label_col is None:
    raise ValueError("Could not find a label column in the dataframe. Expected one of: " + ",".join(possible_labels))

print(f"Using label column: {label_col}")
print(f"Unique values: {df[label_col].unique()}")

# One vocabulary covers every supported spelling, whichever column carries it.
# Previously the vocabulary was chosen from the column *name*, so e.g. a numeric
# 0/1 column called "label" was mapped with the legit/fraud vocabulary and
# silently became all-NaN.
label_vocab = {"legit": 0, "fraud": 1, "no": 0, "yes": 1, "0": 0, "1": 1, "0.0": 0, "1.0": 1}
label_text = df[label_col].astype(str).str.strip().str.lower()
df["_label_encoded"] = label_text.map(label_vocab)

# Fail loudly and name the offending values instead of letting NaN reach astype(int).
unmapped_mask = df["_label_encoded"].isna()
if unmapped_mask.any():
    raise ValueError(
        f"Unrecognised values in label column '{label_col}': "
        f"{sorted(label_text[unmapped_mask].unique())}"
    )

print(f"Label distribution:\n{df['_label_encoded'].value_counts()}")


# Drop the provider identifier and *every* raw label spelling, so that no label
# column can leak into the features regardless of which one was selected above.
drop_cols = list(dict.fromkeys(["PotentialFraud", "_label_raw", "_label", "Provider", *possible_labels]))

X_full = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore")

y = df["_label_encoded"].astype(int)
X = X_full.drop(columns=["_label_encoded"]) if "_label_encoded" in X_full.columns else X_full

# NOTE(review): the file name in csv_path suggests the data is already SMOTE-ENN
# resampled. If so, this split happens after resampling and synthetic points can
# land in the test set, which would inflate test metrics -- confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print(f"Train: {X_train.shape}, Test: {X_test.shape}")
print(f"Train label distribution:\n{y_train.value_counts()}")

Using label column: _label
Unique values: ['legit' 'fraud']
Label distribution:
_label_encoded
0    4904
1    4224
Name: count, dtype: int64
Train: (7302, 15), Test: (1826, 15)
Train label distribution:
_label_encoded
0    3923
1    3379
Name: count, dtype: int64


In [149]:
def _looks_like_id(col_name):
    """Return True when ``id`` is a whole token of the column name.

    Matches ``id``, ``provider_id``, ``BeneID`` or ``ClaimId``, but not names
    that merely contain the letters (``Provider``, ``DeductibleAmtPaid``, ``width``).
    """
    name = str(col_name)
    tokens = name.lower().replace("-", "_").replace(" ", "_").replace(".", "_").split("_")
    if "id" in tokens:
        return True
    # camelCase suffix: "...ID" / "...Id" directly after a lowercase letter.
    return len(name) > 2 and name[-2:] in ("ID", "Id") and name[-3].islower()


def _is_near_unique_key(series, threshold=0.99):
    """Return True when a non-float column has almost one distinct value per row.

    Float columns are excluded: continuous measurements are naturally close to
    unique and must not be discarded as identifiers.
    """
    if pd.api.types.is_float_dtype(series) or len(series) == 0:
        return False
    return series.nunique() / len(series) > threshold


# Identifier-like columns carry no generalisable signal; remove them from both splits.
id_cols = [c for c in X_train.columns if _looks_like_id(c) or _is_near_unique_key(X_train[c])]
X_train_clean = X_train.drop(columns=id_cols, errors="ignore")
X_test_clean = X_test.drop(columns=id_cols, errors="ignore")

# Column groups are derived from the training split only.
num_cols = X_train_clean.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X_train_clean.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

# Numeric: median imputation then standardisation.
# Categorical: constant "missing" imputation then dense one-hot encoding that
# ignores categories unseen during fit.
# Columns in neither group are dropped (remainder="drop").
preprocessor = ColumnTransformer([
    ("num", Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]), num_cols),
    ("cat", Pipeline([("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
                      ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))]), cat_cols)
], remainder="drop")

print(f"Numeric: {len(num_cols)}, Categorical: {len(cat_cols)}")

Numeric: 15, Categorical: 0


In [150]:
def _extract_class_weight_map(cw_obj):
    try:
        if isinstance(cw_obj, dict):
            for candidate_key in ("balanced_weights", "cost_sensitive_weights"):
                if candidate_key in cw_obj and isinstance(cw_obj[candidate_key], dict):
                    try:
                        return {int(k): float(v) for k, v in cw_obj[candidate_key].items()}
                    except Exception:
                        pass
            numeric_map = {}
            for k, v in cw_obj.items():
                if str(k).lstrip('-').isdigit():
                    numeric_map[int(k)] = float(v)
            if numeric_map:
                return numeric_map
    except Exception:
        pass
    return "balanced"

# Resolve the JSON loaded earlier into the value passed as `class_weight` to the
# tree-based models below: either an {int: float} dict or the string "balanced".
# NOTE(review): the printed weights favour class 1 by roughly 10x, while the
# label distribution printed for the loaded data is already close to balanced.
# These weights were presumably computed on the original, un-resampled data --
# confirm that applying them on top of the resampled data is intended.
class_weight_map = _extract_class_weight_map(class_weights)
print(f"Class weights: {class_weight_map}")

Class weights: {0: 0.5515905383360522, 1: 5.345849802371541}


In [151]:
# Logistic Regression
LR_PARAMS = {
    "C": 0.5,                    # Regularization strength (smaller = stronger regularization)
    "penalty": "l2",             # 'l1', 'l2', 'elasticnet', or None
    "solver": "saga",            # 'lbfgs', 'liblinear', 'saga', 'newton-cg'
    "max_iter": 5000,            # Maximum iterations
    "class_weight": "balanced",  # Use "balanced" for automatic adjustment
}

# clone() gives this pipeline its own unfitted copy of the preprocessing step,
# so it does not share (and refit) the single `preprocessor` object that the
# other model pipelines in this notebook also reference.
log_reg = Pipeline([
    ("preproc", clone(preprocessor)),
    ("clf", LogisticRegression(**LR_PARAMS, random_state=42))
])
log_reg.fit(X_train_clean, y_train)
print("Logistic Regression trained.")

Logistic Regression trained.


In [152]:
# Decision Tree
# Depth and minimum split/leaf sizes are capped to limit overfitting; class
# weights come from class_weight_map.

# clone() gives this pipeline its own unfitted copy of the preprocessing step,
# so it does not share (and refit) the single `preprocessor` object that the
# other model pipelines in this notebook also reference.
dt = Pipeline([
    ("preproc", clone(preprocessor)),
    ("clf", DecisionTreeClassifier(class_weight=class_weight_map, max_depth=10,
                                   min_samples_split=20, min_samples_leaf=10, random_state=42))
])
dt.fit(X_train_clean, y_train)
print("Decision Tree trained.")

Decision Tree trained.


In [153]:
# Random Forest
# 300 trees trained on all available cores (n_jobs=-1); class weights come from
# class_weight_map.

# clone() gives this pipeline its own unfitted copy of the preprocessing step,
# so it does not share (and refit) the single `preprocessor` object that the
# other model pipelines in this notebook also reference.
rf = Pipeline([
    ("preproc", clone(preprocessor)),
    ("clf", RandomForestClassifier(n_estimators=300, class_weight=class_weight_map, random_state=42, n_jobs=-1))
])
rf.fit(X_train_clean, y_train)
print("Random Forest trained.")

Random Forest trained.


In [154]:
# SVM
SVM_PARAMS = {
    "C": 10.0,                   # Higher C = less regularization, tighter fit
    "kernel": "rbf",             # 'linear', 'poly', 'rbf', 'sigmoid'
    "gamma": "scale",            # 'scale', 'auto', or float value
    "degree": 3,                 # Only used for 'poly' kernel
    "class_weight": "balanced",  # Automatic class balancing
    "probability": True,         # Required for predict_proba
}

# clone() gives this pipeline its own unfitted copy of the preprocessing step,
# so it does not share (and refit) the single `preprocessor` object that the
# other model pipelines in this notebook also reference.
svm = Pipeline([
    ("preproc", clone(preprocessor)),
    ("clf", SVC(**SVM_PARAMS, random_state=42))
])
svm.fit(X_train_clean, y_train)
print("SVM trained.")

SVM trained.


In [155]:
# Gradient Boosting
GB_PARAMS = {
    "n_estimators": 300,         # More boosting stages for better learning
    "learning_rate": 0.05,       # Lower rate = more careful learning
    "max_depth": 5,              # Deeper trees to capture patterns
    "min_samples_split": 5,      # Min samples to split internal node
    "min_samples_leaf": 2,       # Min samples in leaf node
    "subsample": 0.8,            # Stochastic gradient boosting (reduces overfitting)
    "max_features": "sqrt",      # Feature subsampling
}

# clone() gives this pipeline its own unfitted copy of the preprocessing step,
# so it does not share (and refit) the single `preprocessor` object that the
# other model pipelines in this notebook also reference.
gb = Pipeline([
    ("preproc", clone(preprocessor)),
    ("clf", GradientBoostingClassifier(**GB_PARAMS, random_state=42))
])
gb.fit(X_train_clean, y_train)
print("Gradient Boosting trained.")

Gradient Boosting trained.


In [157]:
# Export models and test data for the evaluation notebook
MODELS_DIR = "../models"
os.makedirs(MODELS_DIR, exist_ok=True)

# Every artifact is written to <MODELS_DIR>/<key>.pkl: the five fitted pipelines
# plus the held-out features and labels they are evaluated on.
export_artifacts = {
    "log_reg": log_reg,
    "dt": dt,
    "rf": rf,
    "svm": svm,
    "gb": gb,
    "X_test_clean": X_test_clean,
    "y_test": y_test,
}
for artifact_name, artifact in export_artifacts.items():
    joblib.dump(artifact, f"{MODELS_DIR}/{artifact_name}.pkl")

print("Models and test data exported to ../models/")

Models and test data exported to ../models/
