In [10]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import pandas as pd
import joblib

# --- 1) Robust evaluation helper that handles single-column predict_proba ---
def safe_predict_proba(pipe, X):
    """
    Return probability of the positive class (1) robustly even if predict_proba returns only 1 column.
    """
    # try normal predict_proba
    if hasattr(pipe, "predict_proba"):
        probs = pipe.predict_proba(X)
        if probs.ndim == 2 and probs.shape[1] == 2:
            # standard binary case: second col is prob for class 1
            return probs[:, 1]
        elif probs.ndim == 2 and probs.shape[1] == 1:
            # only one column present: need to inspect classes_
            clf = pipe.named_steps.get('clf', None)
            if clf is not None and hasattr(clf, "classes_"):
                cls = clf.classes_
                # if only class is 1 => probability of class1 is 1.0 for all predicted
                if len(cls) == 1:
                    single_class = cls[0]
                    if single_class == 1:
                        return np.ones(len(X))
                    else:
                        return np.zeros(len(X))
            # fallback: return the single column as-is (assume it's prob(1))
            return probs.ravel()
        else:
            # unexpected shape
            return np.zeros(len(X))
    else:
        # no predict_proba available
        preds = pipe.predict(X)
        return preds.astype(float)  # 0/1

def robust_eval(pipe, Xv, yv, name="model"):
    preds = pipe.predict(Xv)
    probs = safe_predict_proba(pipe, Xv)
    print(f"\n=== {name} evaluation ===")
    print("Classification report:")
    print(classification_report(yv, preds, zero_division=0))
    print("Confusion matrix:")
    print(confusion_matrix(yv, preds))
    # compute ROC AUC only if probabilities vary
    if np.unique(probs).shape[0] > 1:
        try:
            print("ROC AUC:", round(roc_auc_score(yv, probs), 4))
        except Exception as e:
            print("ROC AUC computation failed:", e)
    else:
        print("ROC AUC: cannot compute (probabilities are constant).")

# --- 2) Inspect the fitted GridSearch model you used (best_rf variable) ---
# If you saved the best grid into 'rf_grid' or 'best_rf'
# Replace names below if different (e.g., rf_grid, rnd, etc.)
try:
    # try to retrieve best_rf from current namespace (GridSearch earlier)
    best_rf = rf_grid.best_estimator_
    print("Using rf_grid.best_estimator_ from workspace.")
except Exception:
    try:
        best_rf = best_rf  # if already defined
    except Exception:
        best_rf = None

if best_rf is None:
    print("No 'best_rf' found in workspace. Load the tuned model file if saved, e.g.:")
    print("best_rf = joblib.load(r'C:\\BFSI\\fraud_models\\random_forest_tuned.pkl')")
else:
    # show classes known to the underlying classifier
    clf = best_rf.named_steps.get('clf', None)
    if clf is not None and hasattr(clf, "classes_"):
        print("Underlying classifier classes_:", clf.classes_)
    else:
        print("Underlying classifier has no classes_ attribute.")

    # quick check: how many unique labels in training set used by grid?
    print("y_train value counts (current variable):")
    try:
        print(y_train.value_counts().to_dict())
    except Exception:
        print("y_train not present in scope.")

# --- 3) Run robust evaluation on validation and test sets ---
# Use the objects X_val, y_val, X_test, y_test that should already be in workspace
if 'best_rf' in globals() and best_rf is not None:
    robust_eval(best_rf, X_val, y_val, name="RandomForest (best)")
    robust_eval(best_rf, X_test, y_test, name="RandomForest (best)")
else:
    print("best_rf not available: skip evaluation. Load model then call robust_eval().")

# --- 4) Debug: check for features that perfectly separate classes (simple rule-based check) ---
# This helps detect leakage: if a single feature perfectly separates fraud vs non-fraud,
# model can achieve perfect CV F1.
def perfect_predictors(X, y, top_n=10):
    """Return features that perfectly or near-perfectly separate classes."""
    out = []
    for c in X.columns:
        try:
            # skip numeric floats with many unique values
            uniques = X[c].nunique(dropna=False)
            if uniques <= 100:  # only test low-to-medium cardinality
                grouped = pd.crosstab(X[c], y)
                # if any row has only one class and count > 0 that's suspicious
                for idx, row in grouped.iterrows():
                    if (row == 0).any():
                        # compute separation ratio
                        total = row.sum()
                        max_count = row.max()
                        ratio = max_count / total
                        if ratio == 1.0:
                            out.append((c, idx, total, ratio))
        except Exception:
            continue
    return out

print("\nChecking for perfect single-feature predictors on training data (this is expensive for many features):")
try:
    pp = perfect_predictors(X_train, y_train)
    if pp:
        print("Found suspicious perfect predictors (feature, value, count, ratio):")
        for i in pp[:20]:
            print(i)
    else:
        print("No obvious perfect single-value predictors found (limited check).")
except Exception as e:
    print("Perfect predictors check failed:", e)

# --- 5) If suspicious perfect score persists: suggestions ---
print("""
If you see that the classifier only knows one class (classes_ length == 1), or CV f1 == 1.0:
 - Check that y_train actually contains both 0 and 1 (print y_train.value_counts()).
 - Check kernel/data leakage: ensure no feature equals the target or uniquely identifies fraud (e.g., 'Transaction_Status' may directly encode 'Fraud' or 'Reversed').
 - Remove or mask any features that leak the label (drop features and retry).
 - Ensure StratifiedKFold was used in GridSearchCV so folds contain positives/negatives.
 - As a quick fix, reduce the preprocessor to numeric-only and run GridSearch to see if perfect score disappears.
""")


Using rf_grid.best_estimator_ from workspace.
Underlying classifier classes_: [1]
y_train value counts (current variable):
{1: 40000}

=== RandomForest (best) evaluation ===
Classification report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00      5000

    accuracy                           1.00      5000
   macro avg       1.00      1.00      1.00      5000
weighted avg       1.00      1.00      1.00      5000

Confusion matrix:
[[5000]]
ROC AUC: cannot compute (probabilities are constant).

=== RandomForest (best) evaluation ===
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     50000
           1       0.09      1.00      0.17      5000

    accuracy                           0.09     55000
   macro avg       0.05      0.50      0.08     55000
weighted avg       0.01      0.09      0.02     55000

Confusion matrix:
[[    0 50000]
 [    0  5000]]
ROC AUC: c

In [11]:
# Robust resplit and quick check cell
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

BASE = Path(r"C:\BFSI")
SPLIT_DIR = BASE / "model_splits"
SPLIT_DIR.mkdir(parents=True, exist_ok=True)

# Ensure df is present and timestamp exists
if 'df' not in globals():
    df_path = BASE / "card_fraud.csv"
    if not df_path.exists():
        raise RuntimeError("df not in memory and card_fraud.csv not found at C:\\BFSI.")
    df = pd.read_csv(df_path)

if "Transaction_Timestamp" not in df.columns:
    # try to create it
    if "Transaction_Date" in df.columns and "Transaction_Time" in df.columns:
        df["Transaction_Timestamp"] = pd.to_datetime(df["Transaction_Date"].astype(str) + " " + df["Transaction_Time"].astype(str), errors="coerce")
    elif "Transaction_Date" in df.columns:
        df["Transaction_Timestamp"] = pd.to_datetime(df["Transaction_Date"], errors="coerce")
    else:
        candidates = [c for c in df.columns if any(k in c.lower() for k in ["timestamp","datetime","date_time","time"])]
        parsed = False
        for c in candidates:
            df["Transaction_Timestamp"] = pd.to_datetime(df[c], errors="coerce")
            if df["Transaction_Timestamp"].notna().sum()>0:
                parsed = True
                break
        if not parsed:
            raise RuntimeError("Cannot parse Transaction_Timestamp. Fix timestamp column first.")

if "isFraud" not in df.columns:
    raise RuntimeError("Target column isFraud not found in df.")

# Sort by timestamp
df_sorted = df.sort_values("Transaction_Timestamp").reset_index(drop=True)

# Make temporal test (last 10% by time)
cut_test = df_sorted["Transaction_Timestamp"].quantile(0.90)
test_df = df_sorted[df_sorted["Transaction_Timestamp"] > cut_test].copy()
trainval_df = df_sorted[df_sorted["Transaction_Timestamp"] <= cut_test].copy()

print("Initial split sizes (trainval/test):", trainval_df.shape, test_df.shape)
print("Fraud counts (trainval/test):", trainval_df["isFraud"].sum(), test_df["isFraud"].sum())

# If test set has zero positives (rare), we will fallback to stratified full-split later.
# Now try stratified train/val on trainval_df
y_tv = trainval_df["isFraud"].astype(int)
unique_classes = sorted(y_tv.unique())
print("Classes present in trainval:", unique_classes)

# compute val fraction relative to trainval to get ~10% of original as val: val_frac_rel = 0.10/0.90
val_frac_rel = 0.10 / 0.90

# Helper to perform and validate stratified split
def try_stratified_split(df_source):
    X_tv = df_source.drop(columns=["isFraud"], errors="ignore")
    y_tv = df_source["isFraud"].astype(int)
    # drop id columns for splitting to avoid using them as features (but keep them not required)
    id_cols = [c for c in ["Transaction_ID","User_ID","Merchant_ID","Device_ID"] if c in X_tv.columns]
    if id_cols:
        X_tv_for_split = X_tv.drop(columns=id_cols)
    else:
        X_tv_for_split = X_tv.copy()
    # If either class missing, cannot stratify
    if len(y_tv.unique()) < 2:
        return None, None, None, None
    X_tr, X_v, y_tr, y_v = train_test_split(
        X_tv_for_split, y_tv, test_size=val_frac_rel, stratify=y_tv, random_state=42
    )
    return X_tr, X_v, y_tr, y_v

# Try stratified on trainval_df
X_train, X_val, y_train, y_val = try_stratified_split(trainval_df)

if X_train is None:
    print("Stratified split on temporal trainval failed (only one class present). Falling back to global stratified split.")
    # Fallback: do stratified split over entire df (sacrifices strict temporal holdout)
    X_all = df_sorted.drop(columns=[c for c in ["Transaction_ID","User_ID","Merchant_ID","Device_ID","isFraud"] if c in df_sorted.columns], errors="ignore")
    y_all = df_sorted["isFraud"].astype(int)
    if len(y_all.unique()) < 2:
        raise RuntimeError("Entire dataset contains only one class; cannot train a classifier.")
    # Primary: create test as before if it has both classes, else stratified on whole dataset
    if len(test_df["isFraud"].unique()) > 1:
        # keep temporal test, but to get trainval we will take stratified sample from entire dataset excluding test indices
        trainval_indices = df_sorted.index[df_sorted["Transaction_Timestamp"] <= cut_test].tolist()
        X_trainval = df_sorted.loc[trainval_indices].drop(columns=[c for c in ["Transaction_ID","User_ID","Merchant_ID","Device_ID","isFraud"] if c in df_sorted.columns], errors="ignore")
        y_trainval = df_sorted.loc[trainval_indices,"isFraud"].astype(int)
        # now stratified split on these
        X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=val_frac_rel, stratify=y_trainval, random_state=42)
    else:
        # test doesn't have both classes; do full stratified split
        X_train, X_test2, y_train, y_test2 = train_test_split(X_all, y_all, test_size=0.20, stratify=y_all, random_state=42)
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.111111, stratify=y_train, random_state=42)
        # overwrite test_df with stratified test (not temporal)
        X_test = X_test2
        y_test = y_test2
        print("Fallback used: fully stratified split across all data (non-temporal).")
else:
    # we got stratified splits from trainval_df, now set test as temporal test
    # Build X_test dropping ids and target
    id_cols_test = [c for c in ["Transaction_ID","User_ID","Merchant_ID","Device_ID"] if c in test_df.columns]
    if id_cols_test:
        X_test = test_df.drop(columns=id_cols_test + ["isFraud"], errors="ignore")
    else:
        X_test = test_df.drop(columns=["isFraud"], errors="ignore")
    y_test = test_df["isFraud"].astype(int)

# Final sanity checks
print("\nFinal split counts:")
print("Train size / positive count:", X_train.shape, int(y_train.sum()))
print("Val   size / positive count:", X_val.shape, int(y_val.sum()))
print("Test  size / positive count:", X_test.shape, int(y_test.sum()))
print("Train classes present:", sorted(y_train.unique()))
print("Val classes present:", sorted(y_val.unique()))
print("Test classes present:", sorted(y_test.unique()))

if len(y_train.unique()) < 2 or len(y_val.unique()) < 2:
    print("\nWARNING: One of train/val still contains only a single class. Consider using full stratified split or adjusting cutoffs.")
else:
    # Save CSVs
    X_train.to_csv(SPLIT_DIR / "X_train_resplit.csv", index=False)
    y_train.to_csv(SPLIT_DIR / "y_train_resplit.csv", index=False)
    X_val.to_csv(SPLIT_DIR / "X_val_resplit.csv", index=False)
    y_val.to_csv(SPLIT_DIR / "y_val_resplit.csv", index=False)
    X_test.to_csv(SPLIT_DIR / "X_test_resplit.csv", index=False)
    y_test.to_csv(SPLIT_DIR / "y_test_resplit.csv", index=False)
    print("\nSaved new splits with suffix '_resplit.csv' to:", SPLIT_DIR)

# If you want me to immediately run training with these new splits, say 'train now' and I'll provide the cell.


Initial split sizes (trainval/test): (90000, 19) (10000, 19)
Fraud counts (trainval/test): 50000 0
Classes present in trainval: [np.int64(0), np.int64(1)]

Final split counts:
Train size / positive count: (80000, 14) 44444
Val   size / positive count: (10000, 14) 5556
Test  size / positive count: (10000, 14) 0
Train classes present: [np.int64(0), np.int64(1)]
Val classes present: [np.int64(0), np.int64(1)]
Test classes present: [np.int64(0)]

Saved new splits with suffix '_resplit.csv' to: C:\BFSI\model_splits


In [12]:
# Auto-adjust temporal test to include frauds & run compact hyperparameter tuning
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib, time, warnings
warnings.filterwarnings("ignore")

BASE = Path(r"C:\BFSI")
SPLIT_DIR = BASE / "model_splits"
MODEL_OUT = BASE / "fraud_models"
SPLIT_DIR.mkdir(parents=True, exist_ok=True)
MODEL_OUT.mkdir(parents=True, exist_ok=True)

# Load or ensure df
if 'df' not in globals():
    df_path = BASE / "card_fraud.csv"
    if not df_path.exists():
        raise RuntimeError("df not in memory and card_fraud.csv not found at C:\\BFSI.")
    df = pd.read_csv(df_path)

# Ensure timestamp exists
if "Transaction_Timestamp" not in df.columns:
    if "Transaction_Date" in df.columns and "Transaction_Time" in df.columns:
        df["Transaction_Timestamp"] = pd.to_datetime(df["Transaction_Date"].astype(str) + " " + df["Transaction_Time"].astype(str), errors="coerce")
    elif "Transaction_Date" in df.columns:
        df["Transaction_Timestamp"] = pd.to_datetime(df["Transaction_Date"], errors="coerce")
    else:
        candidates = [c for c in df.columns if any(k in c.lower() for k in ["timestamp","datetime","date_time","time"])]
        parsed=False
        for c in candidates:
            df["Transaction_Timestamp"] = pd.to_datetime(df[c], errors="coerce")
            if df["Transaction_Timestamp"].notna().sum()>0:
                parsed=True
                break
        if not parsed:
            raise RuntimeError("Cannot parse Transaction_Timestamp.")

if "isFraud" not in df.columns:
    raise RuntimeError("Target 'isFraud' missing in df.")

# Sort dataset by timestamp
df_sorted = df.sort_values("Transaction_Timestamp").reset_index(drop=True)

# Compute total frauds and desired minimum in test
total_frauds = int(df_sorted["isFraud"].sum())
min_frauds = max(10, max(1, int(0.01 * total_frauds)))  # at least 10 or 1% of frauds (whichever larger)
print(f"Total frauds: {total_frauds}. Target min_frauds in test: {min_frauds}")

# Start from original 90%/10% cutoff and move cutoff earlier until test contains >= min_frauds positives
start_cut = df_sorted["Transaction_Timestamp"].quantile(0.90)
cut = start_cut
found = False

# We'll move the cutoff in steps by decreasing quantile in small increments
quantile = 0.90
while quantile > 0.01:
    cut = df_sorted["Transaction_Timestamp"].quantile(quantile)
    test_df = df_sorted[df_sorted["Transaction_Timestamp"] > cut]
    n_pos = int(test_df["isFraud"].sum())
    if n_pos >= min_frauds:
        found = True
        print(f"Found cutoff at quantile {quantile:.4f} -> test frauds: {n_pos}, test size: {len(test_df)}")
        break
    quantile -= 0.01  # step 1% earlier
if not found:
    # as a last resort, include everything after the earliest timestamp that yields at least one fraud
    pos_indices = df_sorted.index[df_sorted["isFraud"]==1].tolist()
    if pos_indices:
        # set cutoff to the timestamp just before the (total_frauds - min_frauds + 1)-th fraud if possible
        idx = max(0, pos_indices[-min_frauds]-1) if len(pos_indices) >= min_frauds else pos_indices[0]-1
        cut = df_sorted.loc[idx, "Transaction_Timestamp"] if idx >= 0 else df_sorted["Transaction_Timestamp"].min()
        test_df = df_sorted[df_sorted["Transaction_Timestamp"] > cut]
        print(f"No quantile found; using fallback cut -> test frauds: {int(test_df['isFraud'].sum())}, test size: {len(test_df)}")
    else:
        # no frauds at all
        print("No frauds in dataset at all; can't create a positive test set. Doing stratified full split.")
        # fallback to stratified full split below
        found = False

# If found True, create trainval from earlier rows and stratify train/val within it
if found:
    trainval_df = df_sorted[df_sorted["Transaction_Timestamp"] <= cut].copy()
    print("Trainval shape:", trainval_df.shape, "Test shape:", test_df.shape, "Test frauds:", int(test_df['isFraud'].sum()))
    # Now stratify train/val
    val_frac_rel = 0.10 / 0.90
    # Drop ids for stratify split
    id_cols = [c for c in ["Transaction_ID","User_ID","Merchant_ID","Device_ID"] if c in trainval_df.columns]
    X_tv = trainval_df.drop(columns=id_cols + ["isFraud"], errors="ignore") if id_cols else trainval_df.drop(columns=["isFraud"])
    y_tv = trainval_df["isFraud"].astype(int)
    if len(y_tv.unique()) < 2:
        print("Trainval has only one class after cutoff; falling back to full stratified split.")
        found = False

if not found:
    # Full stratified split fallback (non-temporal)
    X_all = df_sorted.drop(columns=[c for c in ["Transaction_ID","User_ID","Merchant_ID","Device_ID","isFraud"] if c in df_sorted.columns], errors="ignore")
    y_all = df_sorted["isFraud"].astype(int)
    if len(y_all.unique()) < 2:
        raise RuntimeError("Entire dataset contains only one class; cannot train.")
    # Do stratified 80/10/10
    X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(X_all, y_all, test_size=0.20, stratify=y_all, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train_all, y_train_all, test_size=0.111111, stratify=y_train_all, random_state=42)
    X_test = X_test_all
    y_test = y_test_all
    print("Used full-stratified fallback. Sizes:", X_train.shape, X_val.shape, X_test.shape)
else:
    # perform stratified split on X_tv, y_tv
    X_train, X_val, y_train, y_val = train_test_split(X_tv, y_tv, test_size=val_frac_rel, stratify=y_tv, random_state=42)
    # prepare X_test,y_test from test_df (drop ids and target)
    id_cols_test = [c for c in ["Transaction_ID","User_ID","Merchant_ID","Device_ID"] if c in test_df.columns]
    X_test = test_df.drop(columns=id_cols_test + ["isFraud"], errors="ignore") if id_cols_test else test_df.drop(columns=["isFraud"], errors="ignore")
    y_test = test_df["isFraud"].astype(int)
    print("Final sizes after adjustment: X_train:", X_train.shape, "X_val:", X_val.shape, "X_test:", X_test.shape)
    print("Fraud counts (train/val/test):", int(y_train.sum()), int(y_val.sum()), int(y_test.sum()))

# Save adjusted splits
X_train.to_csv(SPLIT_DIR / "X_train_adjusted.csv", index=False)
y_train.to_csv(SPLIT_DIR / "y_train_adjusted.csv", index=False)
X_val.to_csv(SPLIT_DIR / "X_val_adjusted.csv", index=False)
y_val.to_csv(SPLIT_DIR / "y_val_adjusted.csv", index=False)
X_test.to_csv(SPLIT_DIR / "X_test_adjusted.csv", index=False)
y_test.to_csv(SPLIT_DIR / "y_test_adjusted.csv", index=False)
print("Saved adjusted splits to:", SPLIT_DIR)

# -------------------------
# Compact hyperparameter tuning for RandomForest & LightGBM (smaller grid for speed)
# -------------------------
# Recompute numeric & categorical columns (drop timestamp/date/time columns if present)
for D in (X_train, X_val, X_test):
    if "Transaction_Timestamp" in D.columns:
        D["Transaction_Timestamp"] = pd.to_datetime(D["Transaction_Timestamp"], errors="coerce")
        D["txn_hour"] = D["Transaction_Timestamp"].dt.hour.fillna(-1).astype(int)
        D["txn_dayofweek"] = D["Transaction_Timestamp"].dt.day_name().fillna("Unknown")
        D.drop(columns=["Transaction_Timestamp"], inplace=True)
    if "Transaction_Date" in D.columns:
        D.drop(columns=["Transaction_Date"], inplace=True)
    if "Transaction_Time" in D.columns:
        D.drop(columns=["Transaction_Time"], inplace=True)

num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object','category','bool']).columns.tolist()
print("\nNum cols:", num_cols)
print("Cat cols:", cat_cols)

numeric_transform = Pipeline(steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])
categorical_transform = Pipeline(steps=[("imputer", SimpleImputer(strategy="most_frequent")), ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))])
preprocessor = ColumnTransformer(transformers=[("num", numeric_transform, num_cols), ("cat", categorical_transform, cat_cols)], remainder="drop")

# RandomForest compact grid
rf_pipeline = Pipeline([("pre", preprocessor), ("clf", RandomForestClassifier(class_weight="balanced", random_state=42, n_jobs=-1))])
rf_param_grid_small = {
    "clf__n_estimators": [100, 200],
    "clf__max_depth": [10, None],
    "clf__min_samples_leaf": [1, 2],
}
rf_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
rf_grid = GridSearchCV(rf_pipeline, rf_param_grid_small, scoring="f1", cv=rf_cv, n_jobs=-1, verbose=2, refit=True)
t0 = time.time()
rf_grid.fit(X_train, y_train)
t1 = time.time()
print(f"RF tuning done in {(t1-t0)/60:.2f} minutes.")
best_rf = rf_grid.best_estimator_
print("Best RF params:", rf_grid.best_params_, "Best CV f1:", rf_grid.best_score_)

# Evaluate best RF
def safe_probs(pipe, X):
    if hasattr(pipe, "predict_proba"):
        p = pipe.predict_proba(X)
        if p.ndim==2 and p.shape[1]==2:
            return p[:,1]
        elif p.ndim==2 and p.shape[1]==1:
            # single-class case
            return np.full(len(X), p.ravel()[0])
        else:
            return np.zeros(len(X))
    else:
        return pipe.predict(X).astype(float)

print("\n=== RandomForest (tuned) VALIDATION ===")
preds_val = best_rf.predict(X_val)
print(classification_report(y_val, preds_val, zero_division=0))
probs_val = safe_probs(best_rf, X_val)
if np.unique(probs_val).shape[0] > 1:
    print("Val ROC AUC:", round(roc_auc_score(y_val, probs_val),4))
else:
    print("Val ROC AUC: constant probabilities (cannot compute)")

print("\n=== RandomForest (tuned) TEST ===")
preds_test = best_rf.predict(X_test)
print(classification_report(y_test, preds_test, zero_division=0))
probs_test = safe_probs(best_rf, X_test)
if np.unique(probs_test).shape[0] > 1:
    print("Test ROC AUC:", round(roc_auc_score(y_test, probs_test),4))
else:
    print("Test ROC AUC: constant probabilities (cannot compute)")

joblib.dump(best_rf, MODEL_OUT / "random_forest_tuned_adjusted.pkl")
pd.DataFrame(rf_grid.cv_results_).to_csv(SPLIT_DIR / "rf_grid_results_adjusted.csv", index=False)
print("Saved tuned RF and RF CV results.")

# LightGBM tuning small grid (if available)
try:
    import lightgbm as lgb
    from lightgbm import LGBMClassifier
    lgb_pipeline = Pipeline([("pre", preprocessor), ("clf", LGBMClassifier(class_weight="balanced", random_state=42))])
    lgb_param_grid_small = {
        "clf__n_estimators": [200, 400],
        "clf__num_leaves": [31, 63],
        "clf__learning_rate": [0.05, 0.1],
    }
    lgb_grid = GridSearchCV(lgb_pipeline, lgb_param_grid_small, scoring="f1", cv=rf_cv, n_jobs=-1, verbose=2, refit=True)
    t0 = time.time()
    lgb_grid.fit(X_train, y_train)
    t1 = time.time()
    print(f"LGB tuning done in {(t1-t0)/60:.2f} minutes.")
    best_lgb = lgb_grid.best_estimator_
    print("Best LGB params:", lgb_grid.best_params_, "Best CV f1:", lgb_grid.best_score_)
    # Evaluate
    print("\n=== LightGBM (tuned) VALIDATION ===")
    print(classification_report(y_val, best_lgb.predict(X_val), zero_division=0))
    print("=== LightGBM (tuned) TEST ===")
    print(classification_report(y_test, best_lgb.predict(X_test), zero_division=0))
    joblib.dump(best_lgb, MODEL_OUT / "lightgbm_tuned_adjusted.pkl")
    pd.DataFrame(lgb_grid.cv_results_).to_csv(SPLIT_DIR / "lgb_grid_results_adjusted.csv", index=False)
except Exception as e:
    print("LightGBM not available or failed to tune:", e)

# Save summary
summary = [{"model":"random_forest","best_cv_f1":rf_grid.best_score_,"best_params":rf_grid.best_params_}]
if 'lgb_grid' in globals():
    summary.append({"model":"lightgbm","best_cv_f1":lgb_grid.best_score_,"best_params":lgb_grid.best_params_})
pd.DataFrame(summary).to_csv(SPLIT_DIR / "grid_search_summary_adjusted.csv", index=False)
print("Done. Models saved to:", MODEL_OUT, "Grid results to:", SPLIT_DIR)


Total frauds: 50000. Target min_frauds in test: 500
Found cutoff at quantile 0.4900 -> test frauds: 1000, test size: 51000
Trainval shape: (49000, 19) Test shape: (51000, 19) Test frauds: 1000
Trainval has only one class after cutoff; falling back to full stratified split.
Used full-stratified fallback. Sizes: (71111, 14) (8889, 14) (20000, 14)
Saved adjusted splits to: C:\BFSI\model_splits

Num cols: ['Transaction_Amount', 'Previous_Transaction_Count', 'Distance_Between_Transactions_km', 'Time_Since_Last_Transaction_min', 'Transaction_Velocity', 'txn_hour']
Cat cols: ['Transaction_Location', 'Card_Type', 'Transaction_Currency', 'Transaction_Status', 'Authentication_Method', 'Transaction_Category', 'txn_dayofweek']
Fitting 3 folds for each of 8 candidates, totalling 24 fits
RF tuning done in 1.57 minutes.
Best RF params: {'clf__max_depth': None, 'clf__min_samples_leaf': 2, 'clf__n_estimators': 100} Best CV f1: 0.5000739125871099

=== RandomForest (tuned) VALIDATION ===
              pr

In [13]:
# Inspect rf_grid and evaluate best estimator (robustly)
import joblib, numpy as np, pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, f1_score

# rf_grid should exist after GridSearchCV finished
best_rf = rf_grid.best_estimator_
print("RF best params:", rf_grid.best_params_)
print("RF best CV f1:", rf_grid.best_score_)

# helper to get prob of class 1 robustly
def safe_prob(pipe, X):
    if hasattr(pipe, "predict_proba"):
        p = pipe.predict_proba(X)
        if p.ndim == 2 and p.shape[1] == 2:
            return p[:,1]
        elif p.ndim == 2 and p.shape[1] == 1:
            # only one class learned; return constant
            return np.full(len(X), p.ravel()[0])
    return pipe.predict(X).astype(float)

# load adjusted splits (if not in memory)
SPLIT_DIR = r"C:\BFSI\model_splits"
X_val = pd.read_csv(SPLIT_DIR + "/X_val_adjusted.csv")
y_val = pd.read_csv(SPLIT_DIR + "/y_val_adjusted.csv").squeeze().astype(int)
X_test = pd.read_csv(SPLIT_DIR + "/X_test_adjusted.csv")
y_test = pd.read_csv(SPLIT_DIR + "/y_test_adjusted.csv").squeeze().astype(int)

# Evaluate with default threshold 0.5
preds_val = best_rf.predict(X_val)
preds_test = best_rf.predict(X_test)
probs_val = safe_prob(best_rf, X_val)
probs_test = safe_prob(best_rf, X_test)

print("Validation report (threshold 0.5):")
print(classification_report(y_val, preds_val, zero_division=0))
if np.unique(probs_val).size>1:
    try: print("Val ROC AUC:", round(roc_auc_score(y_val, probs_val),4))
    except: pass

print("Test report (threshold 0.5):")
print(classification_report(y_test, preds_test, zero_division=0))
if np.unique(probs_test).size>1:
    try: print("Test ROC AUC:", round(roc_auc_score(y_test, probs_test),4))
    except: pass

# Save the grid results summary
pd.DataFrame(rf_grid.cv_results_).to_csv(SPLIT_DIR + "/rf_grid_results_final.csv", index=False)
joblib.dump(best_rf, r"C:\BFSI\fraud_models\random_forest_tuned_final.pkl")
print("Saved best RF pipeline to C:\\BFSI\\fraud_models\\random_forest_tuned_final.pkl")


RF best params: {'clf__max_depth': None, 'clf__min_samples_leaf': 2, 'clf__n_estimators': 100}
RF best CV f1: 0.5000739125871099


ValueError: columns are missing: {'txn_dayofweek', 'txn_hour'}