In [None]:
!pip -q install pandas numpy scikit-learn xgboost joblib matplotlib

In [None]:
import os, zipfile
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score,
    confusion_matrix, roc_curve, precision_recall_curve,
    balanced_accuracy_score, matthews_corrcoef
)

from xgboost import XGBClassifier
import joblib

# Single seed reused for NumPy, the splits and the models below.
SEED = 42
np.random.seed(SEED)

# Working folders: uploads are gathered under data/, artifacts are written to outputs/.
WORKDIR = Path("/content")
DATA_DIR = WORKDIR / "data"
OUT_DIR  = WORKDIR / "outputs"
for _folder in (DATA_DIR, OUT_DIR):
    _folder.mkdir(parents=True, exist_ok=True)

# Widen pandas' rendering limits so wide frames are shown in full.
for _option, _value in (("display.max_columns", 200), ("display.width", 200)):
    pd.set_option(_option, _value)

In [None]:
def print_head(df, title, n=5):
    """Print a banner containing `title` and the frame's shape, then display its first `n` rows."""
    banner = f"\n===== {title} (shape={df.shape}) ====="
    print(banner)
    preview = df.head(n)
    display(preview)

def plot_pr(y_true, y_proba, title):
    """Plot precision (y-axis) against recall (x-axis) for the given scores in a new figure."""
    precision, recall, _cutoffs = precision_recall_curve(y_true, y_proba)
    _fig, axis = plt.subplots()
    axis.plot(recall, precision)
    axis.set_xlabel("Recall")
    axis.set_ylabel("Precision")
    axis.set_title(title)
    plt.show()

def plot_roc(y_true, y_proba, title):
    """Plot the ROC curve (true positive rate vs. false positive rate) in a new figure."""
    false_pos_rate, true_pos_rate, _cutoffs = roc_curve(y_true, y_proba)
    _fig, axis = plt.subplots()
    axis.plot(false_pos_rate, true_pos_rate)
    axis.set_xlabel("False Positive Rate")
    axis.set_ylabel("True Positive Rate")
    axis.set_title(title)
    plt.show()

def plot_confusion(y_true, y_proba, threshold, title):
    """Plot the 2x2 confusion matrix obtained by thresholding `y_proba`.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_proba : array-like of scores; a row is predicted positive when its
        score is >= `threshold`.
    threshold : float decision cut-off.
    title : str used as the figure title.
    """
    y_pred = (np.asarray(y_proba) >= threshold).astype(int)
    # Pin the label order: without `labels`, the matrix shrinks to 1x1 when
    # only one class occurs in y_true/y_pred, and the fixed [0, 1] ticks and
    # per-cell annotations below would no longer line up.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    plt.figure()
    plt.imshow(cm)
    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.xticks([0, 1], [0, 1])
    plt.yticks([0, 1], [0, 1])
    # Rows of cm are true labels (y-axis), columns are predictions (x-axis).
    for (i, j), v in np.ndenumerate(cm):
        plt.text(j, i, str(v), ha="center", va="center")
    plt.show()

def evaluate_binary_plus(y_true, y_proba, threshold, name="model"):
    """Compute a dictionary of binary-classification metrics at a fixed threshold.

    Parameters
    ----------
    y_true : array-like of labels, cast to int.
    y_proba : array-like of scores; a row is predicted positive when its
        score is >= `threshold`.
    threshold : decision cut-off, stored in the result as a float.
    name : label stored under the "model" key.

    Returns
    -------
    dict with keys: model, threshold, pos_rate, accuracy, balanced_accuracy,
    precision, recall, f1, mcc, roc_auc, pr_auc, confusion_matrix.
    `roc_auc` and `pr_auc` are NaN when `y_true` holds a single class.
    `confusion_matrix` is always a 2x2 nested list ordered [0, 1].
    """
    y_true = np.asarray(y_true).astype(int)
    y_proba = np.asarray(y_proba)
    y_pred = (y_proba >= threshold).astype(int)
    # Ranking metrics are only computed when both classes are present.
    has_both_classes = len(np.unique(y_true)) > 1
    return {
        "model": name,
        "threshold": float(threshold),
        "pos_rate": float(y_true.mean()),
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "balanced_accuracy": float(balanced_accuracy_score(y_true, y_pred)),
        "precision": float(precision_score(y_true, y_pred, zero_division=0)),
        "recall": float(recall_score(y_true, y_pred, zero_division=0)),
        "f1": float(f1_score(y_true, y_pred, zero_division=0)),
        "mcc": float(matthews_corrcoef(y_true, y_pred)),
        "roc_auc": float(roc_auc_score(y_true, y_proba)) if has_both_classes else np.nan,
        "pr_auc": float(average_precision_score(y_true, y_proba)) if has_both_classes else np.nan,
        # labels=[0, 1] keeps the matrix 2x2 even if a class is absent, so the
        # stored shape is the same for every model/split.
        "confusion_matrix": confusion_matrix(y_true, y_pred, labels=[0, 1]).tolist()
    }

def pick_threshold_max_f1(y_true, y_proba):
    """Return, as a float, the threshold whose precision/recall point has the highest F1."""
    precision, recall, cutoffs = precision_recall_curve(y_true, y_proba)
    # Pad with 1.0 so `cutoffs` can be indexed by positions of `precision`/`recall`.
    cutoffs = np.append(cutoffs, 1.0)
    numerator = 2 * precision * recall
    denominator = precision + recall + 1e-12  # epsilon guards against 0/0
    best_idx = int(np.argmax(numerator / denominator))
    return float(cutoffs[best_idx])

def pick_threshold_for_min_recall(y_true, y_proba, min_recall=0.80):
    """Return the threshold with the best precision among points whose recall is >= `min_recall`.

    If no point reaches `min_recall`, the threshold at the highest recall is returned.
    """
    precision, recall, cutoffs = precision_recall_curve(y_true, y_proba)
    # Pad with 1.0 so `cutoffs` can be indexed by positions of `precision`/`recall`.
    cutoffs = np.append(cutoffs, 1.0)
    eligible = np.flatnonzero(recall >= min_recall)
    if eligible.size > 0:
        winner = eligible[np.argmax(precision[eligible])]
        return float(cutoffs[winner])
    return float(cutoffs[np.argmax(recall)])

In [None]:
from google.colab import files

# Interactive upload; the loops below look for the uploaded files in WORKDIR.
uploaded = files.upload()
for fn in uploaded.keys():
    print("Uploaded:", fn)

# Move uploaded zips/csvs into DATA_DIR.
# Path.replace overwrites an existing target, so re-uploading a corrected file
# takes effect instead of being silently ignored in favour of the stale copy
# (this matches the overwrite behaviour of the claims upload cell).
for pattern in ("*.zip", "*.csv"):
    # Materialise the listing first: files are moved while iterating.
    for f in list(WORKDIR.glob(pattern)):
        tgt = DATA_DIR / f.name
        f.replace(tgt)

# Unzip all Freddie zips
for z in DATA_DIR.glob("*.zip"):
    print("Extracting:", z.name)
    with zipfile.ZipFile(z, "r") as zp:
        zp.extractall(DATA_DIR)

print("\nDATA_DIR files:")
for p in sorted(DATA_DIR.glob("*")):
    if p.is_file():
        print(" -", p.name)

In [None]:
def read_pipe(path):
    """Read a headerless, pipe-delimited file into a DataFrame with every column typed as str."""
    reader_options = dict(sep="|", header=None, dtype=str, engine="python")
    return pd.read_csv(path, **reader_options)

# Candidate inputs: regular files in DATA_DIR with a .txt/.csv extension.
_TABLE_SUFFIXES = [".txt", ".csv"]
_PERF_KEYWORDS = ["svcg", "servicing", "perf"]
_candidates = [p for p in DATA_DIR.glob("*") if p.is_file() and p.suffix.lower() in _TABLE_SUFFIXES]

# Origination vs. performance files are told apart by substrings of the file name.
orig_files = sorted(p for p in _candidates if "orig" in p.name.lower())
perf_files = sorted(p for p in _candidates if any(k in p.name.lower() for k in _PERF_KEYWORDS))

if not (orig_files and perf_files):
    raise FileNotFoundError("Could not find Freddie orig/perf files. Ensure Freddie zips were uploaded and extracted.")

# Stack every file of each kind into a single raw frame.
orig_raw = pd.concat([read_pipe(path) for path in orig_files], ignore_index=True)
perf_raw = pd.concat([read_pipe(path) for path in perf_files], ignore_index=True)

print("orig_raw:", orig_raw.shape, "perf_raw:", perf_raw.shape)

# Give the headerless columns positional names c0, c1, ...
orig_raw.columns = [f"c{i}" for i in range(len(orig_raw.columns))]
perf_raw.columns = [f"c{i}" for i in range(len(perf_raw.columns))]

# Map fields by position (Freddie sample layout)
_orig_names = {"c19": "loan_sequence_number"}
_perf_names = {
    "c0": "loan_sequence_number",
    "c1": "monthly_reporting_period",
    "c3": "current_loan_delinquency_status",
    "c8": "zero_balance_code"
}
orig = orig_raw.rename(columns=_orig_names).copy()
perf = perf_raw.rename(columns=_perf_names).copy()

# Parse YYYYMM periods; rows whose period cannot be parsed are discarded.
perf["monthly_reporting_period"] = pd.to_datetime(perf["monthly_reporting_period"], format="%Y%m", errors="coerce")
_has_period = perf["monthly_reporting_period"].notna()
perf = perf.loc[_has_period].copy()

_key_cols = ["loan_sequence_number","monthly_reporting_period","current_loan_delinquency_status","zero_balance_code"]
print_head(perf[_key_cols], "Freddie Perf - Key Columns")

In [None]:
def map_dq(x):
    """Convert a delinquency-status value to int; missing or non-digit values become NaN."""
    if pd.isna(x):
        return np.nan
    text = str(x).strip()
    if not text.isdigit():
        return np.nan
    return int(text)

def to_num(s):
    """Convert `s` to numeric values, turning anything unparseable into NaN."""
    numeric = pd.to_numeric(s, errors="coerce")
    return numeric

# Label horizon: number of subsequent servicing rows (per loan) inspected for a bad outcome.
H = 6

perf = perf.sort_values(["loan_sequence_number","monthly_reporting_period"]).copy()
perf["dq"] = perf["current_loan_delinquency_status"].apply(map_dq)
perf["zb"] = to_num(perf["zero_balance_code"])

# map_dq turns every non-numeric status into NaN, which fillna(0) below would
# treat as "current". NOTE(review): per the Freddie Mac dataset guide the
# status "RA" (older vintages: "R") denotes REO acquisition, i.e. a severe
# outcome - confirm against the layout of the files in use.
status_txt = perf["current_loan_delinquency_status"].astype(str).str.strip().str.upper()
is_reo = status_txt.isin(["RA", "R"])

perf["is_dq"] = ((perf["dq"].fillna(0) >= 1) | is_reo).astype(int)
perf["is_serious_dq"] = ((perf["dq"].fillna(0) >= 3) | is_reo).astype(int)
perf["is_zb_bad"] = perf["zb"].isin([3,6,9]).astype(int)

g = perf.groupby("loan_sequence_number", sort=False)

# One column per look-ahead step k=1..H; NaN where the loan has no row k steps ahead.
future_serious = pd.concat([g["is_serious_dq"].shift(-k) for k in range(1, H+1)], axis=1)
future_zb      = pd.concat([g["is_zb_bad"].shift(-k) for k in range(1, H+1)], axis=1)

# Row-wise max skips NaN, so an event in any *observed* future row counts.
event_seen = (future_serious.max(axis=1).fillna(0) > 0) | (future_zb.max(axis=1).fillna(0) > 0)
fully_observed = future_serious.notna().all(axis=1) & future_zb.notna().all(axis=1)

# A row is usable when its outcome is known: either a bad event was observed
# inside the window (positive, even if the loan has fewer than H rows left),
# or all H future rows exist and none is bad (negative). Requiring all H rows
# for positives as well would drop rows whose loan stops reporting inside the window.
valid = fully_observed | event_seen

perf["y_default_future_6m"] = event_seen.astype(int)

# Engineered features: previous row's status, and rolling counts over the
# current and preceding rows of the same loan (no future rows are used).
perf["dq_status_lag1"] = g["dq"].shift(1)
perf["roll_3_dq"] = g["is_dq"].rolling(3, min_periods=1).sum().reset_index(level=0, drop=True)
perf["roll_6_serious_dq"] = g["is_serious_dq"].rolling(6, min_periods=1).sum().reset_index(level=0, drop=True)

perf = perf[valid].copy()
perf = perf[perf["dq_status_lag1"].notna()].copy()

# De-duplicate the origination keys and assert a many-to-one join so a repeated
# loan id in `orig` cannot duplicate performance rows.
orig_keys = orig[["loan_sequence_number"]].drop_duplicates()
freddie_ds = perf.merge(orig_keys, on="loan_sequence_number", how="left", validate="many_to_one")
freddie_ds["year"] = freddie_ds["monthly_reporting_period"].dt.year

print("Freddie dataset:", freddie_ds.shape, "pos_rate=", freddie_ds["y_default_future_6m"].mean())

In [None]:
num_cols_f = ["dq_status_lag1","roll_3_dq","roll_6_serious_dq"]
cat_cols_f = []

# Calendar years used for the time-based split; adjust to the uploaded vintages.
TRAIN_YEARS = [2022, 2023, 2024]
TEST_YEARS = [2025]

train_f = freddie_ds[freddie_ds["year"].isin(TRAIN_YEARS)].copy()
test_f  = freddie_ds[freddie_ds["year"].isin(TEST_YEARS)].copy()

# If time-based exists, remove overlap loans from test
if len(train_f) > 0 and len(test_f) > 0:
    train_loans = set(train_f["loan_sequence_number"].unique())
    test_f = test_f[~test_f["loan_sequence_number"].isin(train_loans)].copy()

# Fallback when the time-based split is unusable. An empty *train* side is
# covered too: otherwise the else-branch would call train_test_split on an
# empty array and fail with an unrelated error message.
if len(train_f) == 0 or len(test_f) == 0:
    print("WARNING: Time-based split unavailable (train or test empty). Using LOAN-LEVEL split across full Freddie dataset.")
    all_loans = freddie_ds["loan_sequence_number"].unique()
    # 70% train, then the remaining 30% split evenly into validation and test.
    loans_tr, loans_tmp = train_test_split(all_loans, test_size=0.30, random_state=SEED)
    loans_val, loans_te = train_test_split(loans_tmp, test_size=0.50, random_state=SEED)

    train_f2 = freddie_ds[freddie_ds["loan_sequence_number"].isin(loans_tr)].copy()
    val_f    = freddie_ds[freddie_ds["loan_sequence_number"].isin(loans_val)].copy()
    test_f   = freddie_ds[freddie_ds["loan_sequence_number"].isin(loans_te)].copy()
else:
    # Split by loan id so all rows of a loan stay in one partition.
    train_loans = train_f["loan_sequence_number"].unique()
    loans_tr, loans_val = train_test_split(train_loans, test_size=0.20, random_state=SEED)
    train_f2 = train_f[train_f["loan_sequence_number"].isin(loans_tr)].copy()
    val_f    = train_f[train_f["loan_sequence_number"].isin(loans_val)].copy()

if len(train_f2)==0 or len(val_f)==0 or len(test_f)==0:
    raise ValueError(f"Bad Freddie splits: train={len(train_f2)} val={len(val_f)} test={len(test_f)}")

yf_tr   = train_f2["y_default_future_6m"].astype(int).values
yf_val  = val_f["y_default_future_6m"].astype(int).values
yf_test = test_f["y_default_future_6m"].astype(int).values

Xf_tr   = train_f2[num_cols_f + cat_cols_f].copy()
Xf_val  = val_f[num_cols_f + cat_cols_f].copy()
Xf_test = test_f[num_cols_f + cat_cols_f].copy()

print("Freddie splits:",
      "train", Xf_tr.shape, "val", Xf_val.shape, "test", Xf_test.shape,
      "pos_rate_test", yf_test.mean())

In [None]:
# Preprocessing: only numeric features are used here (cat_cols_f is empty),
# so the transformer is a median imputer over num_cols_f.
transformers_f = []
transformers_f.append(("num", Pipeline([("imp", SimpleImputer(strategy="median"))]), num_cols_f))

pre_f = ColumnTransformer(transformers=transformers_f, remainder="drop")

# Fit the preprocessing on the training split only, then reuse it unchanged.
Xf_tr_t   = pre_f.fit_transform(Xf_tr)
Xf_val_t  = pre_f.transform(Xf_val)
Xf_test_t = pre_f.transform(Xf_test)

# Class-imbalance weight: negatives per positive in the training split.
neg, pos = int((yf_tr==0).sum()), int((yf_tr==1).sum())
spw_f = neg / max(pos, 1)

model_f = XGBClassifier(
    n_estimators=5000,            # upper bound; early stopping picks the actual count
    learning_rate=0.05,
    max_depth=4,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_lambda=1.0,
    random_state=SEED,
    n_jobs=-1,
    eval_metric="aucpr",
    # Without this the eval_set passed to fit() is only monitored and all
    # 5000 trees are always built. Stop once validation PR-AUC has not
    # improved for 100 rounds.
    early_stopping_rounds=100,
    scale_pos_weight=spw_f,
    tree_method="hist"
)

model_f.fit(Xf_tr_t, yf_tr, eval_set=[(Xf_val_t, yf_val)], verbose=False)
print("Freddie best iteration:", model_f.best_iteration)

# The decision threshold is tuned on validation scores and then applied to test.
proba_f_val = model_f.predict_proba(Xf_val_t)[:, 1]
thr_f = pick_threshold_max_f1(yf_val, proba_f_val)

proba_f_test = model_f.predict_proba(Xf_test_t)[:, 1]
freddie_metrics = evaluate_binary_plus(yf_test, proba_f_test, thr_f, "Freddie_Default_XGB")
display(pd.DataFrame([freddie_metrics]))

plot_roc(yf_test, proba_f_test, "Freddie — ROC")
plot_pr(yf_test, proba_f_test, "Freddie — PR")
plot_confusion(yf_test, proba_f_test, thr_f, f"Freddie — Confusion (thr={thr_f:.3f})")

# Persist everything needed to score new rows: preprocessing, model, columns, threshold.
joblib.dump(
    {"pre": pre_f, "model": model_f, "num_cols": num_cols_f, "cat_cols": cat_cols_f, "threshold": thr_f},
    OUT_DIR / "freddie_default_model.joblib"
)
print("Saved Freddie model.")

In [None]:
from google.colab import files
import pandas as pd

# Step 1: interactive upload (e.g. fraud_oracle.csv chosen from the local machine).
uploaded = files.upload()  # pick fraud_oracle.csv from your computer

# Step 2: relocate each uploaded CSV into DATA_DIR, replacing any older copy.
for fn in uploaded.keys():
    if not fn.lower().endswith(".csv"):
        continue
    src = WORKDIR / fn
    dst = DATA_DIR / fn
    if not src.exists():
        continue
    if dst.exists():
        dst.unlink()
    src.rename(dst)

# Step 3: prefer fraud_oracle.csv; otherwise fall back to the newest CSV by mtime.
csvs = list(DATA_DIR.glob("*.csv"))
if len(csvs) == 0:
    raise FileNotFoundError("No CSV found in DATA_DIR after upload.")

claims_path = next(
    (candidate for candidate in csvs if candidate.name.lower() == "fraud_oracle.csv"),
    None,
)
if claims_path is None:
    claims_path = max(csvs, key=lambda candidate: candidate.stat().st_mtime)

# Step 4: load the chosen file and show a preview.
claims = pd.read_csv(claims_path)
print("Using claims file:", claims_path.name)
print("Claims shape:", claims.shape)
display(claims.head(5))

In [None]:
LABEL_COL = "FraudFound_P"
if LABEL_COL not in claims.columns:
    raise ValueError(f"Claims label '{LABEL_COL}' not found. Columns: {list(claims.columns)}")

yc = claims[LABEL_COL].astype(int).values

# Columns never used as features: the label itself and, when present, PolicyNumber.
DROP_ALWAYS = [LABEL_COL]
if "PolicyNumber" in claims.columns:
    DROP_ALWAYS.append("PolicyNumber")

Xc_all = claims.drop(columns=DROP_ALWAYS, errors="ignore")

num_cols_c = Xc_all.select_dtypes(include=[np.number]).columns.tolist()

# Categorical = object, string or category dtype. Selecting only "object"
# would silently drop text columns stored with a dedicated string dtype (or as
# category), which would then vanish from the feature matrix without notice.
_numeric = set(num_cols_c)
cat_cols_c = [
    col for col, dt in Xc_all.dtypes.items()
    if col not in _numeric and (
        pd.api.types.is_object_dtype(dt)
        or pd.api.types.is_string_dtype(dt)
        or isinstance(dt, pd.CategoricalDtype)
    )
]

# Make any remaining exclusion (e.g. bool or datetime columns) visible.
_unused = [col for col in Xc_all.columns if col not in _numeric and col not in cat_cols_c]
if _unused:
    print("WARNING: columns not used as features (unsupported dtype):", _unused)

Xc = Xc_all[num_cols_c + cat_cols_c].copy()

print("Claims pos_rate:", yc.mean())
print("Num cols:", len(num_cols_c), "Cat cols:", len(cat_cols_c))

In [None]:
# Both splits hold out 20% with the same seed, stratified on the label:
# first train+val vs. test, then train vs. val inside the remainder.
_holdout = {"test_size": 0.20, "random_state": SEED}

Xc_train, Xc_test, yc_train, yc_test = train_test_split(Xc, yc, stratify=yc, **_holdout)
Xc_tr, Xc_val, yc_tr, yc_val = train_test_split(Xc_train, yc_train, stratify=yc_train, **_holdout)

print(
    "Claims splits:",
    "train", Xc_tr.shape,
    "val", Xc_val.shape,
    "test", Xc_test.shape,
    "pos_rate_test", yc_test.mean(),
)

In [None]:
# Preprocessing: median-impute numeric columns; impute categorical columns
# with the most frequent value and one-hot encode them (unknown categories
# seen at transform time are ignored). Empty groups are skipped.
transformers_c = []
if len(num_cols_c) > 0:
    transformers_c.append(("num", Pipeline([("imp", SimpleImputer(strategy="median"))]), num_cols_c))
if len(cat_cols_c) > 0:
    transformers_c.append(("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")),
                                           ("ohe", OneHotEncoder(handle_unknown="ignore"))]), cat_cols_c))

pre_c = ColumnTransformer(transformers=transformers_c, remainder="drop")

# Fit the preprocessing on the training split only, then reuse it unchanged.
Xc_tr_t   = pre_c.fit_transform(Xc_tr)
Xc_val_t  = pre_c.transform(Xc_val)
Xc_test_t = pre_c.transform(Xc_test)

# Class-imbalance weight: negatives per positive in the training split.
neg, pos = int((yc_tr==0).sum()), int((yc_tr==1).sum())
spw_c = neg / max(pos, 1)

model_c = XGBClassifier(
    n_estimators=2500,            # upper bound; early stopping picks the actual count
    learning_rate=0.05,
    max_depth=5,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_lambda=1.0,
    random_state=SEED,
    n_jobs=-1,
    eval_metric="aucpr",
    # Without this the eval_set passed to fit() is only monitored and all
    # 2500 trees are always built. Stop once validation PR-AUC has not
    # improved for 100 rounds.
    early_stopping_rounds=100,
    scale_pos_weight=spw_c,
    tree_method="hist"
)

model_c.fit(Xc_tr_t, yc_tr, eval_set=[(Xc_val_t, yc_val)], verbose=False)
print("Claims best iteration:", model_c.best_iteration)

# The decision threshold is tuned on validation scores and then applied to test.
proba_c_val = model_c.predict_proba(Xc_val_t)[:, 1]
thr_c = pick_threshold_max_f1(yc_val, proba_c_val)

proba_c_test = model_c.predict_proba(Xc_test_t)[:, 1]
claims_metrics = evaluate_binary_plus(yc_test, proba_c_test, thr_c, "Claims_Fraud_XGB")
display(pd.DataFrame([claims_metrics]))

plot_roc(yc_test, proba_c_test, "Claims — ROC")
plot_pr(yc_test, proba_c_test, "Claims — PR")
plot_confusion(yc_test, proba_c_test, thr_c, f"Claims — Confusion (thr={thr_c:.3f})")

# Persist everything needed to score new rows: preprocessing, model, columns, threshold.
joblib.dump(
    {"pre": pre_c, "model": model_c, "num_cols": num_cols_c, "cat_cols": cat_cols_c, "threshold": thr_c, "dropped": DROP_ALWAYS},
    OUT_DIR / "claim_fraud_oracle_model.joblib"
)
print("Saved Claims model.")

In [None]:
def _top_cases(scored, score_col, n=200):
    """Return a copy of the `n` rows of `scored` with the highest `score_col`."""
    ranked = scored.sort_values(score_col, ascending=False)
    return ranked.head(n).copy()

# Freddie scoring: score every row of freddie_ds with the fitted pipeline.
freddie_scored = freddie_ds.copy()
Xf_all = freddie_scored[num_cols_f + cat_cols_f].copy()
Xf_all_t = pre_f.transform(Xf_all)
freddie_scored["p_default_future6m"] = model_f.predict_proba(Xf_all_t)[:, 1]

freddie_top = _top_cases(freddie_scored, "p_default_future6m")
freddie_out = OUT_DIR / "freddie_top200_default_future6m_cases.csv"
freddie_top.to_csv(freddie_out, index=False)

# Claims scoring: same procedure on the full claims table.
claims_scored = claims.copy()
Xc_all2 = claims_scored.drop(columns=DROP_ALWAYS, errors="ignore")[num_cols_c + cat_cols_c].copy()
Xc_all2_t = pre_c.transform(Xc_all2)
claims_scored["p_claim_suspicious"] = model_c.predict_proba(Xc_all2_t)[:, 1]

claims_top = _top_cases(claims_scored, "p_claim_suspicious")
claims_out = OUT_DIR / "claims_top200_suspicious_cases.csv"
claims_top.to_csv(claims_out, index=False)

print("Saved outputs:")
for _written in (freddie_out, claims_out):
    print(" -", _written)

# Preview the ten highest-scored rows of each export.
_freddie_preview_cols = ["loan_sequence_number","p_default_future6m","dq_status_lag1","roll_6_serious_dq"]
_claims_preview_cols = ["p_claim_suspicious","FraudFound_P"]
display(freddie_top[_freddie_preview_cols].head(10))
display(claims_top[_claims_preview_cols].head(10))