In [2]:
import math, re, json
import numpy as np
import pandas as pd
from datetime import datetime
from typing import Optional, Dict, Tuple

from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from joblib import dump

from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor

from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# ====== Project config ======
TARGET = "Approved Benefit Amount"

# Columns that leak post-decision info; exclude from features
LEAKY_COLUMNS = [
    TARGET,
    "Posted Date",
    "Agreed Tenant Settlement",
    "Agreed Tenant Settlement Date",
    "Collected Date",
    "Collected Amount",
    "Collection Processed Date",
    "Review Claim Adjudication",
    "Review Tenant Information",
    "Update YRIG Policy Info",
    "Open Collections",
    "PM Notification of Claim Received",
    "Send to Collections",
    "Audit Selection",
    "Approval Date",
]

CURRENCY_LIKE = ["Max Benefit", "Monthly Rent", "Amount of Claim", "Approved Benefit Amount"]
DATE_LIKE = [
    "Claim Date","Lease Start Date","Lease End Date",
    "Move-Out Date","Posted Date","Agreed Tenant Settlement Date",
    "Collected Date","Collection Processed Date",
]
ZIP_COLS = ["Lease Zip"]

# Date differences to compute (A - B) in days
DATE_DIFFS = [
    ("Claim Date", "Move-Out Date", "days_claim_minus_moveout"),
    ("Lease End Date", "Lease Start Date", "days_lease_duration"),
    ("Move-Out Date", "Lease Start Date", "days_moveout_minus_lease_start"),
]


In [3]:
df_raw = pd.read_csv("depositClaimsData.csv")
print(df_raw.shape)

(1244, 47)


In [4]:
def _to_datetime(s: pd.Series) -> pd.Series:
    return pd.to_datetime(s, errors="coerce", infer_datetime_format=True)

def _strip_currency(x):
    if pd.isna(x):
        return np.nan
    if isinstance(x, (int, float)):
        return float(x)
    s = re.sub(r"[^0-9.\-]", "", str(x))  # keep digits, dot, minus

    # Guard: tokens that aren't valid numbers ('.', '-', '-.', '.-')
    if s in {"", ".", "-", "-.", ".-"}:
        return np.nan

    # Guard: multiple dots -> keep first, drop the rest (e.g., '1.234.56' -> '1.23456')
    if s.count(".") > 1:
        parts = s.split(".")
        s = parts[0] + "." + "".join(parts[1:])

    try:
        return float(s)
    except Exception:
        return np.nan

def _first5_zip(x) -> Optional[str]:
    if pd.isna(x): return np.nan
    m = re.search(r"(\d{5})", str(x))
    return m.group(1) if m else np.nan

def _safe_div(n, d):
    if pd.isna(n) or pd.isna(d) or d == 0: return np.nan
    return n / d

def _date_parts(prefix: str, s: pd.Series) -> pd.DataFrame:
    return pd.DataFrame({
        f"{prefix}_year": s.dt.year,
        f"{prefix}_month": s.dt.month,
        f"{prefix}_day": s.dt.day,
        f"{prefix}_dow": s.dt.dayofweek,
        f"{prefix}_is_month_start": s.dt.is_month_start.astype(float),
        f"{prefix}_is_month_end": s.dt.is_month_end.astype(float),
    })

def _yn_to_int(x):
    xs = str(x).strip().lower()
    if xs in ["yes","y","true","1"]: return 1
    if xs in ["no","n","false","0"]: return 0
    return np.nan


In [5]:
def preprocess_df(df: pd.DataFrame, target_col: str) -> Tuple[pd.DataFrame, pd.Series, Dict]:
    df = df.copy()
    df.columns = [c.strip() for c in df.columns]

    if target_col not in df.columns:
        raise ValueError(f"Target '{target_col}' not found. Columns: {list(df.columns)}")

    # Currency → float
    for c in CURRENCY_LIKE:
        if c in df.columns:
            df[c] = df[c].apply(_strip_currency)

    # Dates → datetime
    for c in DATE_LIKE:
        if c in df.columns:
            df[c] = _to_datetime(df[c])

    # ZIP → first 5
    for c in ZIP_COLS:
        if c in df.columns:
            df[c] = df[c].apply(_first5_zip).astype("object")

    # Clean objects (strip & treat empty as NaN)
    for c in df.select_dtypes(include="object").columns:
        df[c] = df[c].astype(str).str.strip().replace({"": np.nan, "nan": np.nan})

    # Date deltas
    for a,b,name in DATE_DIFFS:
        if a in df.columns and b in df.columns:
            df[name] = (df[a] - df[b]).dt.days

    # Calendar features (claim/moveout/lease_start)
    if "Claim Date" in df.columns:
        df = pd.concat([df, _date_parts("claim", df["Claim Date"])], axis=1)
    if "Move-Out Date" in df.columns:
        df = pd.concat([df, _date_parts("moveout", df["Move-Out Date"])], axis=1)
    if "Lease Start Date" in df.columns:
        df = pd.concat([df, _date_parts("lease_start", df["Lease Start Date"])], axis=1)

    # Ratios
    if {"Amount of Claim","Max Benefit"}.issubset(df.columns):
        df["claim_to_max_ratio"] = df.apply(lambda r: _safe_div(r["Amount of Claim"], r["Max Benefit"]), axis=1)
    if {"Amount of Claim","Monthly Rent"}.issubset(df.columns):
        df["claim_to_rent_ratio"] = df.apply(lambda r: _safe_div(r["Amount of Claim"], r["Monthly Rent"]), axis=1)

    # Tenants
    for col in ["Is there a 2nd Tenant?","Is there a 3rd Tenant?"]:
        if col in df.columns:
            df[col] = df[col].apply(_yn_to_int)

    df["num_tenants_reported"] = (
        (df["Is there a 2nd Tenant?"] if "Is there a 2nd Tenant?" in df.columns else 0).fillna(0) +
        (df["Is there a 3rd Tenant?"] if "Is there a 3rd Tenant?" in df.columns else 0).fillna(0) + 1
    )

    # Target
    y = df[target_col].astype(float).copy()

    # Drop leaky & explicit IDs
    to_drop = [c for c in LEAKY_COLUMNS if c in df.columns and c != target_col]
    for maybe_id in ["Tracking Number", "Group #", "Treaty #", "Policy"]:
        if maybe_id in df.columns: to_drop.append(maybe_id)

    df.drop(columns=to_drop + [target_col], inplace=True, errors="ignore")

    meta = {"dropped_columns": to_drop, "target": target_col}
    return df, y, meta


In [6]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

def build_preprocessor(X: pd.DataFrame) -> ColumnTransformer:
    numeric_selector = make_column_selector(dtype_include=np.number)
    categorical_selector = make_column_selector(dtype_include=object)

    numeric_pipe = Pipeline(steps=[
        ("impute", SimpleImputer(strategy="median")),
    ])

    categorical_pipe = Pipeline(steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])

    pre = ColumnTransformer(
        transformers=[
            ("num", numeric_pipe, numeric_selector),
            ("cat", categorical_pipe, categorical_selector),
        ],
        remainder="drop",
        verbose_feature_names_out=False,
    )
    return pre


In [7]:
# Preprocess first (handles currency/dates), then drop bad targets
X_tmp, y_tmp, meta = preprocess_df(df_raw, TARGET)

mask = y_tmp.notna() & np.isfinite(y_tmp.astype(float))
X = X_tmp.loc[mask].reset_index(drop=True)
y = y_tmp.loc[mask].astype(float).reset_index(drop=True)

print("After dropping NaN target rows:", X.shape, y.shape)

# Rebuild the preprocessor on the filtered X
pre = build_preprocessor(X)


  return pd.to_datetime(s, errors="coerce", infer_datetime_format=True)
  return pd.to_datetime(s, errors="coerce", infer_datetime_format=True)
  return pd.to_datetime(s, errors="coerce", infer_datetime_format=True)
  return pd.to_datetime(s, errors="coerce", infer_datetime_format=True)
  return pd.to_datetime(s, errors="coerce", infer_datetime_format=True)
  return pd.to_datetime(s, errors="coerce", infer_datetime_format=True)
  return pd.to_datetime(s, errors="coerce", infer_datetime_format=True)
  return pd.to_datetime(s, errors="coerce", infer_datetime_format=True)
  return pd.to_datetime(s, errors="coerce", infer_datetime_format=True)
  return pd.to_datetime(s, errors="coerce", infer_datetime_format=True)
  return pd.to_datetime(s, errors="coerce", infer_datetime_format=True)
  return pd.to_datetime(s, errors="coerce", infer_datetime_format=True)
  return pd.to_datetime(s, errors="coerce", infer_datetime_format=True)
  return pd.to_datetime(s, errors="coerce", infer_datetime_forma

After dropping NaN target rows: (1212, 52) (1212,)


In [8]:
from sklearn.model_selection import train_test_split

if len(X) >= 10:
    test_size = 0.2
elif len(X) >= 5:
    test_size = 0.33
elif len(X) >= 3:
    test_size = 0.5
else:
    test_size = 0.5  

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size, random_state=42)
len(X_train), len(X_val)

(969, 243)

In [26]:
from joblib import Memory
memory = Memory(location="./sk_cache", verbose=0)  # caches preprocessing across CV folds

from sklearn.pipeline import Pipeline

# --- XGBoost on GPU
from xgboost import XGBRegressor
# xgb = Pipeline(
#     [
#         ("prep", pre),
#         ("model", XGBRegressor(
#             n_estimators=1000,
#             learning_rate=0.05,
#             max_depth=6,
#             subsample=0.8,
#             colsample_bytree=0.8,
#             reg_lambda=1.0,
#             random_state=42,
#             device="cuda",       # <-- GPU enable (v3.x)
#             tree_method="hist",  # <-- unified hist (not 'gpu_hist')
#             predictor="auto",
#             n_jobs=1,
#             verbosity=0,
#         ))
#     ],
#     memory=memory
# )
xgb = Pipeline([
    ("prep", pre),
    ("model", XGBRegressor(
        n_estimators=4000,
        learning_rate=0.015,
        max_depth=10,
        min_child_weight=12,
        subsample=0.8,
        colsample_bytree=0.8,
        colsample_bynode=0.8,
        reg_lambda=8.0,
        reg_alpha=0.5,
        gamma=0.1,
        max_bin=512,             
        tree_method="hist",
        predictor="auto",
        n_jobs=16,
        random_state=42,
        verbosity=1
    ))
], memory=memory)


# --- CatBoost on GPU
from catboost import CatBoostRegressor
cat = Pipeline(
    [
        ("prep", pre),
        ("model", CatBoostRegressor(
            depth=6,
            learning_rate=0.05,
            n_estimators=1000,
            l2_leaf_reg=3.0,
            random_state=42,
            loss_function="RMSE",
            task_type="GPU",   # << GPU training
            devices="0",
            verbose=False,
            allow_writing_files=False
        ))
    ],
    memory=memory
)



In [27]:
import math, numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def rmse(y_true, y_pred):
    return math.sqrt(mean_squared_error(y_true, y_pred))

def rmse_scorer(estimator, X, y):
    pred = estimator.predict(X)
    return -rmse(y, pred)  # sklearn maximizes scores

def choose_cv_splits(n_train: int):
    if n_train >= 5: return 5
    if n_train >= 3: return 2
    return None

def eval_model(name, pipe, X_train, y_train, X_val, y_val):
    n_train = len(X_train)
    cv_k = choose_cv_splits(n_train)
    cv_mean = cv_std = None
    if cv_k:
        kf = KFold(n_splits=cv_k, shuffle=True, random_state=42)
        cvs = cross_val_score(pipe, X_train, y_train, scoring=rmse_scorer, cv=kf, n_jobs=-1, error_score='raise')
        cv_mean, cv_std = float(np.mean(-cvs)), float(np.std(-cvs))
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_val)
    return {
        "name": name,
        "cv_rmse_mean": cv_mean,
        "cv_rmse_std": cv_std,
        "val_rmse": float(rmse(y_val, pred)),
        "val_mae": float(mean_absolute_error(y_val, pred)),
        "val_r2": float(r2_score(y_val, pred)),
    }

results = []
# results.append(eval_model("Lasso", lasso, X_train, y_train, X_val, y_val))
# results.append(eval_model("RandomForest", rf, X_train, y_train, X_val, y_val))
# results.append(eval_model("HGB (defaults)", hgb, X_train, y_train, X_val, y_val))
results.append(eval_model("XGB (defaults)", xgb, X_train, y_train, X_val, y_val))
# results.append(eval_model("CatBoost (defaults)", cat, X_train, y_train, X_val, y_val))
results

Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[{'name': 'XGB (defaults)',
  'cv_rmse_mean': 490.77515675259053,
  'cv_rmse_std': 35.035234405296414,
  'val_rmse': 523.8864104810399,
  'val_mae': 292.1122234057988,
  'val_r2': 0.7790118660793585}]

In [11]:
##################################

In [21]:
###########################################