In [14]:
import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    OneHotEncoder, KBinsDiscretizer, PowerTransformer,
    PolynomialFeatures, FunctionTransformer
)
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression, VarianceThreshold
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# Random seed passed to train_test_split, plus the CSV paths read and written
# by this script.
SEED = 42
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"
OUT_FILE = "submission_refactored.csv"

# Fail fast: the training CSV is mandatory, so abort before any work is done
# if it is not a regular file in the working directory.
if not os.path.isfile(TRAIN_FILE):
    raise FileNotFoundError(TRAIN_FILE)


def safe_log_plus_one(series: pd.Series) -> pd.Series:
    """Apply ``log1p`` to *series* after coercing it to numeric.

    Values that cannot be parsed as numbers become NaN. If nothing is left
    after coercion the all-NaN series is returned untouched. When the
    smallest value is negative, the whole series is shifted up by
    ``abs(min) + 1`` so that every observed value is at least 1 before the
    logarithm. Remaining NaNs are replaced by 0 (after the shift), so they
    map to ``log1p(0) == 0``.
    """
    values = pd.to_numeric(series, errors="coerce")
    if not values.notna().any():
        return values

    lowest = values.min(skipna=True)
    has_negative = pd.notna(lowest) and lowest < 0
    shifted = values + abs(lowest) + 1 if has_negative else values
    return np.log1p(shifted.fillna(0))


def blended_weight(y, a, b):
    """Return the least-squares blend weight for two prediction vectors.

    Finds the ``w`` that minimises ``||y - (w * a + (1 - w) * b)||^2`` and
    clips it to the interval [0, 1].

    Parameters
    ----------
    y : array-like
        Target values.
    a, b : array-like
        The two prediction vectors; ``w`` is the weight given to ``a``.
        All three inputs are converted to flat float arrays, so lists,
        tuples and column vectors are accepted as well as 1-D arrays.

    Returns
    -------
    float
        The clipped weight, or 0.5 when ``a`` and ``b`` are identical (any
        weight gives the same blend, so the midpoint is returned).
    """
    # Coerce once so that plain Python sequences support the arithmetic below
    # and (n, 1) column vectors do not break np.dot.
    y = np.asarray(y, dtype=float).ravel()
    a = np.asarray(a, dtype=float).ravel()
    b = np.asarray(b, dtype=float).ravel()

    d = a - b
    denom = np.dot(d, d)
    if denom == 0:
        return 0.5
    # Closed-form solution of the one-dimensional least-squares problem.
    w = np.dot(d, y - b) / denom
    return float(np.clip(w, 0, 1))


# The training CSV is mandatory. The test CSV is optional: it is loaded only
# when TEST_FILE is a regular file (os.path.isfile, matching the TRAIN_FILE
# guard), so a directory with that name yields None instead of a read_csv crash.
train_raw = pd.read_csv(TRAIN_FILE)
test_raw = pd.read_csv(TEST_FILE) if os.path.isfile(TEST_FILE) else None


def initial_cleanup(df: pd.DataFrame, train_mode=True) -> pd.DataFrame:
    """Return a cleaned, feature-augmented copy of *df*.

    The input frame is never modified. Steps, in order:

    1. In train mode, keep only rows whose ``RiskScore`` lies in [0, 100].
    2. Turn the sentinel value ``-9999999.0`` into NaN everywhere.
    3. Split ``ApplicationDate`` into year / month / day-of-week integers
       (unparseable dates give 0 in all three).
    4. Add ratio and interaction features from the raw monetary columns.
    5. Clip selected monetary columns to their own 1st-99th percentiles.
    6. Log-transform selected monetary columns via ``safe_log_plus_one``.
    7. Replace any +/-inf with NaN.

    Every step is skipped silently for columns that are not present.
    """
    frame = df.copy()

    if train_mode and "RiskScore" in frame.columns:
        in_range = frame["RiskScore"].between(0, 100)
        frame = frame.loc[in_range].copy()

    frame = frame.replace(-9999999.0, np.nan)

    if "ApplicationDate" in frame.columns:
        stamps = pd.to_datetime(frame["ApplicationDate"], errors="coerce").dt
        date_parts = (
            ("App_Year", stamps.year),
            ("App_Month", stamps.month),
            ("App_DayOfWeek", stamps.dayofweek),
        )
        for part_name, part_values in date_parts:
            frame[part_name] = part_values.fillna(0).astype(int)

    # Small offset keeps the denominators away from an exact zero.
    tiny = 1e-6
    ratio_specs = (
        ("PaymentToIncomeRatio", "MonthlyLoanPayment", "MonthlyIncome"),
        ("LoanToIncomeRatio", "LoanAmount", "AnnualIncome"),
        ("DebtToAssetsRatio", "TotalLiabilities", "TotalAssets"),
        ("SavingsToLoanRatio", "SavingsAccountBalance", "LoanAmount"),
    )
    for ratio_name, top, bottom in ratio_specs:
        if top in frame.columns and bottom in frame.columns:
            frame[ratio_name] = frame[top] / (frame[bottom] + tiny)

    if "LengthOfCreditHistory" in frame.columns and "PaymentHistory" in frame.columns:
        frame["CreditHistoryInteraction"] = (
            frame["LengthOfCreditHistory"] * frame["PaymentHistory"]
        )

    if "MonthlyIncome" in frame.columns and "CreditScore" in frame.columns:
        income = frame["MonthlyIncome"].fillna(0)
        score = frame["CreditScore"].fillna(0)
        frame["Income_x_CreditScore"] = income * score

    # Winsorise after the ratios so the ratios are built from unclipped values.
    to_clip = (
        "MonthlyIncome", "LoanAmount", "AnnualIncome", "SavingsAccountBalance",
        "TotalAssets", "TotalLiabilities", "MonthlyDebtPayments",
    )
    for col in to_clip:
        if col not in frame.columns:
            continue
        lower, upper = frame[col].quantile([0.01, 0.99])
        frame[col] = frame[col].clip(lower, upper)

    to_log = (
        "MonthlyIncome", "LoanAmount", "SavingsAccountBalance", "CheckingAccountBalance",
        "TotalAssets", "TotalLiabilities", "NetWorth", "MonthlyDebtPayments",
    )
    for col in to_log:
        if col not in frame.columns:
            continue
        frame[col] = safe_log_plus_one(frame[col])

    return frame.replace([np.inf, -np.inf], np.nan)


# Clean both datasets; only the training frame gets the RiskScore range filter
# (train_mode=True). `test` stays None when no test CSV was loaded.
train = initial_cleanup(train_raw, True)
test = initial_cleanup(test_raw, False) if test_raw is not None else None


# Name of the regression target column.
TARGET = "RiskScore"

# Candidate numeric features: raw columns plus the ratio, interaction and date
# features created by initial_cleanup.
num_raw = [
    'CreditScore','MonthlyIncome','BaseInterestRate','LoanAmount','LoanDuration',
    'DebtToIncomeRatio','NumberOfDependents','NumberOfOpenCreditLines','NumberOfCreditInquiries',
    'PaymentHistory','LengthOfCreditHistory','UtilityBillsPaymentHistory','MonthlyDebtPayments',
    'CreditCardUtilizationRate','InterestRate','TotalDebtToIncomeRatio','SavingsAccountBalance',
    'CheckingAccountBalance','TotalAssets','TotalLiabilities','NetWorth','JobTenure','Experience','Age',
    'BankruptcyHistory','PreviousLoanDefaults','PaymentToIncomeRatio','LoanToIncomeRatio','DebtToAssetsRatio',
    'SavingsToLoanRatio','CreditHistoryInteraction','Income_x_CreditScore',
    'App_Year','App_Month','App_DayOfWeek'
]

# Candidate categorical features, and the numeric columns that are routed to
# the binning branch instead of the plain numeric branch.
cat_raw = ["MaritalStatus","HomeOwnershipStatus","EmploymentStatus","EducationLevel","LoanPurpose"]
binned_candidates = ["Age", "CreditScore"]

# Keep only the candidates that actually exist in the cleaned training frame.
nums = [c for c in num_raw if c in train]
cats = [c for c in cat_raw if c in train]
bnd = [c for c in binned_candidates if c in train]
# Binned columns are excluded from the numeric list so each column is used by
# exactly one branch.
nums_final = [c for c in nums if c not in bnd]
all_cols = nums_final + cats + bnd


def high_corr_drop(df, cols, threshold=0.95):
    """List the columns that are highly correlated with an earlier column.

    Only the numeric columns among *cols* are considered. A column is
    reported when its absolute Pearson correlation with any column that
    precedes it (in *cols* order) is strictly greater than *threshold*, so
    the first member of each correlated group is always kept.

    Returns the flagged column names as a list, in column order.
    """
    numeric_part = df[cols].select_dtypes(include=[np.number])
    abs_corr = numeric_part.corr().abs()

    # Strict upper triangle: every pair is examined once and the diagonal
    # (self-correlation of 1.0) is ignored.
    above_diagonal = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    upper_only = abs_corr.where(above_diagonal)

    flagged = (upper_only > threshold).any(axis=0)
    return [
        name
        for name, is_hit in zip(upper_only.columns, flagged.to_numpy())
        if is_hit
    ]

# Remove numeric features that are near-duplicates (|corr| > 0.95) of an
# earlier feature, then rebuild the combined column list.
drop_cols = high_corr_drop(train, nums_final)
if drop_cols:
    nums_final = [x for x in nums_final if x not in drop_cols]
    all_cols = nums_final + cats + bnd


# Rows without a target cannot be used for supervised training.
train = train.dropna(subset=[TARGET])
X = train[all_cols]
y = train[TARGET]

# Hold out 20% for validation. Stratifying on six equal-width target bins
# keeps the target distribution similar in both parts.
try:
    y_bins = pd.cut(y, bins=6, labels=False)
    X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2,
                                              random_state=SEED, stratify=y_bins)
except ValueError:
    # Stratification is impossible (e.g. a bin with fewer than two rows):
    # fall back to a plain random split. Only ValueError is caught so that
    # KeyboardInterrupt and genuine programming errors still surface.
    X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=SEED)


# Numeric branch: median imputation followed by a Yeo-Johnson power transform.
numeric_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("pt", PowerTransformer(method="yeo-johnson"))
])

# Categorical branch: missing values become the literal category "Missing",
# then dense one-hot encoding; categories unseen during fit are ignored.
cat_pipe = Pipeline([
    ("imp", SimpleImputer(fill_value="Missing", strategy="constant")),
    ("oh", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# Binning branch: median imputation, then up to 40 quantile bins per column,
# one-hot encoded as a dense matrix.
bin_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("kb", KBinsDiscretizer(n_bins=40, encode="onehot-dense", strategy="quantile"))
])

# Route each column group to its branch. Columns not listed here are dropped
# (ColumnTransformer's default `remainder`).
col_tf = ColumnTransformer([
    ("num", numeric_pipe, nums_final),
    ("cat", cat_pipe, cats),
    ("bin", bin_pipe, bnd)
])

# Full feature pipeline: preprocess -> drop near-constant features ->
# univariate selection -> degree-3 polynomial expansion -> second selection.
# NOTE(review): k1 is capped by X_tr.shape[1], the number of raw input columns,
# not by the number of features produced by "prep" — confirm this is intended.
# NOTE(review): min(2000, 5000) always evaluates to 2000; "k2" presumably
# requires the polynomial expansion to yield at least 2000 columns — verify
# against the SelectKBest documentation.
feature_pipe = Pipeline([
    ("prep", col_tf),
    ("vt", VarianceThreshold(1e-5)),
    ("k1", SelectKBest(f_regression, k=min(1000, X_tr.shape[1]))),
    ("poly", PolynomialFeatures(degree=3, interaction_only=False, include_bias=False)),
    ("k2", SelectKBest(f_regression, k=min(2000, 5000)))
])

# Fit on the training split only; the validation split is transformed with the
# already-fitted pipeline.
Xt_tr = feature_pipe.fit_transform(X_tr, y_tr)
Xt_va = feature_pipe.transform(X_va)

print("Shapes:", Xt_tr.shape, Xt_va.shape)


# Search Ridge alphas on a log grid spanning one decade either side of a
# previously found value (21 points).
prev_alpha = 43.28761281083057
grid = np.logspace(np.log10(prev_alpha/10), np.log10(prev_alpha*10), 21)

# Best configuration seen so far; 1e9 is the "nothing found yet" sentinel.
best = {"mse": 1e9}

# The ordinary least-squares model does not depend on alpha, so it is fitted
# once (lazily, inside the try so a failure is reported like any other) and
# its clipped validation predictions are reused by every iteration.
model_l = None
p_l = None

for alpha in grid:
    try:
        if p_l is None:
            model_l = LinearRegression().fit(Xt_tr, y_tr)
            p_l = np.clip(model_l.predict(Xt_va), 0, 100)

        model_r = Ridge(alpha=float(alpha)).fit(Xt_tr, y_tr)
        p_r = np.clip(model_r.predict(Xt_va), 0, 100)

        # Closed-form optimal blend weight, then a fine scan of +/-0.05
        # around it because clipping the blend makes the optimum inexact.
        w_opt = blended_weight(y_va.values, p_l, p_r)
        w_min, w_max = max(0, w_opt - 0.05), min(1, w_opt + 0.05)

        if w_max <= w_min:
            weights = [w_opt]
        else:
            weights = np.arange(w_min, w_max + 1e-12, 0.001)

        best_local = (None, 1e9)
        for w in weights:
            mix = np.clip(w * p_l + (1 - w) * p_r, 0, 100)
            mse = mean_squared_error(y_va, mix)
            if mse < best_local[1]:
                best_local = (w, mse)

        if best_local[1] < best["mse"]:
            best.update({
                "alpha": float(alpha),
                "w": float(best_local[0]),
                "mse": float(best_local[1]),
                "mse_lr": float(mean_squared_error(y_va, p_l)),
                "mse_ridge": float(mean_squared_error(y_va, p_r))
            })

        print(f"a={alpha:.5g} | w*={w_opt:.4f} | best_w={best_local[0]:.4f} | mse={best_local[1]:.6f}")

    except Exception as err:
        # Best-effort search: report the failing alpha and carry on.
        print("Failed:", alpha, "|", err)


print("\nBEST:", best)


# Refit on train + validation with the selected alpha and blend weight, then
# write the submission. Skipped when the grid search found nothing (the
# sentinel mse of 1e9 is still in place).
if best["mse"] < 1e8:
    a, w = best["alpha"], best["w"]

    X_full = pd.concat([X_tr, X_va])
    y_full = pd.concat([y_tr, y_va])

    # The feature pipeline is re-fitted on all labelled data before the test
    # set (if any) is transformed with it.
    Xt_full = feature_pipe.fit_transform(X_full, y_full)
    if test is None:
        Xt_test = None
    else:
        Xt_test = feature_pipe.transform(test[all_cols])

    modelL = LinearRegression().fit(Xt_full, y_full)
    modelR = Ridge(alpha=a).fit(Xt_full, y_full)

    if Xt_test is not None:
        pL = np.clip(modelL.predict(Xt_test), 0, 100)
        pR = np.clip(modelR.predict(Xt_test), 0, 100)
        final = np.clip(w * pL + (1 - w) * pR, 0, 100)

        # Use the test set's own IDs when present, otherwise a 0..n-1 range.
        out = pd.DataFrame(
            {
                "ID": np.arange(len(final)) if "ID" not in test else test["ID"],
                "RiskScore": final,
            }
        )
        out.to_csv(OUT_FILE, index=False)
        print("Saved:", OUT_FILE)


Shapes: (8217, 2000) (2055, 2000)
a=4.3288 | w*=0.0000 | best_w=0.0000 | mse=29.166091
a=5.4496 | w*=0.0000 | best_w=0.0000 | mse=28.959012
a=6.8606 | w*=0.0000 | best_w=0.0000 | mse=28.769466
a=8.637 | w*=0.0000 | best_w=0.0000 | mse=28.596657
a=10.873 | w*=0.0000 | best_w=0.0000 | mse=28.439481
a=13.689 | w*=0.0000 | best_w=0.0000 | mse=28.296798
a=17.233 | w*=0.0000 | best_w=0.0000 | mse=28.167643
a=21.695 | w*=0.0000 | best_w=0.0000 | mse=28.051372
a=27.313 | w*=0.0000 | best_w=0.0000 | mse=27.947750
a=34.385 | w*=0.0000 | best_w=0.0000 | mse=27.857071
a=43.288 | w*=0.0016 | best_w=0.0020 | mse=27.780251
a=54.496 | w*=0.0067 | best_w=0.0070 | mse=27.718386
a=68.606 | w*=0.0123 | best_w=0.0120 | mse=27.673817
a=86.37 | w*=0.0188 | best_w=0.0190 | mse=27.650065
a=108.73 | w*=0.0263 | best_w=0.0260 | mse=27.651689
a=136.89 | w*=0.0350 | best_w=0.0350 | mse=27.684156
a=172.33 | w*=0.0452 | best_w=0.0450 | mse=27.753584
a=216.95 | w*=0.0573 | best_w=0.0573 | mse=27.866292
a=273.13 | w*=