In [9]:
import pandas as pd, numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

In [10]:
# Location of the training CSV, resolved relative to the kernel's working directory.
DATA_PATH = "train.csv"
# Raw training frame; the next cell filters it and derives the feature matrix from it.
df = pd.read_csv(DATA_PATH)

In [11]:
from sklearn.model_selection import train_test_split

# --- Tunable settings for this experiment ---------------------------------
# TOP_K: how many of the most target-correlated numeric columns get a polynomial expansion.
TOP_K = 8
# DEGREE: degree passed to PolynomialFeatures for those TOP_K columns.
DEGREE = 4
# ALPHA: regularisation strength passed to Ridge.
ALPHA = 1.1
# ABS_CLIP_OUTLIERS: rows whose |RiskScore| exceeds this value are dropped (also reused by the final fit cell).
ABS_CLIP_OUTLIERS = 1000

# Keep only rows with an in-range target. A NaN RiskScore compares False here,
# so rows with a missing target are dropped as well.
df = df.loc[df["RiskScore"].abs() <= ABS_CLIP_OUTLIERS].copy()

X = df.copy()

# Columns that are one-hot encoded further down in this cell.
categorial_feature = ['MaritalStatus', 'HomeOwnershipStatus', 'LoanPurpose', 'EmploymentStatus', 'EducationLevel']
# Every numeric column except the target becomes a numeric feature.
X_num = X.select_dtypes(include=[np.number]).drop(columns=["RiskScore"], errors="ignore")
X_categorial = X[categorial_feature].copy()
# Target as a plain float array (positional, no index).
y = X["RiskScore"].astype(float).values

# Winsorise: clip each numeric column to its own 1st / 99th percentile.
# NOTE(review): these quantiles (and the imputer below) are computed on all rows,
# before the train/validation split at the end of this cell, so validation rows
# influence the preprocessing statistics - confirm this is acceptable.
quantile_min = X_num.quantile(0.01)
quantile_max = X_num.quantile(0.99)
X_wins = X_num.clip(lower=quantile_min, upper=quantile_max, axis=1)

# Fill remaining missing values with the per-column median.
# The result is re-wrapped in a DataFrame, which gives it a fresh 0..n-1 index.
imp = SimpleImputer(strategy="median")
X_imp = pd.DataFrame(imp.fit_transform(X_wins), columns=X_wins.columns)


def split_date(df):
    """Break the ``ApplicationDate`` column into calendar components.

    Values that cannot be parsed as dates are coerced to NaT, which yields
    missing values in every output column for that row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``ApplicationDate`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``year``, ``quarter``, ``month`` and ``dow`` (day of week,
        Monday = 0), indexed like ``df``.
    """
    parsed = pd.to_datetime(df['ApplicationDate'], errors='coerce')
    # Output column name -> attribute of the ``.dt`` accessor that produces it.
    component_of = (
        ('year', 'year'),
        ('quarter', 'quarter'),
        ('month', 'month'),
        ('dow', 'dayofweek'),
    )
    parts = {column: getattr(parsed.dt, attribute) for column, attribute in component_of}
    return pd.DataFrame(parts, index=df.index)


# Standardise the imputed numeric features; the result has a fresh 0..n-1 index.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X_imp), columns=X_imp.columns)

# Rank numeric features by absolute correlation with the target (DataFrame.corr
# with its default method) and keep the TOP_K strongest for polynomial expansion.
corr = X_scaled.assign(RiskScore=y).corr()["RiskScore"].drop("RiskScore").abs().sort_values(ascending=False)
top_numeric = corr.head(TOP_K).index.tolist()
# Remaining numeric features enter the model linearly only.
other_columns = [c for c in X_scaled.columns if c not in top_numeric]

# Polynomial / interaction terms up to DEGREE over the selected columns.
poly = PolynomialFeatures(degree=DEGREE)
X_poly = pd.DataFrame(poly.fit_transform(X_scaled[top_numeric]), columns=poly.get_feature_names_out(top_numeric))

# Calendar features derived from ApplicationDate.
dates = split_date(df)

# One-hot encode the categorical columns, dropping the first level of each.
X_cat_dummies = pd.get_dummies(X_categorial, drop_first=True)
# X_cat_dummies and dates still carry the filtered frame's index, while X_scaled
# and X_poly are 0..n-1; resetting every index makes concat line them up by position.
X_final = pd.concat(
    [X_scaled[other_columns].reset_index(drop=True),
     X_poly.reset_index(drop=True),
     X_cat_dummies.reset_index(drop=True),
     dates.reset_index(drop=True)],
    axis=1)
print(f"Frame: {X_final.shape[1]} features (other columns: {len(other_columns)}, polynomial on {TOP_K} cols, {DEGREE} degree)")

# Hold out 20% of the rows for validation. The split is seeded so the metrics
# printed below are the same on every Restart & Run All.
X_train, X_validation, y_train, y_validation = train_test_split(
    X_final, y, train_size=0.8, random_state=42)

ridge = Ridge(alpha=ALPHA, random_state=42)
ridge.fit(X_train, y_train)

# In-sample and held-out predictions.
train_pred = ridge.predict(X_train)
# Historical name for the in-sample predictions; kept so any cell that still
# reads `test_pred` keeps working.
test_pred = train_pred
validation_pred = ridge.predict(X_validation)


# Training-set metrics.
train_mse = mean_squared_error(y_train, train_pred)
train_rmse = train_mse ** 0.5
train_r2 = r2_score(y_train, train_pred)
train_mae = mean_absolute_error(y_train, train_pred)
train_mape = mean_absolute_percentage_error(y_train, train_pred)
# Validation-set metrics.
validation_mse = mean_squared_error(y_validation, validation_pred)
validation_rmse = validation_mse ** 0.5
validation_r2 = r2_score(y_validation, validation_pred)
validation_mae = mean_absolute_error(y_validation, validation_pred)
validation_mape = mean_absolute_percentage_error(y_validation, validation_pred)

print(f"TRAIN MSE: {train_mse:.4f} | RMSE: {train_rmse:.4f} | R^2: {train_r2:.4f} | MAE: {train_mae:.4f} | MAPE: {train_mape:.4f}")
print(f"VALIDATION MSE: {validation_mse:.4f} | RMSE: {validation_rmse:.4f} | R^2: {validation_r2:.4f} | MAE: {validation_mae:.4f} | MAPE: {validation_mape:.4f}")


Design matrix: 535 features (base rest: 20, poly on 8 cols, degree 4)
TRAIN MSE: 22.6472 | RMSE: 4.7589 | R^2: 0.9237 | MAE: 3.5150 | MAPE: 0.0815
VALIDATION MSE: 29.4024 | RMSE: 5.4224 | R^2: 0.9011 | MAE: 3.7732 | MAPE: 0.0866


In [11]:
from typing import Any


def split_date(df):
    """Break the ``ApplicationDate`` column into calendar components.

    Values that cannot be parsed as dates are coerced to NaT, which yields
    missing values in every output column for that row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``ApplicationDate`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``year``, ``quarter``, ``month`` and ``dow`` (day of week,
        Monday = 0), indexed like ``df``.
    """
    parsed = pd.to_datetime(df['ApplicationDate'], errors='coerce')
    # Output column name -> attribute of the ``.dt`` accessor that produces it.
    component_of = (
        ('year', 'year'),
        ('quarter', 'quarter'),
        ('month', 'month'),
        ('dow', 'dayofweek'),
    )
    parts = {column: getattr(parsed.dt, attribute) for column, attribute in component_of}
    return pd.DataFrame(parts, index=df.index)


# Categorical columns to one-hot encode. fit_model and predict_data read this
# list as a module-level global, so it must be defined before either is called.
categorial_feature = ['MaritalStatus', 'HomeOwnershipStatus', 'LoanPurpose', 'EmploymentStatus', 'EducationLevel']


def fit_model(train_path, abs_clip_outliers, top_k, degree, alpha) -> dict[str, Any]:
    """Fit the preprocessing steps and a Ridge regressor on a training CSV.

    The pipeline is: drop rows with a missing or out-of-range ``RiskScore``,
    winsorise numeric columns at their 1st/99th percentiles, median-impute,
    standardise, expand the ``top_k`` numeric columns most correlated with the
    target into polynomial features, then append one-hot encoded categoricals
    and calendar features from ``ApplicationDate``.

    Parameters
    ----------
    train_path : str or path-like
        CSV file containing ``RiskScore``, ``ApplicationDate`` and the columns
        listed in ``categorial_feature``.
    abs_clip_outliers : float
        Rows with ``|RiskScore|`` above this value are discarded.
    top_k : int
        Number of numeric columns that receive a polynomial expansion.
    degree : int
        Degree passed to ``PolynomialFeatures``.
    alpha : float
        Regularisation strength passed to ``Ridge``.

    Returns
    -------
    dict[str, Any]
        Fitted artefacts: ``numeric_columns``, ``other_columns``,
        ``top_numeric``, ``quantile_min``, ``quantile_max``, ``scaler``,
        ``imputer``, ``poly``, ``model``, plus ``feature_columns`` - the exact
        ordered column names of the matrix the model was fitted on, recorded so
        that inference code can rebuild the same layout.
    """
    df = pd.read_csv(train_path)

    # Discard rows whose target is missing or outside the accepted range.
    df = df[~df["RiskScore"].isna()].copy()
    df = df[df["RiskScore"].abs() <= abs_clip_outliers].copy()

    X_num = df.select_dtypes(include=[np.number]).drop(columns=["RiskScore"], errors="ignore")
    X_categorial = df[categorial_feature].copy()
    y = df["RiskScore"].astype(float).values

    # Winsorise each numeric column to its own 1st / 99th percentile.
    quantile_min = X_num.quantile(0.01)
    quantile_max = X_num.quantile(0.99)
    X_wins = X_num.clip(lower=quantile_min, upper=quantile_max, axis=1)

    # Re-wrapping the transformed arrays gives these frames a fresh 0..n-1 index.
    imp = SimpleImputer(strategy="median").fit(X_wins)
    X_imp = pd.DataFrame(imp.transform(X_wins), columns=X_wins.columns)

    scaler = StandardScaler().fit(X_imp)
    X_scaled = pd.DataFrame(scaler.transform(X_imp), columns=X_imp.columns)

    # Pick the top_k numeric columns by absolute correlation with the target.
    corr = X_scaled.assign(RiskScore=y).corr()["RiskScore"].drop("RiskScore").abs().sort_values(ascending=False)
    top_numeric = corr.head(top_k).index.tolist()
    other_columns = [c for c in X_scaled.columns if c not in top_numeric]

    poly = PolynomialFeatures(degree=degree).fit(X_scaled[top_numeric])
    X_poly = pd.DataFrame(poly.transform(X_scaled[top_numeric]),
                          columns=poly.get_feature_names_out(top_numeric))

    dates = split_date(df)
    X_cat_dummies = pd.get_dummies(X_categorial, drop_first=True)
    # The dummies and dates keep the filtered frame's index; reset everything
    # so the pieces are concatenated by row position.
    X_final = pd.concat([
        X_scaled[other_columns].reset_index(drop=True),
        X_poly.reset_index(drop=True),
        X_cat_dummies.reset_index(drop=True),
        dates.reset_index(drop=True)],
        axis=1)
    model = Ridge(alpha=alpha, random_state=42).fit(X_final, y)

    model_fit_data = {
        "numeric_columns": X_num.columns.tolist(), "other_columns": other_columns, "top_numeric": top_numeric,
        "quantile_min": quantile_min, "quantile_max": quantile_max, "scaler": scaler, "imputer": imp, "poly": poly,
        "model": model,
        # The one-hot columns depend on which categories occur in the training
        # file, so the final layout is stored explicitly.
        "feature_columns": X_final.columns.tolist(),
    }
    return model_fit_data

In [12]:
def predict_data(test_path, model_fit_data, out_path="predictions.csv"):
    """Score a CSV with a fitted pipeline and write the predictions to disk.

    The one-hot encoded categoricals are aligned to the columns the model was
    fitted on: a category absent from the scoring file becomes an all-zero
    column, and a category that was not a fitted column (the dropped baseline
    level or an unseen value) is discarded. Without this alignment, a scoring
    file whose category set differs from the training file would produce a
    matrix with different columns than the model expects.

    Parameters
    ----------
    test_path : str or path-like
        CSV file containing ``ID``, ``ApplicationDate`` and the columns listed
        in ``categorial_feature``. Numeric training columns missing from the
        file are treated as all-missing and imputed.
    model_fit_data : dict
        Artefacts returned by ``fit_model``. If it has a ``feature_columns``
        entry that is used as the target layout; otherwise the model's
        ``feature_names_in_`` attribute is used when available.
    out_path : str or path-like, default "predictions.csv"
        Destination CSV for the predictions.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``ID`` and ``prediction``; the same frame that is written
        to ``out_path``.
    """
    df = pd.read_csv(test_path)

    numeric_columns = model_fit_data["numeric_columns"]
    other_columns = model_fit_data["other_columns"]
    top_numeric = model_fit_data["top_numeric"]
    poly = model_fit_data["poly"]
    model = model_fit_data["model"]

    # Apply the fitted numeric preprocessing: clip -> impute -> scale.
    X_num = df.reindex(columns=numeric_columns)
    X_wins = X_num.clip(lower=model_fit_data["quantile_min"], upper=model_fit_data["quantile_max"], axis=1)
    X_imputer = pd.DataFrame(model_fit_data["imputer"].transform(X_wins), columns=numeric_columns)
    X_scaled = pd.DataFrame(model_fit_data["scaler"].transform(X_imputer), columns=numeric_columns)

    X_poly = pd.DataFrame(
        poly.transform(X_scaled[top_numeric]),
        columns=poly.get_feature_names_out(top_numeric)
    )
    dates = split_date(df)
    X_categorial = df[categorial_feature]

    def assemble(cat_dummies):
        # Same block order as in fit_model, joined by row position.
        return pd.concat([X_scaled[other_columns].reset_index(drop=True), X_poly.reset_index(drop=True),
                          cat_dummies.reset_index(drop=True), dates.reset_index(drop=True)], axis=1)

    # Column layout the model was fitted on, if it can be determined.
    expected_columns = model_fit_data.get("feature_columns")
    if expected_columns is None:
        expected_columns = getattr(model, "feature_names_in_", None)

    # Encode every level first; reindexing to the fitted layout then drops the
    # baseline/unseen levels and zero-fills levels missing from this file.
    X_final = assemble(pd.get_dummies(X_categorial, drop_first=False))
    if expected_columns is not None and X_final.columns.is_unique:
        X_final = X_final.reindex(columns=list(expected_columns), fill_value=0)
    else:
        # No reference layout available (or labels are ambiguous): fall back
        # to encoding the scoring file on its own.
        X_final = assemble(pd.get_dummies(X_categorial, drop_first=True))

    prediction = model.predict(X_final)

    output = pd.DataFrame({"prediction": prediction})
    output.insert(0, "ID", df["ID"].values)
    output.to_csv(out_path, index=False)
    return output

In [13]:
# Refit on the complete training file and score the test file, reusing the
# configuration constants defined in the earlier cells instead of repeating
# their literal values.
# NOTE(review): alpha=5.0 differs from ALPHA = 1.1 used for the validation run
# above - confirm which value is intended for the final model.
model_fit_data = fit_model(DATA_PATH, abs_clip_outliers=ABS_CLIP_OUTLIERS, top_k=TOP_K, degree=DEGREE, alpha=5.0)
sub = predict_data("test.csv", model_fit_data, out_path="predictions.csv")
print(sub.head())


   ID  prediction
0   0   33.176446
1   1   53.104048
2   2   29.658294
3   3   35.943292
4   4   33.248941
