In [1]:
import os
import pandas as pd
from dotenv import load_dotenv

# local helpers
import sys
sys.path.append(os.path.abspath(".."))
from src.modeling import (
    load_features, infer_feature_types, time_aware_split,
    try_baselines, residual_plots, linear_coeff_table
)

# Pull environment variables from the project's .env file into the process.
load_dotenv()

# Feature table produced in Stage 09.
# load_features() reads the FEATURES_DATA_CSV setting from .env.
df = load_features()

# Quick sanity check: dimensions first, then the first five rows as rich output.
print(df.shape)
df.head(5)


(17719, 25)


Unnamed: 0,ID number,Personnel Number,Basic Salary,Payment,AFFORDABILITY,LoanId,Regions,Gender,Age,Tenure,...,Instalment,PrincipalBalance,LoanStatus,InterestRate,LoanPurpose,RetirementDate,IsNPL,DebtToIncome,PrincipalPaidPct,HighSalaryFlag
0,ID_1,50403,7756.305,7910.8,3940.0,LN20242200000143311,Venus,Male,34,6.0,...,1713.56,7704.47,Active,33.0,Home improvement,2038-07-02,Not NPL,0.993317,0.0,1
1,ID_2,50422,7756.305,3657.8,0.0,LN20240240000119983,Gilmore,Female,46,60.0,...,2574.1,44656.34,Active,38.0,Education,2034-10-26,Not NPL,6.596029,0.127138,1
2,ID_3,5,7756.305,9580.26,6456.11575,LN20242270000144475,Venus,Female,41,60.0,...,1864.91,266.13,Settled,38.0,Debt consolidation,2059-08-24,Not NPL,4.647012,0.992616,1
3,ID_5,27,7756.305,9580.26,6456.11575,LN20232570000106355,Venus,Female,36,24.0,...,2113.57,3219.25,Active,35.0,Personal Use,2030-08-28,Not NPL,3.700892,0.887852,1
4,ID_6,29,7756.305,9580.26,2294.062,LN20243620000161297,Woodlawn,Male,32,60.0,...,3505.83,67360.79,Active,38.0,Building/Purchase home,2042-07-17,Not NPL,9.126391,0.048403,1


In [2]:
# Debug aid: show the kernel's working directory, since the relative paths
# used in this notebook ("..", "../data/processed") are resolved against it.
import os

print(f"CWD: {os.getcwd()}")


CWD: C:\Users\qochi\bootcamp_millicent_qochiwa\project\notebooks


In [3]:
# Name of the regression target column.
TARGET = "AFFORDABILITY"

# Keep only rows where the target is present; copy so that later writes
# do not touch a view of the originally loaded frame.
has_target = df[TARGET].notna()
df = df.loc[has_target].copy()

print(f"Rows after dropping missing target: {len(df)}")


Rows after dropping missing target: 17718


In [4]:
# Sanity-check the target column before modelling: its dtype and a sample of
# distinct values. Uses the TARGET constant (defined in the previous cell)
# rather than repeating the column name, so this check always inspects the
# column that is actually being modelled.
target_values = df[TARGET]
print(target_values.dtype)
print(target_values.unique()[:20])  # first 20 unique values


float64
[3940.         0.      6456.11575 2294.062   4965.988   4430.58
  914.658   5460.932     93.262   4101.862   2581.272   4278.348
 1232.64    1611.662   4842.358   6265.958   3031.748    200.456
 4697.078   1625.65   ]


In [5]:
# Hold out 20% of the rows as the test set, splitting on LoanDate.
train_df, test_df = time_aware_split(df, date_col="LoanDate", test_size=0.2)

# Infer feature types from the training frame only; `dropped` lists the
# columns excluded by name pattern.
num_cols, cat_cols, dropped = infer_feature_types(train_df, target=TARGET)
print("Numeric:", num_cols[:10], "...")
print("Categorical:", cat_cols[:10], "...")
print("Dropped by name pattern:", dropped)

# Build the model inputs: numeric columns first, then categorical ones.
feature_cols = num_cols + cat_cols
X_train, y_train = train_df[feature_cols], train_df[TARGET]
X_test, y_test = test_df[feature_cols], test_df[TARGET]


Numeric: ['Basic Salary', 'Payment', 'Age', 'Tenure', 'LoanAmount', 'DisbursementAmount', 'Instalment', 'PrincipalBalance', 'InterestRate', 'DebtToIncome'] ...
Categorical: ['Regions', 'Gender', 'LoanStatus', 'LoanPurpose', 'IsNPL'] ...
Dropped by name pattern: ['ID number', 'Personnel Number', 'LoanId', 'LoanDate', 'DisbursementDate', 'LastPaymentDate', 'RetirementDate', 'PrincipalPaidPct']


In [6]:
# Fit the baseline models and score them on the held-out test set.
# `metrics_df` holds one row per model (MAE / RMSE / R2); `fitted` is indexed
# by model name.
metrics_df, fitted = try_baselines(
    X_train, y_train,
    X_test, y_test,
    num_cols, cat_cols,
)

# Show the comparison table as rich output.
metrics_df


Unnamed: 0,model,MAE,RMSE,R2
3,RandomForest,108.479435,315.62818,0.971442
0,Linear,172.26559,360.657134,0.962712
1,RidgeCV,172.568904,360.769671,0.962689
2,LassoCV,171.466669,361.057903,0.962629


In [9]:
# NOTE(review): every line in this cell is commented out, so running it
# defines nothing -- in particular `best_name`, `pipe` and `yhat` are NOT
# created here. Any cell that uses those names must define them itself or
# this cell must be re-enabled; otherwise the notebook only works with stale
# kernel state and fails on Restart & Run All.
# NOTE(review): if re-enabled, `coef_df.head(25)` sits inside an `if` block
# and would not render as cell output -- wrap it in display(...) or move it
# to the last line of its own cell.

# # Choose the best (lowest RMSE) or a linear model for interpretability
# best_name = metrics_df.iloc[0]["model"]
# pipe, yhat = fitted[best_name]
# print("Chosen model:", best_name)

# # Diagnostics
# residual_plots(y_test.values, yhat, title_prefix=best_name)

# # Coefficients if linear-type
# if best_name in ("Linear", "RidgeCV", "LassoCV"):
#     coef_df = linear_coeff_table(pipe)
#     coef_df.head(25)


In [8]:
# Save predictions for inspection
pred_out = test_df[["LoanId"]].copy() if "LoanId" in test_df.columns else test_df.iloc[:, :1].copy()
pred_out["y_true"] = y_test.values
pred_out["y_pred"] = yhat
os.makedirs("../data/processed", exist_ok=True)
pred_path = "../data/processed/stage10a_predictions.csv"
pred_out.to_csv(pred_path, index=False)
pred_path


'../data/processed/stage10a_predictions.csv'