In [70]:
import mlflow.xgboost
import mlflow
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_absolute_error, make_scorer
import mlflow.sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMRegressor
from mlflow import lightgbm

In [33]:
final_features_regression = [

    # numeric 
    'AMT_INCOME_TOTAL',
    'DAYS_BIRTH',
    'DAYS_REGISTRATION',
    'CNT_FAM_MEMBERS',
    'REGION_RATING_CLIENT',
    'REGION_POPULATION_RELATIVE',

    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',

    'Credit_to_Income_Ratio',
    'Annuity_to_Income_Ratio',

    'pos_num_loans',
    'pos_mean_cnt_instalment',

    'avg_prev_amt_requested',
    'prev_num_approved',

    # categorical
    'NAME_CONTRACT_TYPE',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
    'ORGANIZATION_TYPE',

    # target
    'AMT_CREDIT'
]


In [34]:

df = pd.read_csv("../data/final/Regression_table2.csv")


In [35]:
df.shape

(82180, 23)

In [36]:
df.sample(5)

Unnamed: 0,AMT_INCOME_TOTAL,DAYS_BIRTH,DAYS_REGISTRATION,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_POPULATION_RELATIVE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,Credit_to_Income_Ratio,...,avg_prev_amt_requested,prev_num_approved,NAME_CONTRACT_TYPE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,OCCUPATION_TYPE,ORGANIZATION_TYPE,AMT_CREDIT
39786,12.32386,-17530,-7257.0,2.0,3,0.007305,0.748082,0.505578,0.434733,4.4752,...,11.528996,6,Cash loans,Working,Secondary / secondary special,Married,House / apartment,Sales staff,6,1006920.0
75968,12.32386,-18070,-4692.0,2.0,2,0.0228,0.869621,0.448555,0.387625,8.76032,...,12.425219,1,Cash loans,Working,Higher education,Married,House / apartment,Core staff,6,1971072.0
65759,12.506181,-10343,-151.0,3.0,2,0.010006,0.580868,0.52611,0.639708,4.25,...,11.776365,4,Cash loans,Working,Secondary / secondary special,Married,House / apartment,Laborers,6,1147500.0
1989,10.896758,-14253,-2299.0,3.0,3,0.0105,0.360296,0.556986,0.643026,5.235,...,10.959387,5,Cash loans,Working,Secondary / secondary special,Married,House / apartment,6,Business Entity Type 3,282690.0
70537,11.356283,-17959,-8454.0,2.0,3,0.020246,0.552549,0.060219,0.221335,3.673684,...,11.311963,3,Cash loans,Working,Secondary / secondary special,Married,House / apartment,6,6,314100.0


In [37]:
df['DAYS_BIRTH'] = abs(df['DAYS_BIRTH']) / 365
df['DAYS_REGISTRATION'] = abs(df['DAYS_REGISTRATION']) / 365

In [38]:
df[['Credit_to_Income_Ratio', 'Annuity_to_Income_Ratio']] = df[['Credit_to_Income_Ratio', 'Annuity_to_Income_Ratio']].clip(
    upper=df[['Credit_to_Income_Ratio', 'Annuity_to_Income_Ratio']].quantile(0.99),
    axis=1
)

Linear Regression

In [39]:
num_cols = [
    'AMT_INCOME_TOTAL',
    'DAYS_BIRTH',
    'DAYS_REGISTRATION',
    'CNT_FAM_MEMBERS',
    'REGION_RATING_CLIENT',
    'REGION_POPULATION_RELATIVE',

    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',

    'Credit_to_Income_Ratio',
    'Annuity_to_Income_Ratio',

    'pos_num_loans',
    'pos_mean_cnt_instalment',

    'avg_prev_amt_requested',
    'prev_num_approved',

]

cat_cols = [
    "NAME_CONTRACT_TYPE",
    "NAME_INCOME_TYPE",
    "NAME_EDUCATION_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE",
    "OCCUPATION_TYPE",
    "ORGANIZATION_TYPE",
]


In [40]:
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(
        handle_unknown="ignore",
        drop=None,
        sparse_output=False
    ))
])

In [41]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols)
    ]
)


In [42]:
model1 = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression())
])


In [43]:
X = df[num_cols + cat_cols]
y = df["AMT_CREDIT"] 

In [44]:
y=np.log1p(y)

In [45]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [46]:
mlflow.set_experiment("loan_amount_regression")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
with mlflow.start_run():
    
    model1.fit(X_train, y_train)

    preds = model1.predict(X_val)

    mae = np.mean(np.abs(np.expm1(preds) - np.expm1(y_val)))
    rmse = np.sqrt(np.mean((np.expm1(preds) - np.expm1(y_val)) ** 2))
    mlflow.log_metric("RMSE", rmse)


    mlflow.log_metric("MAE", mae)
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("features_count", X.shape[1])

    # Save model
    mlflow.sklearn.log_model(
        model1,
        artifact_path="model"
    )

    print("MAE:", mae)
    print("RMSE:", rmse)



MAE: 115984.61667845571
RMSE: 214325.1116039254
üèÉ View run aged-boar-292 at: http://127.0.0.1:5000/#/experiments/1/runs/4cc2a30fcf7d4ee48adf5c1aa4aff36b
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


Linear Regression with PCA

In [47]:
model2 = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("pca", PCA(n_components=15)),
    ("regressor", LinearRegression())
])


In [48]:
mlflow.set_experiment("loan_amount_regression")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
with mlflow.start_run():
    
    model2.fit(X_train, y_train)

    preds = model2.predict(X_val)

    mae = np.mean(np.abs(np.expm1(preds) - np.expm1(y_val)))
    rmse = np.sqrt(np.mean((np.expm1(preds) - np.expm1(y_val)) ** 2))
    mlflow.log_metric("RMSE", rmse)


    mlflow.log_metric("MAE", mae)
    mlflow.log_param("model_type", "LinearRegression_PCA")
    mlflow.log_param("features_count", X.shape[1])

    # Save model
    mlflow.sklearn.log_model(
        model2,
        artifact_path="model"
    )

    print("MAE:", mae)
    print("RMSE:", rmse)




MAE: 130363.0084942165
RMSE: 200789.04419097494
üèÉ View run calm-wren-286 at: http://127.0.0.1:5000/#/experiments/1/runs/5d7fe00916ff4b8cb92d1fdf44d1b2ea
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


XGBRegressor

In [49]:
df_model = df[final_features_regression].copy()

for col in cat_cols:
    df_model[col] = df_model[col].fillna("Unknown")
    df_model[col] = LabelEncoder().fit_transform(df_model[col])

In [50]:
X = df_model[num_cols + cat_cols]
y = df["AMT_CREDIT"] 
y=np.log1p(y)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [58]:
model3= XGBRegressor(
    n_estimators=500,
    max_depth=8,
    learning_rate=0.06,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    eval_metric="mae",
    random_state=42,
    n_jobs=-1)

In [59]:
mlflow.set_experiment("loan_amount_regression")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
with mlflow.start_run():
    model3.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
)

    preds = model3.predict(X_val)

    mae = np.mean(np.abs(np.expm1(preds) - np.expm1(y_val)))
    rmse = np.sqrt(np.mean((np.expm1(preds) - np.expm1(y_val)) ** 2))
    mlflow.log_metric("RMSE", rmse)


    mlflow.log_metric("MAE", mae)
    mlflow.log_param("model_type", "XGBRegressor")
    # Log XGBoost parameters
    for param_name, param_value in model3.get_params().items():
        if param_value is not None:
            mlflow.log_param(param_name, param_value)

    mlflow.log_param("features_count", X.shape[1])

    # Save model
    mlflow.xgboost.log_model(
        model3,
        artifact_path="model"
    )

    print("MAE:", mae)
    print("RMSE:", rmse)


[0]	validation_0-mae:0.54979
[1]	validation_0-mae:0.51726
[2]	validation_0-mae:0.48666
[3]	validation_0-mae:0.45788
[4]	validation_0-mae:0.43084
[5]	validation_0-mae:0.40541
[6]	validation_0-mae:0.38147
[7]	validation_0-mae:0.36580
[8]	validation_0-mae:0.35031
[9]	validation_0-mae:0.32965
[10]	validation_0-mae:0.31027
[11]	validation_0-mae:0.29717
[12]	validation_0-mae:0.27975
[13]	validation_0-mae:0.26332
[14]	validation_0-mae:0.25262
[15]	validation_0-mae:0.24279
[16]	validation_0-mae:0.22861
[17]	validation_0-mae:0.21886
[18]	validation_0-mae:0.20984
[19]	validation_0-mae:0.19761
[20]	validation_0-mae:0.19429
[21]	validation_0-mae:0.18310
[22]	validation_0-mae:0.17255
[23]	validation_0-mae:0.16264
[24]	validation_0-mae:0.15714
[25]	validation_0-mae:0.14817
[26]	validation_0-mae:0.14274
[27]	validation_0-mae:0.14085
[28]	validation_0-mae:0.13290
[29]	validation_0-mae:0.12546
[30]	validation_0-mae:0.11846
[31]	validation_0-mae:0.11191
[32]	validation_0-mae:0.10574
[33]	validation_0-ma



MAE: 9813.352277964534
RMSE: 37146.14842554588
üèÉ View run sincere-bat-984 at: http://127.0.0.1:5000/#/experiments/1/runs/dfef3a8ca8ac41c0b154ad775e131537
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


In [60]:
print(np.min(y_val), np.max(y_val))
print(np.min(preds), np.max(preds))

10.714439990727769 15.214227685996272
10.758946 14.859621


LightGBM

In [68]:
model4 = LGBMRegressor(
    objective="regression",      
    metric="mae",
    learning_rate=0.06,
    num_leaves=128,
    max_depth=-1,
    n_estimators=800,
    min_child_samples=30,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
)

In [69]:
mlflow.set_experiment("loan_amount_regression")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
with mlflow.start_run():
    model4.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)]
)

    preds = model4.predict(X_val)

    mae = np.mean(np.abs(np.expm1(preds) - np.expm1(y_val)))
    rmse = np.sqrt(np.mean((np.expm1(preds) - np.expm1(y_val)) ** 2))
    mlflow.log_metric("RMSE", rmse)


    mlflow.log_metric("MAE", mae)
    mlflow.log_param("model_type", "LIGHTGBMRegressor")
    for param_name, param_value in model4.get_params().items():
        if param_value is not None:
            mlflow.log_param(param_name, param_value)

    mlflow.log_param("features_count", X.shape[1])

    mlflow.lightgbm.log_model(
        model4,
        artifact_path="model"
    )

    print("MAE:", mae)
    print("RMSE:", rmse)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003376 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2596
[LightGBM] [Info] Number of data points in the train set: 61635, number of used features: 22
[LightGBM] [Info] Start training from score 13.117135




MAE: 9544.631443930439
RMSE: 36970.71975955329
üèÉ View run enthused-snipe-314 at: http://127.0.0.1:5000/#/experiments/1/runs/c3c067bfd2bf49cf9e3e1de599a7eca6
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


GridSearchCV on LightGBM

In [71]:
def real_mae(y_true_log, y_pred_log):
    y_true = np.expm1(y_true_log)
    y_pred = np.expm1(y_pred_log)
    return mean_absolute_error(y_true, y_pred)

mae_scorer = make_scorer(real_mae, greater_is_better=False)

In [72]:
lgb_model = LGBMRegressor(
    objective="regression",
    random_state=42,
    n_jobs=-1
)

In [73]:
param_grid = {
    "learning_rate": [0.03, 0.05],
    "num_leaves": [31, 63],
    "max_depth": [-1, 6],
    "n_estimators": [500, 800],
    "min_child_samples": [20, 40],
    "subsample": [0.8],
    "colsample_bytree": [0.8]
}

In [74]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [75]:
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring=mae_scorer,
    cv=cv,
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X, y) 

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012097 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2599
[LightGBM] [Info] Number of data points in the train set: 82180, number of used features: 22
[LightGBM] [Info] Start training from score 13.116683


0,1,2
,estimator,LGBMRegressor...ndom_state=42)
,param_grid,"{'colsample_bytree': [0.8], 'learning_rate': [0.03, 0.05], 'max_depth': [-1, 6], 'min_child_samples': [20, 40], ...}"
,scoring,make_scorer(r...hod='predict')
,n_jobs,-1
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,boosting_type,'gbdt'
,num_leaves,63
,max_depth,-1
,learning_rate,0.05
,n_estimators,800
,subsample_for_bin,200000
,objective,'regression'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [78]:
best_model = grid_search.best_estimator_
para=best_model.get_params()
best_model.fit(X, y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003298 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2599
[LightGBM] [Info] Number of data points in the train set: 82180, number of used features: 22
[LightGBM] [Info] Start training from score 13.116683


0,1,2
,boosting_type,'gbdt'
,num_leaves,63
,max_depth,-1
,learning_rate,0.05
,n_estimators,800
,subsample_for_bin,200000
,objective,'regression'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [87]:
model5 = LGBMRegressor(
        **para
    )

In [88]:
mlflow.set_experiment("loan_amount_regression")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
with mlflow.start_run():
    model5.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)]
)

    preds = model5.predict(X_val)

    mae = np.mean(np.abs(np.expm1(preds) - np.expm1(y_val)))
    rmse = np.sqrt(np.mean((np.expm1(preds) - np.expm1(y_val)) ** 2))
    mlflow.log_metric("RMSE", rmse)


    mlflow.log_metric("MAE", mae)
    mlflow.log_param("model_type", "LIGHTGBMRegressor_best")
    for param_name, param_value in model5.get_params().items():
        if param_value is not None:
            mlflow.log_param(param_name, param_value)

    mlflow.log_param("features_count", X.shape[1])

    mlflow.lightgbm.log_model(
        model5,
        artifact_path="model"
    )

    print("MAE:", mae)
    print("RMSE:", rmse)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003701 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2596
[LightGBM] [Info] Number of data points in the train set: 61635, number of used features: 22
[LightGBM] [Info] Start training from score 13.117135




MAE: 9241.798908692803
RMSE: 36288.22384122572
üèÉ View run beautiful-sloth-870 at: http://127.0.0.1:5000/#/experiments/1/runs/15e3a965bdaa4abeb41b1d0397e05f85
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


Cross Validation

In [94]:
kf = KFold(n_splits=6, shuffle=True, random_state=42)
mae_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    model = LGBMRegressor(
        **para
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="mae"
    )

    y_pred = model.predict(X_val)
    fold_mae = mean_absolute_error(
        np.expm1(y_val), np.expm1(y_pred)
    )

    mae_scores.append(fold_mae)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003944 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2598
[LightGBM] [Info] Number of data points in the train set: 68483, number of used features: 22
[LightGBM] [Info] Start training from score 13.117201
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003410 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2591
[LightGBM] [Info] Number of data points in the train set: 68483, number of used features: 22
[LightGBM] [Info] Start training from score 13.115967
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004746 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total 

In [96]:
for fold, score in enumerate(mae_scores):
    print(f"Fold {fold+1} MAE: {score:.2f}")
print(f"Average MAE: {np.mean(mae_scores):.2f}")
print(f"Standard Deviation of MAE: {np.std(mae_scores)}")

Fold 1 MAE: 8981.92
Fold 2 MAE: 9508.09
Fold 3 MAE: 8858.72
Fold 4 MAE: 9452.00
Fold 5 MAE: 8825.31
Fold 6 MAE: 8681.77
Average MAE: 9051.30
Standard Deviation of MAE: 315.8966538663996
