In [1]:
import mlflow.xgboost
import mlflow
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_absolute_error, make_scorer
import mlflow.sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMRegressor
from mlflow import lightgbm

In [2]:
final_features_regression = [

    # numeric 
    'AMT_INCOME_TOTAL',
    'DAYS_BIRTH',
    'DAYS_REGISTRATION',
    'CNT_FAM_MEMBERS',
    'REGION_RATING_CLIENT',
    'REGION_POPULATION_RELATIVE',

    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',

    'Credit_to_Income_Ratio',
    'Annuity_to_Income_Ratio',

    'pos_num_loans',
    'pos_mean_cnt_instalment',

    'avg_prev_amt_requested',
    'prev_num_approved',

    # categorical
    'NAME_CONTRACT_TYPE',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
    'ORGANIZATION_TYPE',

    # target
    'AMT_CREDIT'
]


In [3]:

df = pd.read_csv("../data/final/Regression_table.csv")


In [4]:
df.shape

(264909, 23)

In [5]:
df.sample(5)

Unnamed: 0,AMT_INCOME_TOTAL,DAYS_BIRTH,DAYS_REGISTRATION,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_POPULATION_RELATIVE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,Credit_to_Income_Ratio,...,avg_prev_amt_requested,prev_num_approved,NAME_CONTRACT_TYPE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,OCCUPATION_TYPE,ORGANIZATION_TYPE,AMT_CREDIT
252085,225000.0,-13160,-994.0,4.0,3,0.0105,0.505998,0.519481,0.528093,2.4224,...,109746.0,3,Cash loans,Working,Secondary / secondary special,Married,House / apartment,Unknown,Business Entity Type 2,545040.0
231093,135000.0,-14316,-491.0,3.0,2,0.00702,0.505998,0.610504,0.553165,5.99,...,44210.25,2,Cash loans,Commercial associate,Higher education,Married,House / apartment,Sales staff,Self-employed,808650.0
264163,360000.0,-14481,-2372.0,2.0,3,0.020713,0.505998,0.694705,0.307737,3.593062,...,396887.5,8,Cash loans,Working,Incomplete higher,Civil marriage,House / apartment,Managers,Construction,1293502.5
73316,76500.0,-13563,-7691.0,4.0,2,0.018634,0.214877,0.538413,0.750375,0.817706,...,35339.4,4,Cash loans,Working,Secondary / secondary special,Married,House / apartment,Unknown,Business Entity Type 3,62554.5
137588,49500.0,-23871,-15078.0,1.0,3,0.020713,0.505998,0.676567,0.723837,5.327,...,106805.4,0,Cash loans,Pensioner,Secondary / secondary special,Widow,House / apartment,Unknown,XNA,263686.5


In [6]:
df['DAYS_BIRTH'] = abs(df['DAYS_BIRTH']) / 365
df['DAYS_REGISTRATION'] = abs(df['DAYS_REGISTRATION']) / 365

In [7]:
df[['Credit_to_Income_Ratio', 'Annuity_to_Income_Ratio']] = df[['Credit_to_Income_Ratio', 'Annuity_to_Income_Ratio']].clip(
    upper=df[['Credit_to_Income_Ratio', 'Annuity_to_Income_Ratio']].quantile(0.99),
    axis=1
)

Linear Regression

In [8]:
num_cols = [
    'AMT_INCOME_TOTAL',
    'DAYS_BIRTH',
    'DAYS_REGISTRATION',
    'CNT_FAM_MEMBERS',
    'REGION_RATING_CLIENT',
    'REGION_POPULATION_RELATIVE',

    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',

    'Credit_to_Income_Ratio',
    'Annuity_to_Income_Ratio',

    'pos_num_loans',
    'pos_mean_cnt_instalment',

    'avg_prev_amt_requested',
    'prev_num_approved',

]

cat_cols = [
    "NAME_CONTRACT_TYPE",
    "NAME_INCOME_TYPE",
    "NAME_EDUCATION_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE",
    "OCCUPATION_TYPE",
    "ORGANIZATION_TYPE",
]


In [None]:
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(
        handle_unknown="ignore",
        drop=None,
        sparse_output=False
    ))
])

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols)
    ]
)


In [None]:
model1 = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression())
])


In [None]:
X = df[num_cols + cat_cols]
y = df["AMT_CREDIT"] 

In [None]:
y=np.log1p(y)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [15]:
mlflow.set_experiment("loan_amount_regression")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
with mlflow.start_run():
    
    model1.fit(X_train, y_train)

    preds = model1.predict(X_val)

    mae = np.mean(np.abs(np.expm1(preds) - np.expm1(y_val)))
    rmse = np.sqrt(np.mean((np.expm1(preds) - np.expm1(y_val)) ** 2))
    mlflow.log_metric("RMSE", rmse)


    mlflow.log_metric("MAE", mae)
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("features_count", X.shape[1])

    # Save model
    mlflow.sklearn.log_model(
        model1,
        artifact_path="model"
    )

    print("MAE:", mae)
    print("RMSE:", rmse)

2025/12/17 17:39:39 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/17 17:39:39 INFO mlflow.store.db.utils: Updating database tables
2025/12/17 17:39:39 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/17 17:39:39 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/17 17:39:39 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/17 17:39:39 INFO alembic.runtime.migration: Will assume non-transactional DDL.


MAE: 204589.6949766643
RMSE: 324117.5661327247
üèÉ View run charming-fawn-229 at: http://127.0.0.1:5000/#/experiments/1/runs/786fe622ca594c7b9c0763997ded7d00
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


Linear Regression with PCA

In [16]:
model2 = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("pca", PCA(n_components=15)),
    ("regressor", LinearRegression())
])


In [17]:
mlflow.set_experiment("loan_amount_regression")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
with mlflow.start_run():
    
    model2.fit(X_train, y_train)

    preds = model2.predict(X_val)

    mae = np.mean(np.abs(np.expm1(preds) - np.expm1(y_val)))
    rmse = np.sqrt(np.mean((np.expm1(preds) - np.expm1(y_val)) ** 2))
    mlflow.log_metric("RMSE", rmse)


    mlflow.log_metric("MAE", mae)
    mlflow.log_param("model_type", "LinearRegression_PCA")
    mlflow.log_param("features_count", X.shape[1])

    # Save model
    mlflow.sklearn.log_model(
        model2,
        artifact_path="model"
    )

    print("MAE:", mae)
    print("RMSE:", rmse)




MAE: 234612.91131174675
RMSE: 344981.17693491455
üèÉ View run rogue-moth-948 at: http://127.0.0.1:5000/#/experiments/1/runs/4e5be9e93e484539bae0129bb10a038e
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


XGBRegressor

In [9]:
df_model = df[final_features_regression].copy()

for col in cat_cols:
    df_model[col] = df_model[col].fillna("Unknown")
    df_model[col] = LabelEncoder().fit_transform(df_model[col])

In [10]:
X = df_model[num_cols + cat_cols]
y = df["AMT_CREDIT"] 
y=np.log1p(y)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [20]:
model3= XGBRegressor(
    n_estimators=500,
    max_depth=8,
    learning_rate=0.06,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    eval_metric="mae",
    random_state=42,
    n_jobs=-1)

In [21]:
mlflow.set_experiment("loan_amount_regression")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
with mlflow.start_run():
    model3.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
)

    preds = model3.predict(X_val)

    mae = np.mean(np.abs(np.expm1(preds) - np.expm1(y_val)))
    rmse = np.sqrt(np.mean((np.expm1(preds) - np.expm1(y_val)) ** 2))
    mlflow.log_metric("RMSE", rmse)


    mlflow.log_metric("MAE", mae)
    mlflow.log_param("model_type", "XGBRegressor")
    # Log XGBoost parameters
    for param_name, param_value in model3.get_params().items():
        if param_value is not None:
            mlflow.log_param(param_name, param_value)

    mlflow.log_param("features_count", X.shape[1])

    # Save model
    mlflow.xgboost.log_model(
        model3,
        artifact_path="model"
    )

    print("MAE:", mae)
    print("RMSE:", rmse)


[0]	validation_0-mae:0.54952
[1]	validation_0-mae:0.51686
[2]	validation_0-mae:0.48616
[3]	validation_0-mae:0.45728
[4]	validation_0-mae:0.43007
[5]	validation_0-mae:0.40449
[6]	validation_0-mae:0.38046
[7]	validation_0-mae:0.36509
[8]	validation_0-mae:0.34936
[9]	validation_0-mae:0.32864
[10]	validation_0-mae:0.30919
[11]	validation_0-mae:0.29655
[12]	validation_0-mae:0.27901
[13]	validation_0-mae:0.26252
[14]	validation_0-mae:0.25214
[15]	validation_0-mae:0.24255
[16]	validation_0-mae:0.22826
[17]	validation_0-mae:0.21838
[18]	validation_0-mae:0.20931
[19]	validation_0-mae:0.19706
[20]	validation_0-mae:0.19389
[21]	validation_0-mae:0.18259
[22]	validation_0-mae:0.17196
[23]	validation_0-mae:0.16200
[24]	validation_0-mae:0.15663
[25]	validation_0-mae:0.14761
[26]	validation_0-mae:0.14221
[27]	validation_0-mae:0.14042
[28]	validation_0-mae:0.13241
[29]	validation_0-mae:0.12487
[30]	validation_0-mae:0.11780
[31]	validation_0-mae:0.11117
[32]	validation_0-mae:0.10496
[33]	validation_0-ma



MAE: 7496.542471002168
RMSE: 26640.564641427678
üèÉ View run dashing-rat-327 at: http://127.0.0.1:5000/#/experiments/1/runs/7cfb480a3b474870878bb4be3df11e32
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


In [22]:
print(np.min(y_val), np.max(y_val))
print(np.min(preds), np.max(preds))

10.714439990727769 15.214227685996272
10.707207 15.124169


LightGBM

In [19]:
model4 = LGBMRegressor(
    objective="regression",      
    metric="mae",
    learning_rate=0.06,
    num_leaves=128,
    max_depth=-1,
    n_estimators=800,
    min_child_samples=30,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
)

In [20]:
mlflow.set_experiment("loan_amount_regression")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
with mlflow.start_run():
    model4.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)]
)

    preds = model4.predict(X_val)

    mae = np.mean(np.abs(np.expm1(preds) - np.expm1(y_val)))
    rmse = np.sqrt(np.mean((np.expm1(preds) - np.expm1(y_val)) ** 2))
    mlflow.log_metric("RMSE", rmse)


    mlflow.log_metric("MAE", mae)
    mlflow.log_param("model_type", "LIGHTGBMRegressor")
    for param_name, param_value in model4.get_params().items():
        if param_value is not None:
            mlflow.log_param(param_name, param_value)

    mlflow.log_param("features_count", X.shape[1])

    mlflow.lightgbm.log_model(
        model4,
        artifact_path="model"
    )

    print("MAE:", mae)
    print("RMSE:", rmse)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008244 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2743
[LightGBM] [Info] Number of data points in the train set: 198681, number of used features: 22
[LightGBM] [Info] Start training from score 13.085030




MAE: 5934.77426060818
RMSE: 18441.845748889944
üèÉ View run blushing-snake-194 at: http://127.0.0.1:5000/#/experiments/1/runs/0b042f8ec6c24e32a86b9874ef11574f
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


GridSearchCV on LightGBM

In [11]:
def real_mae(y_true_log, y_pred_log):
    y_true = np.expm1(y_true_log)
    y_pred = np.expm1(y_pred_log)
    return mean_absolute_error(y_true, y_pred)

mae_scorer = make_scorer(real_mae, greater_is_better=False)

In [12]:
lgb_model = LGBMRegressor(
    objective="regression",
    random_state=42,
    n_jobs=-1
)

In [13]:
param_grid = {
    "learning_rate": [0.03, 0.05],
    "num_leaves": [31, 63],
    "max_depth": [-1, 6],
    "n_estimators": [500, 800],
    "min_child_samples": [20, 40],
    "subsample": [0.8],
    "colsample_bytree": [0.8]
}

In [14]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [15]:
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring=mae_scorer,
    cv=cv,
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X, y) 

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029685 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2732
[LightGBM] [Info] Number of data points in the train set: 264909, number of used features: 22
[LightGBM] [Info] Start training from score 13.085680


0,1,2
,estimator,LGBMRegressor...ndom_state=42)
,param_grid,"{'colsample_bytree': [0.8], 'learning_rate': [0.03, 0.05], 'max_depth': [-1, 6], 'min_child_samples': [20, 40], ...}"
,scoring,make_scorer(r...hod='predict')
,n_jobs,-1
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,boosting_type,'gbdt'
,num_leaves,63
,max_depth,-1
,learning_rate,0.05
,n_estimators,800
,subsample_for_bin,200000
,objective,'regression'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [16]:
best_model = grid_search.best_estimator_
para=best_model.get_params()
best_model.fit(X, y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013222 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2732
[LightGBM] [Info] Number of data points in the train set: 264909, number of used features: 22
[LightGBM] [Info] Start training from score 13.085680


0,1,2
,boosting_type,'gbdt'
,num_leaves,63
,max_depth,-1
,learning_rate,0.05
,n_estimators,800
,subsample_for_bin,200000
,objective,'regression'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [17]:
model5 = LGBMRegressor(
        **para
    )

In [18]:
mlflow.set_experiment("loan_amount_regression")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
with mlflow.start_run():
    model5.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)]
)

    preds = model5.predict(X_val)

    mae = np.mean(np.abs(np.expm1(preds) - np.expm1(y_val)))
    rmse = np.sqrt(np.mean((np.expm1(preds) - np.expm1(y_val)) ** 2))
    mlflow.log_metric("RMSE", rmse)


    mlflow.log_metric("MAE", mae)
    mlflow.log_param("model_type", "LIGHTGBMRegressor_best")
    for param_name, param_value in model5.get_params().items():
        if param_value is not None:
            mlflow.log_param(param_name, param_value)

    mlflow.log_param("features_count", X.shape[1])

    mlflow.lightgbm.log_model(
        model5,
        artifact_path="model"
    )

    print("MAE:", mae)
    print("RMSE:", rmse)


2025/12/17 18:09:07 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/17 18:09:07 INFO mlflow.store.db.utils: Updating database tables
2025/12/17 18:09:07 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/17 18:09:07 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/17 18:09:07 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/17 18:09:07 INFO alembic.runtime.migration: Will assume non-transactional DDL.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008223 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2743
[LightGBM] [Info] Number of data points in the train set: 198681, number of used features: 22
[LightGBM] [Info] Start training from score 13.085030




MAE: 6212.302837843538
RMSE: 18939.651184803264
üèÉ View run stately-slug-408 at: http://127.0.0.1:5000/#/experiments/1/runs/87c61d39c7ba4faa820b6ffc4a7b5d14
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


Cross Validation

In [21]:
kf = KFold(n_splits=6, shuffle=True, random_state=42)
mae_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    model = LGBMRegressor(
        **para
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="mae"
    )

    y_pred = model.predict(X_val)
    fold_mae = mean_absolute_error(
        np.expm1(y_val), np.expm1(y_pred)
    )

    mae_scores.append(fold_mae)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007970 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2739
[LightGBM] [Info] Number of data points in the train set: 220757, number of used features: 22
[LightGBM] [Info] Start training from score 13.084887
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013894 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2741
[LightGBM] [Info] Number of data points in the train set: 220757, number of used features: 22
[LightGBM] [Info] Start training from score 13.085869
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014195 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2740
[LightGBM] [Info] Number of data points in the train

In [22]:
for fold, score in enumerate(mae_scores):
    print(f"Fold {fold+1} MAE: {score:.2f}")
print(f"Average MAE: {np.mean(mae_scores):.2f}")
print(f"Standard Deviation of MAE: {np.std(mae_scores)}")

Fold 1 MAE: 6233.87
Fold 2 MAE: 6237.17
Fold 3 MAE: 6175.65
Fold 4 MAE: 6096.15
Fold 5 MAE: 6322.72
Fold 6 MAE: 6068.00
Average MAE: 6188.93
Standard Deviation of MAE: 87.23147708003332


In [23]:
kf = KFold(n_splits=6, shuffle=True, random_state=42)
mae_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]


    model4.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="mae"
    )

    y_pred = model4.predict(X_val)
    fold_mae = mean_absolute_error(
        np.expm1(y_val), np.expm1(y_pred)
    )

    mae_scores.append(fold_mae)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012761 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2739
[LightGBM] [Info] Number of data points in the train set: 220757, number of used features: 22
[LightGBM] [Info] Start training from score 13.084887
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012174 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2741
[LightGBM] [Info] Number of data points in the train set: 220757, number of used features: 22
[LightGBM] [Info] Start training from score 13.085869
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015163 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2740
[LightGBM] [Info] Number of data points in the train set: 220757, number of used features: 22
[LightGBM] [Info] Star

In [24]:
for fold, score in enumerate(mae_scores):
    print(f"Fold {fold+1} MAE: {score:.2f}")
print(f"Average MAE: {np.mean(mae_scores):.2f}")
print(f"Standard Deviation of MAE: {np.std(mae_scores)}")

Fold 1 MAE: 5899.77
Fold 2 MAE: 5830.16
Fold 3 MAE: 5864.63
Fold 4 MAE: 5741.04
Fold 5 MAE: 5950.08
Fold 6 MAE: 5883.63
Average MAE: 5861.55
Standard Deviation of MAE: 64.94301951548468


In [27]:
# normal training
model4.fit(X_train, y_train)

preds = model4.predict(X_val)

mae_normal = np.mean(np.abs(np.expm1(preds) - np.expm1(y_val)))
print("Normal MAE:", mae_normal)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011916 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2744
[LightGBM] [Info] Number of data points in the train set: 220758, number of used features: 22
[LightGBM] [Info] Start training from score 13.086784
Normal MAE: 5883.633562598447


In [28]:
y_train_shuffled = y_train.sample(frac=1, random_state=42).reset_index(drop=True)

model4.fit(X_train, y_train_shuffled)

preds_shuffled = model4.predict(X_val)

mae_shuffled = np.mean(np.abs(np.expm1(preds_shuffled) - np.expm1(y_val)))
print("Shuffled-target MAE:", mae_shuffled)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006182 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2744
[LightGBM] [Info] Number of data points in the train set: 220758, number of used features: 22
[LightGBM] [Info] Start training from score 13.086784
Shuffled-target MAE: 311888.17923640774
