In [32]:
import mlflow.xgboost
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder


In [33]:
final_features_regression = [

    # numeric 
    'AMT_INCOME_TOTAL',
    'DAYS_BIRTH',
    'DAYS_REGISTRATION',
    'CNT_FAM_MEMBERS',
    'REGION_RATING_CLIENT',
    'REGION_POPULATION_RELATIVE',

    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',

    'Credit_to_Income_Ratio',
    'Annuity_to_Income_Ratio',

    'pos_num_loans',
    'pos_mean_cnt_instalment',

    'avg_prev_amt_requested',
    'prev_num_approved',

    # categorical
    'NAME_CONTRACT_TYPE',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
    'ORGANIZATION_TYPE',

    # target
    'AMT_CREDIT'
]


In [34]:

df = pd.read_csv("../data/final/Regression_table2.csv")


In [35]:
df.shape

(82180, 23)

In [36]:
df.sample(5)

Unnamed: 0,AMT_INCOME_TOTAL,DAYS_BIRTH,DAYS_REGISTRATION,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_POPULATION_RELATIVE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,Credit_to_Income_Ratio,...,avg_prev_amt_requested,prev_num_approved,NAME_CONTRACT_TYPE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,OCCUPATION_TYPE,ORGANIZATION_TYPE,AMT_CREDIT
39786,12.32386,-17530,-7257.0,2.0,3,0.007305,0.748082,0.505578,0.434733,4.4752,...,11.528996,6,Cash loans,Working,Secondary / secondary special,Married,House / apartment,Sales staff,6,1006920.0
75968,12.32386,-18070,-4692.0,2.0,2,0.0228,0.869621,0.448555,0.387625,8.76032,...,12.425219,1,Cash loans,Working,Higher education,Married,House / apartment,Core staff,6,1971072.0
65759,12.506181,-10343,-151.0,3.0,2,0.010006,0.580868,0.52611,0.639708,4.25,...,11.776365,4,Cash loans,Working,Secondary / secondary special,Married,House / apartment,Laborers,6,1147500.0
1989,10.896758,-14253,-2299.0,3.0,3,0.0105,0.360296,0.556986,0.643026,5.235,...,10.959387,5,Cash loans,Working,Secondary / secondary special,Married,House / apartment,6,Business Entity Type 3,282690.0
70537,11.356283,-17959,-8454.0,2.0,3,0.020246,0.552549,0.060219,0.221335,3.673684,...,11.311963,3,Cash loans,Working,Secondary / secondary special,Married,House / apartment,6,6,314100.0


In [37]:
df['DAYS_BIRTH'] = abs(df['DAYS_BIRTH']) / 365
df['DAYS_REGISTRATION'] = abs(df['DAYS_REGISTRATION']) / 365

In [38]:
df[['Credit_to_Income_Ratio', 'Annuity_to_Income_Ratio']] = df[['Credit_to_Income_Ratio', 'Annuity_to_Income_Ratio']].clip(
    upper=df[['Credit_to_Income_Ratio', 'Annuity_to_Income_Ratio']].quantile(0.99),
    axis=1
)

Linear Regression

In [39]:
num_cols = [
    'AMT_INCOME_TOTAL',
    'DAYS_BIRTH',
    'DAYS_REGISTRATION',
    'CNT_FAM_MEMBERS',
    'REGION_RATING_CLIENT',
    'REGION_POPULATION_RELATIVE',

    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',

    'Credit_to_Income_Ratio',
    'Annuity_to_Income_Ratio',

    'pos_num_loans',
    'pos_mean_cnt_instalment',

    'avg_prev_amt_requested',
    'prev_num_approved',

]

cat_cols = [
    "NAME_CONTRACT_TYPE",
    "NAME_INCOME_TYPE",
    "NAME_EDUCATION_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE",
    "OCCUPATION_TYPE",
    "ORGANIZATION_TYPE",
]


In [40]:
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(
        handle_unknown="ignore",
        drop=None,
        sparse_output=False
    ))
])

In [41]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols)
    ]
)


In [42]:
model1 = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression())
])


In [43]:
X = df[num_cols + cat_cols]
y = df["AMT_CREDIT"] 

In [44]:
y=np.log1p(y)

In [45]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [46]:
mlflow.set_experiment("loan_amount_regression")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
with mlflow.start_run():
    
    model1.fit(X_train, y_train)

    preds = model1.predict(X_val)

    mae = np.mean(np.abs(np.expm1(preds) - np.expm1(y_val)))
    rmse = np.sqrt(np.mean((np.expm1(preds) - np.expm1(y_val)) ** 2))
    mlflow.log_metric("RMSE", rmse)


    mlflow.log_metric("MAE", mae)
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("features_count", X.shape[1])

    # Save model
    mlflow.sklearn.log_model(
        model1,
        artifact_path="model"
    )

    print("MAE:", mae)
    print("RMSE:", rmse)



MAE: 115984.61667845571
RMSE: 214325.1116039254
üèÉ View run aged-boar-292 at: http://127.0.0.1:5000/#/experiments/1/runs/4cc2a30fcf7d4ee48adf5c1aa4aff36b
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


Linear Regression with PCA

In [47]:
model2 = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("pca", PCA(n_components=15)),
    ("regressor", LinearRegression())
])


In [48]:
mlflow.set_experiment("loan_amount_regression")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
with mlflow.start_run():
    
    model2.fit(X_train, y_train)

    preds = model2.predict(X_val)

    mae = np.mean(np.abs(np.expm1(preds) - np.expm1(y_val)))
    rmse = np.sqrt(np.mean((np.expm1(preds) - np.expm1(y_val)) ** 2))
    mlflow.log_metric("RMSE", rmse)


    mlflow.log_metric("MAE", mae)
    mlflow.log_param("model_type", "LinearRegression_PCA")
    mlflow.log_param("features_count", X.shape[1])

    # Save model
    mlflow.sklearn.log_model(
        model2,
        artifact_path="model"
    )

    print("MAE:", mae)
    print("RMSE:", rmse)




MAE: 130363.0084942165
RMSE: 200789.04419097494
üèÉ View run calm-wren-286 at: http://127.0.0.1:5000/#/experiments/1/runs/5d7fe00916ff4b8cb92d1fdf44d1b2ea
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


XGBRegressor

In [49]:
df_model = df[final_features_regression].copy()

for col in cat_cols:
    df_model[col] = df_model[col].fillna("Unknown")
    df_model[col] = LabelEncoder().fit_transform(df_model[col])

In [50]:
X = df_model[num_cols + cat_cols]
y = df["AMT_CREDIT"] 
y=np.log1p(y)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [51]:
model3= XGBRegressor(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    eval_metric="rmse",
    random_state=42,
    n_jobs=-1)

In [57]:
mlflow.set_experiment("loan_amount_regression")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
with mlflow.start_run():
    model3.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
)

    preds = model3.predict(X_val)

    mae = np.mean(np.abs(np.expm1(preds) - np.expm1(y_val)))
    rmse = np.sqrt(np.mean((np.expm1(preds) - np.expm1(y_val)) ** 2))
    mlflow.log_metric("RMSE", rmse)


    mlflow.log_metric("MAE", mae)
    mlflow.log_param("model_type", "XGBRegressor")
    # Log XGBoost parameters
    for param_name, param_value in model3.get_params().items():
        if param_value is not None:
            mlflow.log_param(param_name, param_value)

    mlflow.log_param("features_count", X.shape[1])

    # Save model
    mlflow.xgboost.log_model(
        model3,
        artifact_path="model"
    )

    print("MAE:", mae)
    print("RMSE:", rmse)


[0]	validation_0-rmse:0.67972
[1]	validation_0-rmse:0.64710
[2]	validation_0-rmse:0.61613
[3]	validation_0-rmse:0.58674
[4]	validation_0-rmse:0.55868
[5]	validation_0-rmse:0.53200
[6]	validation_0-rmse:0.50678
[7]	validation_0-rmse:0.49042
[8]	validation_0-rmse:0.47366
[9]	validation_0-rmse:0.45117
[10]	validation_0-rmse:0.42972
[11]	validation_0-rmse:0.41569
[12]	validation_0-rmse:0.39594
[13]	validation_0-rmse:0.37737
[14]	validation_0-rmse:0.36554
[15]	validation_0-rmse:0.35449
[16]	validation_0-rmse:0.33802
[17]	validation_0-rmse:0.32645
[18]	validation_0-rmse:0.31560
[19]	validation_0-rmse:0.30086
[20]	validation_0-rmse:0.29690
[21]	validation_0-rmse:0.28310
[22]	validation_0-rmse:0.27008
[23]	validation_0-rmse:0.25769
[24]	validation_0-rmse:0.25074
[25]	validation_0-rmse:0.23924
[26]	validation_0-rmse:0.23245
[27]	validation_0-rmse:0.22992
[28]	validation_0-rmse:0.21955
[29]	validation_0-rmse:0.20971
[30]	validation_0-rmse:0.20038
[31]	validation_0-rmse:0.19151
[32]	validation_0-



MAE: 10280.509464742032
RMSE: 36243.88929075488
üèÉ View run dazzling-bear-175 at: http://127.0.0.1:5000/#/experiments/1/runs/c4029a759d264cc996b346edfef11101
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1
