### We use a Ridge Regression model as the baseline for our problem

In [2]:
import numpy as np
import pandas as pd

# Load the pre-split train/test features and targets
X_train = pd.read_csv('../data_csv/X_train_final.csv')
X_test = pd.read_csv('../data_csv/X_test_final.csv')
y_train = pd.read_csv('../data_csv/y_train_final.csv')
y_test = pd.read_csv('../data_csv/y_test_final.csv')

# Fail fast on misaligned splits instead of hitting an error later in fit/predict/scoring
assert len(X_train) == len(y_train), "X_train and y_train have different row counts"
assert len(X_test) == len(y_test), "X_test and y_test have different row counts"
assert list(X_train.columns) == list(X_test.columns), "train/test feature columns differ"

# Check the size: (rows, columns) of each feature matrix
print(X_train.shape)
print(X_test.shape)

(227986, 51)
(56997, 51)


In [3]:
# Start to build the model
from sklearn.linear_model import Ridge

# Baseline estimator: ridge regression with penalty strength alpha = 1.0.
# fit() hands back the estimator itself, so construction and training are chained.
print("Start training: Loading....")
ridge_model = Ridge(alpha=1.0).fit(X_train, y_train)
print("Done!")

Start training: Loading....
Done!


In [4]:
# Evaluate the model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predictions for the held-out features
y_pred_log = ridge_model.predict(X_test)

# Back-transform targets and predictions with expm1(x) = exp(x) - 1.
# NOTE(review): presumably the target was log1p-transformed upstream — confirm.
y_test_original, y_pred_original = np.expm1(y_test), np.expm1(y_pred_log)

# Score on the back-transformed scale; every metric receives (actual, predicted)
actual_vs_predicted = (y_test_original, y_pred_original)
r2 = r2_score(*actual_vs_predicted)
mae = mean_absolute_error(*actual_vs_predicted)
rmse = np.sqrt(mean_squared_error(*actual_vs_predicted))

# Report the metrics, one per line
print(
    "--- Result of Ridge Regression ---",
    f"R-squared (R²): {r2:.4f}",
    f"MAE: ${mae:,.2f}",
    f"RMSE: ${rmse:,.2f}",
    sep="\n",
)

--- Result of Ridge Regression ---
R-squared (R²): 0.4983
MAE: $6,595.26
RMSE: $10,318.08


In [5]:
import joblib
import os

# Single source of truth for where the fitted model is stored
model_path = '../models/ridge_model.pkl'

# Create the parent directory ('../models') if needed; no error when it already exists
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Serialise the fitted model; as the cell's last expression, dump()'s return value is displayed
joblib.dump(ridge_model, model_path)

['../models/ridge_model.pkl']