In [1]:
import pandas as pd
import os

# Load the raw training table. The path is relative, so it is resolved
# against the kernel's current working directory.
train_csv_path = 'data/train.csv'
df = pd.read_csv(train_csv_path)

In [2]:
from utils.index import prep_house_price_train_data
from sklearn.model_selection import train_test_split

# Run the project preprocessing on the raw frame. It returns the feature
# matrix, the transformed target, and a dict of fitted constants
# (its "y_info" entry is reused later when training and exporting).
prepared = prep_house_price_train_data(df)
X_final, y_scaled, train_constant = prepared

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    X_final,
    y_scaled,
    test_size=0.2,
    random_state=42,
)

Detected 15 skewed columns to fix.

FIXED [LotFrontage]: Positive Skew (2.20) -> Applied Log1p
FIXED [LotArea]: Positive Skew (12.21) -> Applied Log1p
FIXED [BsmtFinSF1]: Positive Skew (1.69) -> Applied Log1p
FIXED [BsmtFinSF2]: Positive Skew (4.26) -> Applied Log1p
FIXED [BsmtUnfSF]: Positive Skew (0.92) -> Applied Log1p
FIXED [TotalBsmtSF]: Positive Skew (1.52) -> Applied Log1p
FIXED [1stFlrSF]: Positive Skew (1.38) -> Applied Log1p
FIXED [2ndFlrSF]: Positive Skew (0.81) -> Applied Log1p
FIXED [GrLivArea]: Positive Skew (1.37) -> Applied Log1p
FIXED [WoodDeckSF]: Positive Skew (1.54) -> Applied Log1p
FIXED [OpenPorchSF]: Positive Skew (2.36) -> Applied Log1p
FIXED [EnclosedPorch]: Positive Skew (3.09) -> Applied Log1p
FIXED [3SsnPorch]: Positive Skew (10.30) -> Applied Log1p
FIXED [ScreenPorch]: Positive Skew (4.12) -> Applied Log1p
FIXED [PoolArea]: Positive Skew (14.83) -> Applied Log1p


In [3]:
import xgboost as xgb
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor

from utils.index import train

# Hyperparameters are collected per estimator so they can be read (and tuned)
# in one place; the values are unchanged.
en_params = {"alpha": 0.1, "l1_ratio": 0.5, "random_state": 42}

xgb_params = {
    "n_estimators": 1000,
    "learning_rate": 0.05,
    "max_depth": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "n_jobs": -1,
    "random_state": 42,
}

rf_params = {
    "n_estimators": 500,
    "max_depth": 10,
    "min_samples_split": 5,
    "max_features": "sqrt",
    "n_jobs": -1,
    "random_state": 42,
}

en_model = ElasticNet(**en_params)
xgb_model = xgb.XGBRegressor(**xgb_params)
rf_model = RandomForestRegressor(**rf_params)

# Every estimator is fitted and scored on the same split with the same
# target metadata, so the shared positional arguments are built once.
fit_args = (X_train, y_train, X_val, y_val, train_constant["y_info"])

# Order matters only for the printed reports: ElasticNet, RandomForest, XGBoost.
en_model = train(en_model, *fit_args)
rf_model = train(rf_model, *fit_args)
xgb_model = train(xgb_model, *fit_args)

--- ElasticNet Performance ---
R² Score: 0.8449
RMSE:     $34,488.50
MAE:      $19,611.41
RAE:      0.3168

--- RandomForestRegressor Performance ---
R² Score: 0.8296
RMSE:     $36,155.86
MAE:      $19,383.71
RAE:      0.3131

--- XGBRegressor Performance ---
R² Score: 0.9134
RMSE:     $25,770.64
MAE:      $15,279.00
RAE:      0.2468



In [4]:
from utils.index import prep_test_data, revert_label_data

# Load the unlabeled test table (relative path, resolved against the
# kernel's working directory).
test_csv_path = 'data/test.csv'
test_df = pd.read_csv(test_csv_path)

# Apply the project's test-time preprocessing using the constants produced
# during training preparation; it returns the Id column and the feature matrix.
prepared_test = prep_test_data(test_df, train_constant)
id_col, X_test_final = prepared_test

In [5]:
# Sanity check: list only the test columns that still contain missing values.
# An empty Series means the feature matrix has no NaNs left.
null_counts = X_test_final.isna().sum()
print(null_counts[null_counts > 0])

Series([], dtype: int64)


In [6]:
def export_submission(model, id_col, train_constant, X_test=None):
    """Predict on the test features and write a two-column submission CSV.

    The file is written to the current working directory as
    ``<ModelClassName>_submission.csv`` with columns ``Id`` and ``SalePrice``
    and no index column.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    id_col
        Identifiers written to the ``Id`` column; must align row-for-row
        with the feature matrix used for prediction.
    train_constant
        Mapping produced during training preparation; its ``"y_info"`` entry
        is forwarded to ``revert_label_data``.
    X_test : optional
        Feature matrix to predict on. When omitted, the notebook-level
        ``X_test_final`` is used so existing three-argument calls keep working.
        Passing it explicitly avoids depending on hidden kernel state.

    Returns
    -------
    None
    """
    import warnings

    import numpy as np

    # Prefer the explicit argument; fall back to the notebook global only
    # for backward compatibility with older calls.
    features = X_test_final if X_test is None else X_test

    preds_scaled = model.predict(features)
    # presumably maps the scaled predictions back to the original price
    # scale -- revert_label_data is defined in utils.index, verify there.
    final_prices = revert_label_data(preds_scaled, train_constant["y_info"])

    submission = pd.DataFrame({
        'Id': id_col,
        'SalePrice': final_prices,
    })

    model_name = type(model).__name__

    # NOTE(review): the notebook's captured output shows a warning raised from
    # np.expm1 during label reversion, so non-finite prices are plausible.
    # Surface them instead of silently writing them to the CSV.
    numeric_prices = pd.to_numeric(submission['SalePrice'], errors='coerce')
    bad_rows = int((~np.isfinite(numeric_prices)).sum())
    if bad_rows:
        warnings.warn(
            f'{model_name}: {bad_rows} non-finite SalePrice value(s) in submission',
            RuntimeWarning,
            stacklevel=2,
        )

    submission.to_csv(f'{model_name}_submission.csv', index=False)
    
# Write one submission file per fitted model, in the same order as training.
for fitted_model in (en_model, rf_model, xgb_model):
    export_submission(fitted_model, id_col, train_constant)

  y_reverted = np.expm1(y_reverted)
