### This notebook contains models trained on tabular features only

In [7]:
import pandas as pd

# Locations of the raw training and test tables, relative to this notebook.
train_csv = "../data/raw/train.csv"
test_csv = "../data/raw/test.csv"

# Read both tables into DataFrames.
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

# Show (rows, columns) of the training table.
print(train.shape)


(16209, 21)


In [9]:
# Column the models are fitted to predict.
target = "price"

# The label itself plus the "date" and "id" columns are excluded from the predictors.
drop_cols = [target, "date", "id"]

# Keep every remaining column, in the order it appears in `train`.
features = train.columns[~train.columns.isin(drop_cols)].tolist()

# Predictor matrix and label vector used by all models below.
X, y = train[features], train[target]




In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
from sklearn.linear_model import LinearRegression

# Linear regression with default settings, used as the baseline model.
model = LinearRegression()

# Fit on the training portion of the split.
model.fit(X=X_train, y=y_train)


In [11]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Predict on the held-out validation rows with the linear baseline.
y_pred = model.predict(X_val)

# RMSE is the square root of the mean squared error, so it is in the units of `y`.
baseline_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(baseline_mse)
r2 = r2_score(y_val, y_pred)

print(
    f"Baseline RMSE: {rmse:.2f}",
    f"Baseline R²  : {r2:.3f}",
    sep="\n",
)


Baseline RMSE: 191661.41
Baseline R²  : 0.707


In [13]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Re-score the linear baseline, this time also reporting a scale-free error.
y_pred = model.predict(X_val)

rmse = np.sqrt(mean_squared_error(y_val, y_pred))
r2 = r2_score(y_val, y_pred)

# NRMSE(mean): RMSE expressed as a fraction of the mean validation target.
nrmse_mean = rmse / y_val.mean()

print(
    f"RMSE        : {rmse:.2f}",
    f"NRMSE(mean) : {nrmse_mean:.4f}",
    f"R²          : {r2:.3f}",
    sep="\n",
)


RMSE        : 191661.41
NRMSE(mean) : 0.3548
R²          : 0.707


In [15]:
# Count missing values per feature column and show the five largest counts.
missing_per_column = X.isna().sum()
missing_per_column.sort_values(ascending=False).head()


bedrooms         0
bathrooms        0
sqft_living15    0
long             0
lat              0
dtype: int64

In [17]:
# NRMSE(range): RMSE divided by the spread (max - min) of the validation targets.
y_val_span = y_val.max() - y_val.min()
nrmse_range = rmse / y_val_span

print(f"NRMSE(range): {nrmse_range:.4f}")


NRMSE(range): 0.0381


In [19]:
# CV-RMSE: RMSE relative to the mean validation target. This is the same
# ratio that is reported as NRMSE(mean) elsewhere in this notebook.
y_val_mean = y_val.mean()
cv_rmse = rmse / y_val_mean

print(f"CV-RMSE     : {cv_rmse:.4f}")



CV-RMSE     : 0.3548


# XGBoost


In [3]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np


In [11]:
# Same 80/20 split and seed as used for the linear baseline above.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)


In [13]:
# Hyperparameters for the gradient-boosted tree regressor.
xgb_params = {
    "n_estimators": 500,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "reg:squarederror",
    "random_state": 42,
    "n_jobs": -1,
}

# Unfitted estimator; training happens in the next cell.
xgb_model = xgb.XGBRegressor(**xgb_params)


In [15]:
# Train the XGBoost regressor on the training portion of the split.
xgb_model.fit(X=X_train, y=y_train)


In [17]:
# Score the fitted XGBoost model on the held-out validation rows.
y_pred = xgb_model.predict(X_val)

xgb_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(xgb_mse)
# RMSE as a fraction of the mean validation target.
nrmse = rmse / y_val.mean()
r2 = r2_score(y_val, y_pred)

print(
    f"XGBoost RMSE        : {rmse:.2f}",
    f"XGBoost NRMSE(mean) : {nrmse:.4f}",
    f"XGBoost R²          : {r2:.3f}",
    sep="\n",
)


XGBoost RMSE        : 118260.17
XGBoost NRMSE(mean) : 0.2189
XGBoost R²          : 0.889


In [19]:
import pandas as pd

# Pair each feature column with the importance score reported by the fitted
# XGBoost model, ordered from most to least important.
xgb_importances = xgb_model.feature_importances_
importance = pd.DataFrame(
    {"feature": X.columns, "importance": xgb_importances}
).sort_values(by="importance", ascending=False)

# Show the ten highest-ranked features.
importance.head(10)


Unnamed: 0,feature,importance
8,grade,0.403019
5,waterfront,0.158878
2,sqft_living,0.105689
14,lat,0.064602
6,view,0.041456
1,bathrooms,0.040498
16,sqft_living15,0.036629
15,long,0.036297
9,sqft_above,0.019357
11,yr_built,0.018035


# Random Forest

In [23]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd


In [25]:
# Recreate the 80/20 train/validation split with the same fixed seed.
rf_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = rf_split


In [27]:
# Hyperparameters for the random forest regressor.
rf_params = {
    "n_estimators": 400,
    "max_depth": None,
    "min_samples_split": 5,
    "min_samples_leaf": 2,
    "max_features": "sqrt",
    "random_state": 42,
    "n_jobs": -1,
}

# Unfitted estimator; training happens in the next cell.
rf_model = RandomForestRegressor(**rf_params)


In [29]:
# Train the random forest on the training portion of the split.
rf_model.fit(X=X_train, y=y_train)


In [30]:
# Score the fitted random forest on the held-out validation rows.
y_pred = rf_model.predict(X_val)

rf_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(rf_mse)
# RMSE as a fraction of the mean validation target.
nrmse = rmse / y_val.mean()
r2 = r2_score(y_val, y_pred)

print(
    f"Random Forest RMSE        : {rmse:.2f}",
    f"Random Forest NRMSE(mean) : {nrmse:.4f}",
    f"Random Forest R²          : {r2:.3f}",
    sep="\n",
)


Random Forest RMSE        : 132513.17
Random Forest NRMSE(mean) : 0.2453
Random Forest R²          : 0.860


In [33]:
# Pair each feature column with the importance score reported by the fitted
# random forest, ordered from most to least important.
rf_importances = rf_model.feature_importances_
feature_importance = pd.DataFrame(
    {"feature": X.columns, "importance": rf_importances}
).sort_values(by="importance", ascending=False)

# Show the ten highest-ranked features.
feature_importance.head(10)


Unnamed: 0,feature,importance
2,sqft_living,0.177462
8,grade,0.174554
14,lat,0.131154
9,sqft_above,0.097273
16,sqft_living15,0.091785
1,bathrooms,0.075076
15,long,0.050555
6,view,0.034757
11,yr_built,0.03172
13,zipcode,0.02787
