In [None]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
import numpy as np
import joblib
import xgboost
from scipy.stats import zscore

### Preprocessing

In [None]:
def _load_street_split(csv_path):
    """Read one dataset split and label its last column as the target.

    Parameters
    ----------
    csv_path : str
        Path to a CSV whose "Unnamed: 0" column is used as the row index
        (presumably an index written by an earlier ``DataFrame.to_csv`` --
        TODO confirm).

    Returns
    -------
    pandas.DataFrame
        The loaded frame with its final column renamed to "Price".
    """
    frame = pd.read_csv(csv_path, index_col="Unnamed: 0")
    # Rename by position instead of by a label derived from the column count:
    # later cells take the target with iloc[:, -1], and a computed label
    # silently renames nothing when the header is not "0".."n-1".
    return frame.rename(columns={frame.columns[-1]: "Price"})


train_df = _load_street_split("./train_street_dataset.csv")
val_df = _load_street_split("./val_street_dataset.csv")

In [None]:
# Fixed seed so that Restart Kernel -> Run All fits the same models every time.
SEED = 42

# Gradient-boosted regressor.
# NOTE(review): 20 boosting rounds at learning_rate=0.01 is a very small total
# step -- confirm this is intentional and not a leftover smoke-test setting.
model = XGBRegressor(
    n_estimators=20,
    learning_rate=0.01,
    reg_lambda=1,
    random_state=SEED,
)

# Single decision tree used as a comparison model. scikit-learn permutes the
# candidate features at every split, so without a seed, ties between equally
# good splits may be resolved differently from one run to the next.
model_dt = DecisionTreeRegressor(random_state=SEED)

In [None]:
# Positional split: every column except the last is a feature, the last
# column is the regression target. The same rule is applied to both splits.
X_train, y_train = train_df.iloc[:, :-1], train_df.iloc[:, -1]
X_val, y_val = val_df.iloc[:, :-1], val_df.iloc[:, -1]

#### Fitting Models

In [None]:
# Fit both regressors on the identical training split so that their
# validation scores are directly comparable.
model.fit(X=X_train, y=y_train)
model_dt.fit(X=X_train, y=y_train)

### XGBoost Train Eval

In [None]:
# In-sample predictions of the XGBoost model (used for the training-set MSE).
y_train_pred = model.predict(X=X_train)

In [None]:
# Training-set mean squared error of the XGBoost model.
mse = mean_squared_error(y_train.values, y_train_pred)
# Echo the value as the cell's last expression: a bare assignment displays
# nothing, so the metric would be missing after a fresh Run All.
mse

0.20055699348449707

### XGBoost Validation Eval

In [None]:
# Validation-set predictions from each fitted model; the two calls are
# independent of one another.
y_pred_dt = model_dt.predict(X=X_val)
y_pred_xg = model.predict(X=X_val)

In [None]:
# Validation mean squared error of the XGBoost predictions.
mse = mean_squared_error(y_true=y_val.values, y_pred=y_pred_xg)

In [None]:
# Validation RMSE of the XGBoost model, computed straight from its own
# predictions. Reading the shared global `mse` here would report whichever
# model's MSE cell happened to run last.
rmse_xgb = np.sqrt(mean_squared_error(y_val.values, y_pred_xg))
rmse_xgb

350155.17816314695

### Decision Tree Validation Eval

In [None]:
# Validation mean squared error of the decision-tree predictions.
mse = mean_squared_error(y_true=y_val.values, y_pred=y_pred_dt)

In [None]:
# Validation RMSE of the decision tree, computed straight from its own
# predictions rather than from the shared global `mse`.
# NOTE(review): the previously recorded output of this cell was identical to
# the XGBoost RMSE, which is what a stale `mse` would produce -- re-run and
# confirm the two models really score differently.
rmse_dt = np.sqrt(mean_squared_error(y_val.values, y_pred_dt))
rmse_dt

350155.17816314695

In [None]:
# Persist the fitted XGBoost model next to the notebook.
# NOTE(review): this is a pickle-based artifact; only load it from a trusted
# location and, presumably, with a matching xgboost version -- confirm.
MODEL_PATH = "./xgboost_mask.joblib"
joblib.dump(model, MODEL_PATH)

['./xgboost_mask.joblib']

In [None]:
# Distinct target values present in the validation split, in ascending order.
distinct_targets = y_val.unique()
sorted(distinct_targets)

[1100000,
 1110000,
 1150000,
 1210000,
 1267500,
 1320000,
 1350000,
 1360000,
 1390000,
 1400000,
 1416000,
 1425000,
 1450000,
 1460000,
 1475000,
 1541500,
 1600000,
 1650000,
 1660000,
 1726000,
 1750000,
 1760000,
 1765000,
 1770000,
 1825000,
 1900000,
 1950000,
 1975000,
 1995000,
 2000000,
 2063500,
 2099000,
 2100000,
 2200000,
 2275000,
 2350000,
 2355000]