# 1. Package preparation

In [31]:
# ! pip install missingno
# ! pip install geopy
# ! pip install geopandas
# ! pip install geoplot   # need conda install -c conda-forge cartopy 
# ! pip install shapely
import numpy as np
import pandas as pd
from matplotlib.pylab import plt
import missingno as msno 
import warnings # ignore the warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
warnings.filterwarnings('ignore')

In [32]:
UShousing2 = pd.read_csv('UShousing2.csv')
UShousing2.shape

(6383, 12)

In [33]:
UShousing3 = UShousing2.copy()
X = UShousing3.drop('rent_amount',axis=1)
y = UShousing2['rent_amount']
X.shape, y.shape

((6383, 11), (6383,))

# 2. Feature Engineering

a. Convert categorical features to numerics
b. Standardize Numerical features

In [34]:
UShousing2 = UShousing2[UShousing2['rent_amount'] > 0]
xreg = pd.get_dummies(data=X[X.columns.difference(["identity"])])

# 3. Gradient Boosting Model Implement

##### Split Training set & Testing set

In [35]:
X_train, X_test, y_train, y_test = train_test_split(xreg, y, test_size=0.3, random_state= 42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4468, 2357), (1915, 2357), (4468,), (1915,))

In [36]:
X_train.shape

(4468, 2357)

In [37]:
numerics = ['int64','float64']

train_num = X_train.select_dtypes(include=numerics)
test_num =  X_test[train_num.columns]


scaler = StandardScaler()
x_train_scaled = pd.DataFrame(scaler.fit_transform(train_num),columns = train_num.columns)
x_test_scaled = pd.DataFrame(scaler.transform(test_num), columns = test_num.columns)

x_train_scaled.shape,x_test_scaled.shape
x_train_scaled.head()

Unnamed: 0,bathroom_count,bedroom_count,rent_duration,year
0,-0.63832,0.05788,-0.013735,-1.306077
1,0.701282,0.772273,-0.013735,1.258437
2,-0.63832,-0.656512,-0.013735,1.258437
3,-0.63832,-0.656512,-0.013735,-1.306077
4,1.371084,0.05788,-0.013735,-0.02382


In [38]:
X_train.loc[:,'bathroom_count'] = x_train_scaled.loc[:,'bathroom_count'].values
X_train.loc[:,'bedroom_count'] = x_train_scaled.loc[:,'bedroom_count'].values
X_train.loc[:,'rent_duration'] = x_train_scaled.loc[:,'rent_duration'].values
X_test.loc[:,'bathroom_count'] = x_test_scaled.loc[:,'bathroom_count'].values
X_test.loc[:,'bedroom_count'] = x_test_scaled.loc[:,'bedroom_count'].values
X_test.loc[:,'rent_duration'] = x_test_scaled.loc[:,'rent_duration'].values

In [39]:
# Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor

regressor = GradientBoostingRegressor(
    max_depth=10000,
    n_estimators=5,
    learning_rate=0.1
)
regressor.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=10000,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=5,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [40]:
errors = [mean_squared_error(y_test, y_pred) for y_pred in regressor.staged_predict(X_test)] 

best_n_estimators = np.argmin(errors)


In [41]:
best_regressor = GradientBoostingRegressor(
    max_depth=10000,
    n_estimators=best_n_estimators,
    learning_rate=0.1
)
best_regfit = best_regressor.fit(X_train, y_train)

In [42]:
y_pred = best_regressor.predict(X_test)
mean_squared_error(y_test, y_pred)

875435.7816481739

In [43]:
import sklearn.metrics as metrics
print('MAE:',metrics.mean_absolute_error(y_test, y_pred))
print('MSE:',metrics.mean_squared_error(y_test, y_pred))

MAE: 637.9010885997321
MSE: 875435.7816481739
