In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error

%matplotlib inline

In [231]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv'

In [232]:
!wget $data

--2023-10-23 23:58:04--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8003::154, 2606:50c0:8002::154, 2606:50c0:8000::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8003::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1.4M) [text/plain]
Saving to: 'housing.csv.1'

     0K .......... .......... .......... .......... ..........  3% 1.34M 1s
    50K .......... .......... .......... .......... ..........  7% 1.67M 1s
   100K .......... .......... .......... .......... .......... 10% 5.85M 1s
   150K .......... .......... .......... .......... .......... 14% 3.20M 1s
   200K .......... .......... .......... .......... .......... 17% 3.60M 0s
   250K .......... .......... .......... .......... .......... 21% 2.87M 0s
   300K .......... .......... .......... .......... .......... 25% 7.05M 0s
   350K ......

In [7]:
df = pd.read_csv('housing.csv')

In [8]:
df.columns = df.columns.str.lower().str.replace(' ','_')
strings = list(df.dtypes[df.dtypes == 'object'].index)
df['ocean_proximity'] = df['ocean_proximity'].str.lower().str.replace(' ','_')

In [9]:
data = df[(df['ocean_proximity'] == 'inland') | (df['ocean_proximity'] == '<1h_ocean')]

In [10]:
data = data.reset_index(drop = True)

In [11]:
data.total_bedrooms.isnull().sum()
data['total_bedrooms'] = data.total_bedrooms.fillna(0)

In [12]:
data['median_house_value'] = np.log1p(df.median_house_value)

In [13]:
data_full_train, data_test = train_test_split(data, test_size=0.2, random_state=1)
data_train, data_val = train_test_split(data_full_train, test_size=0.25, random_state=1)

data_train = data_train.reset_index(drop=True)
data_val = data_val.reset_index(drop=True)
data_test = data_test.reset_index(drop=True)

y_train = data_train.median_house_value.values
y_val = data_val.median_house_value.values
y_test = data_test.median_house_value.values

del data_train['median_house_value']
del data_val['median_house_value']
del data_test['median_house_value']

In [14]:
train_dicts = data_train.to_dict(orient = 'records')
val_dicts = data_val.to_dict(orient = 'records')

dv = DictVectorizer(sparse = True)
x_train = dv.fit_transform(train_dicts)

x_val = dv.transform(val_dicts)

dt = DecisionTreeRegressor(max_depth=1)
dt.fit(x_train, y_train)

In [15]:
y_pred = dt.predict(x_train)

In [16]:
from sklearn.tree import export_text

In [17]:
print(export_text(dt, feature_names=dv.get_feature_names_out()))

|--- latitude <= 34.05
|   |--- value: [12.24]
|--- latitude >  34.05
|   |--- value: [11.96]



In [18]:
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(x_train, y_train)

In [19]:
y_pred = rf.predict(x_val)

In [20]:
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

In [21]:
for n in range(10,201,10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(x_train, y_train)
    
    y_pred = rf.predict(x_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    
    print('%s,  %.3f' %( n, rmse))

10,  0.375
20,  0.367
30,  0.365
40,  0.362
50,  0.363
60,  0.363
70,  0.363
80,  0.362
90,  0.361
100,  0.361
110,  0.361
120,  0.361
130,  0.361
140,  0.362
150,  0.362
160,  0.362
170,  0.362
180,  0.361
190,  0.361
200,  0.361


In [22]:

for d in [10,15,20,25]:
    mean = []
    for n in range(10,201,10):
        rf = RandomForestRegressor(n_estimators=n, max_depth=d, random_state=1, n_jobs=-1)
        rf.fit(x_train, y_train)
    
        y_pred = rf.predict(x_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        mean.append(rmse)
    mean_rmse = sum(mean)/len(mean)
    print('depth%s,  %.3f' %( d, mean_rmse))


depth10,  0.376
depth15,  0.362
depth20,  0.363
depth25,  0.363


In [23]:
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(x_train, y_train)

In [24]:
n = zip(data.columns, rf.feature_importances_)

In [25]:
for i,v in n:
 print('Feature: %s, Score: %.5f' % (i,v))

Feature: longitude, Score: 0.03269
Feature: latitude, Score: 0.05922
Feature: housing_median_age, Score: 0.35374
Feature: total_rooms, Score: 0.30847
Feature: total_bedrooms, Score: 0.09230
Feature: population, Score: 0.03407
Feature: households, Score: 0.00434
Feature: median_income, Score: 0.04456
Feature: median_house_value, Score: 0.03257
Feature: ocean_proximity, Score: 0.03803


In [26]:
features = dv.get_feature_names_out()
feature_names = list(features)
feature_names = [name.replace('[', '_').replace(']', '_').replace('<', '_').replace('>', '_') for name in feature_names]
feature_names
dtrain = xgb.DMatrix(x_train, label=y_train, feature_names=feature_names)
dval = xgb.DMatrix(x_val, label=y_val, feature_names=feature_names)

In [27]:
watchlist = [(dtrain, 'train'), (dval, 'val')]

In [31]:
%%capture output

xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round=100,
                  verbose_eval=5,
                  evals=watchlist)

In [32]:
s = output.stdout
print(s)

[0]	train-rmse:0.49853	val-rmse:0.50223
[5]	train-rmse:0.40381	val-rmse:0.42145
[10]	train-rmse:0.37358	val-rmse:0.40265
[15]	train-rmse:0.35665	val-rmse:0.39826
[20]	train-rmse:0.34217	val-rmse:0.39450
[25]	train-rmse:0.33208	val-rmse:0.39455
[30]	train-rmse:0.32231	val-rmse:0.39439
[35]	train-rmse:0.31186	val-rmse:0.39359
[40]	train-rmse:0.30599	val-rmse:0.39571
[45]	train-rmse:0.29870	val-rmse:0.39431
[50]	train-rmse:0.29095	val-rmse:0.39347
[55]	train-rmse:0.28477	val-rmse:0.39303
[60]	train-rmse:0.27834	val-rmse:0.39347
[65]	train-rmse:0.27055	val-rmse:0.39222
[70]	train-rmse:0.26387	val-rmse:0.39180
[75]	train-rmse:0.25685	val-rmse:0.39130
[80]	train-rmse:0.24961	val-rmse:0.39142
[85]	train-rmse:0.24449	val-rmse:0.39098
[90]	train-rmse:0.23810	val-rmse:0.39176
[95]	train-rmse:0.23189	val-rmse:0.39154
[99]	train-rmse:0.22779	val-rmse:0.39167



In [33]:
y_pred = model.predict(dval)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
rmse

0.3916687158994265