In [164]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error

%matplotlib inline

In [165]:
# Raw-file URL of the housing CSV; the download cell passes this variable to wget.
asdf = (
    'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv'
)

In [232]:
!wget $asdf

--2023-10-23 23:58:04--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8003::154, 2606:50c0:8002::154, 2606:50c0:8000::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8003::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1.4M) [text/plain]
Saving to: 'housing.csv.1'

     0K .......... .......... .......... .......... ..........  3% 1.34M 1s
    50K .......... .......... .......... .......... ..........  7% 1.67M 1s
   100K .......... .......... .......... .......... .......... 10% 5.85M 1s
   150K .......... .......... .......... .......... .......... 14% 3.20M 1s
   200K .......... .......... .......... .......... .......... 17% 3.60M 0s
   250K .......... .......... .......... .......... .......... 21% 2.87M 0s
   300K .......... .......... .......... .......... .......... 25% 7.05M 0s
   350K ......

In [280]:
# Load the downloaded CSV from the working directory into the raw frame.
df = pd.read_csv(filepath_or_buffer='housing.csv')

In [281]:
 df.columns = df.columns.str.lower().str.replace(' ','_')
# strings = list(df.dtypes[df.dtypes == 'object'].index)
# df['ocean_proximity'] = df['ocean_proximity'].str.lower().str.replace(' ','_')
# strings

In [286]:

# Keep only rows whose ocean_proximity is '<1H OCEAN' or 'INLAND',
# then replace every missing value in the remaining rows with 0.
keep_rows = df['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])
data = df.loc[keep_rows].fillna(0)

In [287]:
# data = data.reset_index(drop = True)

In [None]:

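# Not needed: missing values (total_bedrooms included) are already zero-filled
# by the frame-wide fillna(0) applied when `data` is built.
# data['total_bedrooms'] = data.total_bedrooms.fillna(0)
# data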

In [None]:
# Replace the target with log(1 + value).
# The source is the untouched `df` column, so re-running this cell never
# applies the log twice; pandas aligns the Series on the row index, which
# means only the rows that survived the filter into `data` are used.
data = data.assign(median_house_value=np.log1p(df['median_house_value']))

In [None]:
# 60/20/20 split: 20% is held out for test, then 25% of the remaining 80%
# becomes validation. Both splits use the same fixed seed.
data_full_train, data_test = train_test_split(data, test_size=0.2, random_state=1)
data_train, data_val = train_test_split(data_full_train, test_size=0.25, random_state=1)

# Fresh 0..n-1 indices for each part (this also detaches them from `data`).
data_train = data_train.reset_index(drop=True)
data_val = data_val.reset_index(drop=True)
data_test = data_test.reset_index(drop=True)

# pop() returns the target column and removes it from the feature frame in one step.
y_train = data_train.pop('median_house_value').values
y_val = data_val.pop('median_house_value').values
y_test = data_test.pop('median_house_value').values

In [None]:
# DictVectorizer turns the per-row dicts into a sparse feature matrix.
# It is fitted on the training rows only and then reused for validation.
dv = DictVectorizer(sparse=True)

train_dicts = data_train.to_dict(orient='records')
x_train = dv.fit_transform(train_dicts)

val_dicts = data_val.to_dict(orient='records')
x_val = dv.transform(val_dicts)

# Depth-1 tree (a single split); fit() returns the estimator itself.
dt = DecisionTreeRegressor(max_depth=1).fit(x_train, y_train)
dt

In [None]:
# y_pred = dt.predict(x_train)

In [None]:
from sklearn.tree import export_text

In [None]:
# Render the fitted tree as text, labelling splits with the vectorizer's feature names.
tree_rules = export_text(dt, feature_names=dv.feature_names_)
print(tree_rules)

In [None]:
# Baseline forest: 10 trees, fixed seed, n_jobs=-1.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf = rf.fit(x_train, y_train)  # fit() returns the same estimator
rf

In [None]:
# Validation-set predictions of the current forest (on the log1p target scale).
y_pred = rf.predict(X=x_val)

In [None]:
# Validation RMSE of the forest predictions held in `y_pred`.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
# Bare last expression so the value is actually shown as the cell output
# (the final XGBoost cell displays its RMSE the same way).
rmse

In [None]:
# Sweep the number of trees from 10 to 200 in steps of 10 and print the
# validation RMSE for each setting as "<n_estimators>,  <rmse>".
for n in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1).fit(x_train, y_train)
    y_pred = rf.predict(x_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print('%s,  %.3f' % (n, rmse))

In [None]:

# For every max_depth, train forests with 10..200 trees (step 10) and
# report the validation RMSE averaged over all tree counts.
for d in [10, 15, 20, 25]:
    scores = []
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(
            n_estimators=n, max_depth=d, random_state=1, n_jobs=-1
        ).fit(x_train, y_train)
        y_pred = rf.predict(x_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        scores.append(rmse)
    mean_rmse = sum(scores) / len(scores)
    print('depth%s,  %.3f' % (d, mean_rmse))


In [None]:
# Forest used for the feature-importance inspection: 10 trees, depth capped at 20.
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf = rf.fit(x_train, y_train)  # fit() returns the same estimator
rf

In [None]:
# Pair every importance with the feature it actually belongs to.
# The forest was trained on the DictVectorizer output, so the importances
# follow the vectorizer's feature order (including the expanded
# ocean_proximity=... columns), not the order of `data.columns`, which
# also still contains the target column.
# Materialised as a list so the pairs can be iterated more than once.
n = list(zip(dv.feature_names_, rf.feature_importances_))

In [None]:
# Print one "Feature: <name>, Score: <importance>" line per (name, importance) pair in `n`.
for feature, importance in n:
    print('Feature: %s, Score: %.5f' % (feature, importance))

In [None]:
# XGBoost does not accept feature names containing '[', ']' or '<', and the
# one-hot name for '<1H OCEAN' contains '<'. Map those characters (and '>')
# to '_' before building the DMatrix objects.
features = dv.get_feature_names_out()
bracket_to_underscore = str.maketrans({'[': '_', ']': '_', '<': '_', '>': '_'})
feature_names = [name.translate(bracket_to_underscore) for name in list(features)]

dtrain = xgb.DMatrix(x_train, label=y_train, feature_names=feature_names)
dval = xgb.DMatrix(x_val, label=y_val, feature_names=feature_names)

In [None]:
# (DMatrix, label) pairs passed as `evals` to xgb.train; the labels name each
# dataset in the evaluation log.
watchlist = [
    (dtrain, 'train'),
    (dval, 'val'),
]

In [None]:
%%capture output

xgb_params = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round=100,
                  verbose_eval=5,
                  evals=watchlist)

In [None]:
# `output` is the capture object created by the `%%capture output` training
# cell; its stdout holds whatever that cell printed (the evaluation log).
s = output.stdout
print(s)

In [None]:
# Validation RMSE of the trained booster (on the log1p target scale),
# shown as the cell output.
y_pred = model.predict(dval)
rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
rmse