# Import Dependencies


In [1]:
%%capture
!pip install --upgrade scikit-learn

In [17]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import tree
from sklearn import ensemble
import math

# download dataset

In [3]:
import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

In [4]:
fetch_housing_data()

# Load in the Dataset

In [5]:
path_to_data = './datasets/housing/housing.csv'
csv = pd.read_csv(path_to_data)

Split into GT and Features

In [6]:
# Group Median_House_value using pd.cut
cut_size = csv['median_house_value'].max() / 10
csv['split'] = pd.cut(csv['median_house_value'],
  bins = [i * cut_size for i in range(10)] + [np.inf],
  labels = [i for i in range(10)]

)

In [7]:
GT = csv['median_house_value']
del csv['median_house_value']
splitter = sklearn.model_selection.StratifiedShuffleSplit(n_splits = 1, test_size = 0.1, random_state = 42)
for train, test in splitter.split(csv, csv['split']):
  TEST_GT = GT[test].reset_index(drop = True)
  GT = GT[train].reset_index(drop = True)
  TEST_CSV = csv.iloc[test].reset_index(drop = True)
  csv =csv.iloc[train].reset_index(drop = True)

# Data Preprocessing Pipeline

In [8]:
# Choose Headers
std_headers = [column for column in csv.columns if column != 'ocean_proximity']
one_hot_headers = ['ocean_proximity']

In [9]:
# Encode the Headers, to index into NumPy Arrays 
header2idx = {}
idx2header = {}
for idx, header in enumerate(std_headers):
  header2idx[header] = idx
  idx2header[idx] = header

In [10]:
class DataFrameToNumPy(BaseEstimator, TransformerMixin):
  def __init__(self, headers):
    self.headers = headers
  def fit(self, *args):
    return self
  def transform(self, X):
    all_values = []
    for header in self.headers:
      all_values += [np.expand_dims(X[header], axis = 1)]
    features = np.concatenate(all_values, axis = 1)

    return features

In [11]:
class BedRoomsPerRoom(BaseEstimator, TransformerMixin):
  def __init__(self):
    self.headers = ['bedrooms_per_room', 'bedrooms_per_household', 'rooms_per_household']
  def fit(self, *args):
    return self
  def transform(self, X, y = None):
    num_households = X[:, header2idx['households']]
    num_rooms = X[:, header2idx['total_rooms']]
    num_bedrooms = X[:, header2idx['total_bedrooms']]

    bedrooms_per_room = np.expand_dims(num_bedrooms / num_rooms, 1)
    bedrooms_per_household = np.expand_dims(num_bedrooms / num_households, 1)
    rooms_per_household = np.expand_dims(num_rooms / num_bedrooms, 1)
    
    return np.concatenate([X, bedrooms_per_room, bedrooms_per_household, rooms_per_household], axis = 1)

Transformer Path #1: main path

In [12]:
transformer_path1 = Pipeline([
    ('dataframe', DataFrameToNumPy(std_headers)),
    ('imputer', SimpleImputer(strategy = 'mean')),
    ('bedrooms', BedRoomsPerRoom()),
    ('normalizer', sklearn.preprocessing.StandardScaler())
])

Transformer Path #2: One-Hot Encode



In [13]:
transformer_path2 = Pipeline([
    ('dataframe', DataFrameToNumPy(one_hot_headers)),
    ('one_hot', sklearn.preprocessing.OneHotEncoder())
])

# Create Pipelines

Combine the Two Paths: 

In [14]:
transformer_path = FeatureUnion(transformer_list = [
    ('features_path', transformer_path1),
    ('one_hot_path', transformer_path2)
])

In [15]:
# Transform
X = transformer_path.fit_transform(csv)
TEST_X = transformer_path.fit_transform(TEST_CSV)

# Create Splits
- Stratified Shuffle Split on Housing Prices Labels.


In [16]:
splitter = sklearn.model_selection.StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
FOLDS = []
for train, test in splitter.split(np.zeros(X.shape[0]), GT):
  FOLDS += [((X[train], GT[train]), (X[test], GT[test]))]
cur_fold = 0
(train_x, train_y), (val_x, val_y) = FOLDS[cur_fold]



# Create the Model


Linear Regression 

In [80]:
model = sklearn.linear_model.LinearRegression()
model.fit(train_x, train_y)

LinearRegression()

In [81]:
predicted = model.predict(val_x)

In [83]:
error = math.sqrt(sklearn.metrics.mean_squared_error(val_y, predicted))
print(error)

14686.403562299074


Decision Trees(Overfit)

In [19]:
model = tree.DecisionTreeRegressor()
model.fit(train_x, train_y)

DecisionTreeRegressor()

In [86]:
predicted = model.predict(val_x)
error = math.sqrt(sklearn.metrics.mean_squared_error(val_y, predicted))
print(error)

0.0


Random Forest


In [20]:
model = sklearn.ensemble.RandomForestRegressor(n_estimators = 10)
model.fit(train_x, train_y)

RandomForestRegressor(n_estimators=10)

In [21]:
predicted = model.predict(val_x)
error = math.sqrt(sklearn.metrics.mean_squared_error(val_y, predicted))

# Fine tune results and Inference

In [40]:
grid = {
    'n_estimators': [100],
    'max_features': [17]
}
grid_search = sklearn.model_selection.GridSearchCV(
    sklearn.ensemble.RandomForestRegressor(),
    grid,
    cv = 5,
    scoring = 'neg_mean_squared_error',
    return_train_score = True
)

In [41]:
grid_search.fit(train_x, train_y)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'max_features': [17], 'n_estimators': [100]},
             return_train_score=True, scoring='neg_mean_squared_error')

In [42]:
grid_search.best_estimator_

RandomForestRegressor(max_features=17)

In [43]:
for score, params in zip(grid_search.cv_results_['mean_test_score'], grid_search.cv_results_['params']) : print(np.sqrt(-score), params)

12751.779579886226 {'max_features': 17, 'n_estimators': 100}


In [51]:
# save and download the best model
import joblib
joblib.dump(grid_search.best_estimator_, 'model.pkl')

['model.pkl']