In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
import numpy as np
plot_scatter_matrix = pd.plotting.scatter_matrix

In [2]:
data = pd.read_csv("housing.csv")
data["income_cat"] = pd.cut(data["median_income"], bins=[0, 1.5, 3, 4.5, 6, np.inf], labels=[1,2,3,4,5])
#creating new attributes
households = data["households"]
data["rooms_per_household"] = data["total_rooms"]/households
data["bedrooms_per_room"] = data["total_bedrooms"]/data["total_rooms"]
data["population_per_household"] = data["population"] / data["households"]

In [3]:
# creating test and train data
split=StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data,data["income_cat"]):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]

In [4]:
# correlation matrix
corr_matrix = data.corr()

In [5]:
#seperating predictors and labels 
housing = strat_train_set.drop("median_house_value",axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [6]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
housing_num = housing.drop("ocean_proximity",axis=1)
imputer.fit(housing_num)
imputer.statistics_
X=imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

In [7]:
# Encoding categorical(textual) data into numerical 
from sklearn.preprocessing import OneHotEncoder
category_encoder = OneHotEncoder();
housing_cat = housing[["ocean_proximity"]]
hot_encoded_cat = category_encoder.fit_transform(housing_cat)

In [8]:
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6
from sklearn.base import BaseEstimator, TransformerMixin
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
    

In [9]:
# We can create a transformation pipeline for our data
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

text_num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")), 
    ("attribs_adder",CombinedAttributesAdder()), 
    ("std_scaler",StandardScaler()),
])

In [10]:
housing_num_train = text_num_pipeline.fit_transform(housing_num)
housing_num_train.shape

(16512, 15)

In [11]:
# Apply all the transformation to the columns appropriately
from sklearn.compose import ColumnTransformer
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
    ("num",text_num_pipeline, num_attribs), 
    ("cat",OneHotEncoder(),cat_attribs),
])
housing_final = full_pipeline.fit_transform(housing)
print(housing.shape, housing_final.shape)

(16512, 13) (16512, 20)


In [12]:
# Implementing a linear regression model 
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_final, housing_labels)

In [13]:
# Testing few instances of data with out linear regression model 
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
housing_predictions = lin_reg.predict(some_data_prepared)

In [14]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(some_labels, housing_predictions))

45353.31097519148

In [15]:
# using a decision tree regressor
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_final, housing_labels)

tree_predictions = tree_reg.predict(some_data_prepared)
tree_predictions

array([ 72100., 279600.,  82700., 112500., 238300.])

In [16]:
tree_errrs = mean_squared_error(some_labels, tree_predictions)
tree_errrs

0.0

In [22]:
# We have to do model validation now to process how well our model reacts to new data
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg,housing_final, housing_labels,scoring="neg_mean_squared_error",cv=10)
tree_rmse_scores = np.sqrt(-scores)
lin_scores = cross_val_score(lin_reg,housing_final,housing_labels,scoring="neg_mean_squared_error",cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)

In [21]:
lin_rmse_scores

array([70862.11646446, 63728.50678666, 67364.62254497, 68293.04057929,
       65464.63607758, 71798.73143182, 69472.43071194, 68151.42605482,
       65399.78281896, 69733.34187153])

In [23]:
tree_rmse_scores

array([72916.24747229, 69555.69837672, 69042.8104286 , 72105.3573041 ,
       68892.3784985 , 76529.90097102, 70482.37951926, 74075.58798112,
       69594.24689852, 71736.22382815])

In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
param_grid = [
    {"n_estimators":[3,10,30], "max_features":[2,4,6,8]}, 
    {"bootstrap":[False], "n_estimators":[3,10], "max_features":[2,3,4]},
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg,param_grid, cv=5,scoring="neg_mean_squared_error",return_train_score=True)
grid_search.fit(housing_final, housing_labels)

In [40]:
grid_search.best_estimator_ 

In [38]:
res = grid_search.cv_results_
for mean_score, params in zip(res["mean_test_score"], res["params"]):
    print(np.sqrt(-mean_score),params)

65818.86430106872 {'max_features': 2, 'n_estimators': 3}
57035.59695333669 {'max_features': 2, 'n_estimators': 10}
54420.71752147266 {'max_features': 2, 'n_estimators': 30}
62348.25822512363 {'max_features': 4, 'n_estimators': 3}
54894.156102410016 {'max_features': 4, 'n_estimators': 10}
52668.42176279951 {'max_features': 4, 'n_estimators': 30}
61187.226278082846 {'max_features': 6, 'n_estimators': 3}
54235.03010505819 {'max_features': 6, 'n_estimators': 10}
51955.543673437605 {'max_features': 6, 'n_estimators': 30}
59833.7341373026 {'max_features': 8, 'n_estimators': 3}
53395.77379274855 {'max_features': 8, 'n_estimators': 10}
51860.03823460632 {'max_features': 8, 'n_estimators': 30}
63722.588180068335 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
55976.95734775608 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
62753.77788889966 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
54824.63910813478 {'bootstrap': False, 'max_features': 3, 'n_estimators':

In [48]:
#Let's evaluate the data with the model that our grid search returned to us 
best_model = grid_search.best_estimator_
X_test = strat_test_set.drop("median_house_value",axis=1)
Y_test = strat_test_set["median_house_value"].copy()
X_test_prepared = full_pipeline.transform(X_test)

In [59]:
final_predictions = best_model.predict(X_test_prepared)
scores = mean_squared_error(final_predictions,Y_test)
scores = np.sqrt(scores)

In [61]:
scores

49243.9162471763