In [72]:
import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("databases", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

In [73]:
def fetch_housing_data(housing_url = HOUSING_URL, housing_path = HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path = housing_path)
    housing_tgz.close()

In [74]:
import pandas as pd
import numpy as np

In [75]:
def load_housing_data(housing_path = HOUSING_PATH):
    fetch_housing_data()
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [76]:
housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [77]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [78]:
housing["ocean_proximity"].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [79]:
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [80]:
import matplotlib.pyplot as plt
#housing.hist(bins = 50, figsize = (20, 15))
#plt.show()

# CREATING A TEST SET BY CREATING A FUNCTION
import numpy as np
def split_train_test(data, ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

train_set, test_set = split_train_test(housing, 0.3)
print(len(train_set), "train + ", len(test_set), "test")


In [81]:
#CREATING A TEST AND TRAIN SET USING SKLEARN's TRAIN_TEST_SPLIT INBUILT FUNCTION
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size = 0.3, random_state = 42)

In [82]:
#USING STRATIFIED TRAIN TEST SPLITTING TO GET ACTUAL DEISTRIBUTION OF DATA, INSTEAD OF A RANDOM ONE

#The following code createsan income category attribute by dividing the median income by 1.5 (to limit the number of incomecategories), and rounding up using ceil (to have discrete categories), and then merging all the categoriesgreater than 5 into category 5
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace = True)
#housing.hist(bins = 50, figsize = (20, 15))
#plt.show()

In [83]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.3, random_state = 42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

housing["income_cat"].value_counts()/len(housing)

for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis = 1, inplace = True)

In [84]:
housing = strat_train_set.copy()

In [85]:
#housing.plot(kind = "scatter", x = "longitude", y= "latitude", alpha = 0.1)

In [86]:
#housing.plot(kind="scatter",x="longitude",y="latitude",alpha=0.4,s=housing["population"]/100,label="population",figsize=(10,7),c="median_house_value",cmap=plt.get_cmap("jet"),colorbar=True,)
#plt.legend()

In [87]:
housing["rooms_per_household"]=housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"]=housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending = False)


median_house_value          1.000000
median_income               0.687873
rooms_per_household         0.144427
total_rooms                 0.136801
housing_median_age          0.116305
households                  0.066788
total_bedrooms              0.049523
population_per_household   -0.024005
population                 -0.024592
longitude                  -0.052345
latitude                   -0.137864
bedrooms_per_room          -0.262723
Name: median_house_value, dtype: float64

In [100]:
housing = strat_train_set.drop("median_house_value", axis = 1)
housing_labels = strat_train_set["median_house_value"]

In [101]:
# DATA CLEANING BY SEPERATE IMPUTING AND LABEL ENCODING AND 1HOT LABEL ENCODING

from sklearn.preprocessing import Imputer
imputer = Imputer(strategy = "median")
housing_num = housing.drop("ocean_proximity", axis = 1)
imputer.fit(housing_num)
#imputer.statistics_
#housing_num.median().values
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns = housing_num.columns)

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded



array([0, 1, 0, ..., 1, 4, 0])

In [102]:
print(encoder.classes_)

['<1H OCEAN' 'INLAND' 'ISLAND' 'NEAR BAY' 'NEAR OCEAN']


In [103]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))
housing_cat_1hot.toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.]])

In [104]:
from sklearn.preprocessing import FunctionTransformer

rooms_ix, bedrooms_ix, population_ix, household_ix = [
    list(housing.columns).index(col)
    for col in ("total_rooms", "total_bedrooms", "population", "households")]

def add_extra_features(X, add_bedrooms_per_room=True):
    rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
    population_per_household = X[:, population_ix] / X[:, household_ix]
    if add_bedrooms_per_room:
        bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
        return np.c_[X, rooms_per_household, population_per_household,
                     bedrooms_per_room]
    else:
        return np.c_[X, rooms_per_household, population_per_household]

attr_adder = FunctionTransformer(add_extra_features, validate=False,
                                 kw_args={"add_bedrooms_per_room": False})
housing_extra_attribs = attr_adder.fit_transform(housing.values)

In [105]:
housing_extra_attribs = pd.DataFrame(
    housing_extra_attribs,
    columns=list(housing.columns)+["rooms_per_household", "population_per_household"],
    index=housing.index)
housing_extra_attribs.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,population_per_household
8577,-118.41,33.89,31,1428,320,677,331,7.2316,<1H OCEAN,4.3142,2.04532
5960,-117.83,34.11,29,2671,437,1484,445,4.9844,INLAND,6.00225,3.33483
4753,-118.33,34.04,31,1090,251,955,239,2.913,<1H OCEAN,4.56067,3.99582
15423,-117.26,33.2,13,3163,725,1675,629,2.8214,<1H OCEAN,5.02862,2.66296
9788,-120.51,35.91,39,768,162,264,118,5.3245,INLAND,6.50847,2.23729


In [106]:
from sklearn.base import BaseEstimator, TransformerMixin

# Create a class to select numerical or categorical columns 
class OldDataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        #print(X[self.attribute_names].values)
        return X[self.attribute_names].values

In [107]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

old_num_pipeline = Pipeline([
        ('selector', OldDataFrameSelector(num_attribs)),
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', FunctionTransformer(add_extra_features, validate=False)),
        ('std_scaler', StandardScaler()),
    ])

old_cat_pipeline = Pipeline([
        ('selector', OldDataFrameSelector(cat_attribs)),
        ('cat_encoder', OneHotEncoder(sparse=False)),
    ])



In [108]:
from sklearn.pipeline import FeatureUnion

old_full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", old_num_pipeline),
        ("cat_pipeline", old_cat_pipeline),
    ])

In [109]:
old_housing_prepared = old_full_pipeline.fit_transform(housing)
old_housing_prepared

array([[ 0.58073699, -0.81762236,  0.17952267, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.87028331, -0.7144537 ,  0.02047048, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.62067442, -0.74728009,  0.17952267, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 1.61411781, -1.06616502, -0.37715998, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.64064313, -0.90203308,  0.57715313, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.72051797, -0.80355391,  0.57715313, ...,  0.        ,
         0.        ,  0.        ]])

In [110]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(old_housing_prepared, housing_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [114]:
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared=old_full_pipeline.transform(some_data)
print("Predictions: ", lin_reg.predict(some_data_prepared))
print("Labels: ", list(some_labels))

Predictions:  [374178.99111712 201980.40760298 176111.53526793 169535.38547485
 267767.44978406]
Labels:  [500001.0, 203000.0, 192500.0, 121900.0, 250000.0]


In [115]:
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(old_housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

68671.29479037311

In [116]:
from sklearn.tree import DecisionTreeRegressor

tree_reg= DecisionTreeRegressor()
tree_reg.fit(old_housing_prepared, housing_labels)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [119]:
housing_predictions = tree_reg.predict(old_housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
print("Predictions: ", tree_reg.predict(some_data_prepared))
print("Labels: ", list(some_labels))

Predictions:  [500001. 203000. 192500. 121900. 250000.]
Labels:  [500001.0, 203000.0, 192500.0, 121900.0, 250000.0]


In [122]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, old_housing_prepared, housing_labels, scoring = "neg_mean_squared_error", cv = 10)
tree_rmse_scores = np.sqrt(-scores)
lin_reg_scores = cross_val_score(lin_reg, old_housing_prepared, housing_labels, scoring = "neg_mean_squared_error", cv = 10)
lin_rmse_scores = np.sqrt(-lin_reg_scores)

In [125]:
def display_scores(scores):
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Standard Deviation: ", scores.std())
    
display_scores(tree_rmse_scores)
print("********************************************")
display_scores(lin_rmse_scores)

Scores:  [68210.21613129 70135.53503599 71392.17438073 69439.30098903
 74197.48310895 73414.64154739 72854.84337719 69793.39010878
 71235.76531207 72606.674122  ]
Mean:  71328.00241134314
Standard Deviation:  1833.8934121683565
********************************************
Scores:  [65829.44152145 77056.74494539 64126.62164766 70328.71895431
 68303.09942861 70916.94402158 70104.24228447 65102.15137703
 69723.40940468 71281.95671343]
Mean:  69277.33302986124
Standard Deviation:  3549.835517575307
