In [52]:
# HW4 is about machine learning algorthms and cleaning data
#--------
# notes to take on HW4:
# - pg.63: 3 ways to clean data and methods to do so
# - pg.63-64: how to replace missing all values to the median and
# back into the pandas dataframe
# - pg.64-65: scikit-learn design
# - pg.66: what/how is used to convert text to numbers
# - pg.66: why use fit_transform() for training, and transform() for testing
# - pg.67: what is one-hot encoding (whats a sparse matrix)
# - pg.68: how to create a custom transformers
# - pg.69: hyperparameter
# - pg.69: how to have attributes to have the same scale
# - pg.70: what is a pipeline good for?
# - pg.70: what exactly does the fit_transform() do?
# - general notes:
# - Since Scikit-Learn 0.20, the sklearn.preprocessing.Imputer class 
# was replaced by the sklearn.impute.SimpleImputer class.

In [64]:
import numpy as np
import urllib
import sklearn
import pandas as pd
import os
import tarfile
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [15]:
# getting data
DATA_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_URL = DATA_ROOT + "datasets/housing/housing.tgz"
HOUSING_PATH = os.path.join("datasets", "housing")

def fetch_data(housing_url = HOUSING_URL, housing_path = HOUSING_PATH):
    os.makedirs(housing_path, exist_ok = True)
    tgz_path = os.path.join(housing_path, 'housing.tgz')
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

fetch_data()

def load_data(housing_path = HOUSING_PATH):
    csv_path = os.path.join(housing_path, 'housing.csv')
    return pd.read_csv(csv_path)

housing = load_data()
housing.head()
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [16]:
# creating training and testing indexes (sets) using housing[income_cat]
housing["income_cat"] = pd.cut(housing["median_income"],
                              bins = [0., 1.5, 3.0, 4.5, 6.0, np.inf],
                              labels=[1, 2, 3, 4, 5])

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [17]:
# dropping housing data set and copying the data
# would be used to ensure data is unaffected when adjusting and learning data
# basic: getting rid of housing and making housing_labels
housing = strat_train_set.drop('median_house_value', axis=1)
housing_labels = strat_train_set['median_house_value'].copy()

In [20]:
housing_cat = housing['ocean_proximity']
housing_cat.head(10)

12655        INLAND
15502    NEAR OCEAN
2908         INLAND
14053    NEAR OCEAN
20496     <1H OCEAN
1481       NEAR BAY
18125     <1H OCEAN
5830      <1H OCEAN
17989     <1H OCEAN
4861      <1H OCEAN
Name: ocean_proximity, dtype: object

In [21]:
# notice the difference using [[]] instead of []
housing_cat = housing[['ocean_proximity']]
housing_cat.head(10)

Unnamed: 0,ocean_proximity
12655,INLAND
15502,NEAR OCEAN
2908,INLAND
14053,NEAR OCEAN
20496,<1H OCEAN
1481,NEAR BAY
18125,<1H OCEAN
5830,<1H OCEAN
17989,<1H OCEAN
4861,<1H OCEAN


In [27]:
# converting text to numbers with ordinal encoder
ordinal_encoder = OrdinalEncoder()
# The fit method is calculating the mean and variance of 
# each of the features present in our data. The transform method is transforming 
# all the features using the respective mean and variance.
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]

array([[1.],
       [4.],
       [1.],
       [4.],
       [0.],
       [3.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [28]:
# to get a list of all the attributes above we can use categories_
ordinal_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

In [30]:
# applying one-hot encoder to the attributes above
#----
# this creates a SciPy sparse matrix 
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

In [32]:
# if we want to convert the above into a dense numpy 2d matrix
housing_cat_1hot.toarray()

array([[0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [39]:
housing.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity', 'income_cat'],
      dtype='object')

In [41]:
# above we see the list that's in the housing data
# remember for below, lists begin at 0
# ----
# we could do this below, but it's not flexible. instead of hardcoding we could
# assign the variables to the columns
# room_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6
rooms_ix, bedrooms_ix, population_ix, household_ix = [
    # indexing housing.columns with col
    list(housing.columns).index(col)
    # for col function in this columns
    for col in ('total_rooms', 'total_bedrooms', 'population', 'households')
]

In [47]:
# custom transformer class
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    # this is creating a hyperparameter,
    # used to allow us to determine if its helping the ML algorithm or not
    # Hyperparameters are important because they directly control the 
    # behaviour of the training algorithm
    # ---
    # it's used to gate any data you are not sure about
    # ---
    # The process of finding most optimal hyperparameters in machine learning
    # is called hyperparameter optimisation
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            # numpy.c_ = <numpy.lib.index_tricks.CClass object>
            # Translates slice objects to concatenation along the second axis.
            return np.c_[X, rooms_per_household, population_per_household,
                        bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

# attr_adder equals the class we made above
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
# adding housing values transformed to attr_adder
housing_extra_attribs = attr_adder.transform(housing.values)

In [56]:
# creating a pipeline
num_pipeline = Pipeline([
    # 'median' chooses how to replace missing values along each column
    ('imputer', SimpleImputer(strategy='median')),
    
    ('addtribs_adder', CombinedAttributesAdder()),
    ('std_scalar', StandardScaler())
])

In [61]:
housing_num = housing.drop('ocean_proximity', axis=1)
imputer = SimpleImputer(strategy="median")
imputer.fit(housing_num)

In [63]:
# all but the last estimator must be transformers
# reason why it's fit_transform()
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [66]:
# using column transformer, which handles categorical and numerical columns
num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])

housing_prepared = full_pipeline.fit_transform(housing)