In [31]:
import import_ipynb
from GettingData import load_housing_data, stratified_split_train_test

In [32]:
housing = load_housing_data()
housing, _ = stratified_split_train_test(housing)
housing_labels = housing.drop('median_house_value', axis=1)
# Let's add some attributes
housing['rooms_per_household'] = housing['total_rooms']/housing['households']
housing['bedrooms_per_room'] = housing['total_bedrooms']/housing['total_rooms']
housing['population_per_household'] = housing['population']/housing['households']

In [33]:
# Removing missing features
option = -1
# Option 1: dropna(), removes districts that have designated missing attribute
if option==1:
    housing.dropna(subset=['total_bedrooms'])
# Option 2: drop(), dropts the whole attribute all together
if option==2:
    housing.drop('total_bedrooms', axis=1)
# Option 3: fillna(), fills any missing features in attribute with designated value, in this case, the median total_bedrooms
if option==3:
    median = housing['total_bedrooms'].median()
    housing['total_bedrooms'].fillna(median, inplace=True)

In [73]:
# Using Imputer to fill missing values
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')

# Median can only be calculated on numerical data, so lets make a copy of data without any text attributes
housing_numerical = housing.drop('ocean_proximity', axis=1)
#housing_numerical.drop('income_cat', axis=1)
imputer.fit(housing_numerical) # Just finds the median of all attributes

imputer.statistics_

array([-1.18510000e+02,  3.42600000e+01,  2.90000000e+01,  2.11950000e+03,
        4.33000000e+02,  1.16400000e+03,  4.08000000e+02,  3.54090000e+00,
        1.79500000e+05,  3.00000000e+00,  5.23228423e+00,  2.03031374e-01,
        2.81765270e+00])

In [74]:
import pandas as pd

x = imputer.transform(housing_numerical)

# x is a numpy array, so let's convert back to pd.DataFrame
housing_tr = pd.DataFrame(x, columns=housing_numerical.columns)

In [75]:
# Handling text and categorical attributes
housing_cat = housing['ocean_proximity']
housing_cat.head(10)

17606     <1H OCEAN
18632     <1H OCEAN
14650    NEAR OCEAN
3230         INLAND
3555      <1H OCEAN
19480        INLAND
8879      <1H OCEAN
13685        INLAND
4937      <1H OCEAN
4861      <1H OCEAN
Name: ocean_proximity, dtype: object

In [76]:
# Let's convert to numerical with pd.DataFrame.factorize()
housing_cat_encoded, housing_categories = housing_cat.factorize()
housing_cat_encoded[:10]
housing_categories

Index(['<1H OCEAN', 'NEAR OCEAN', 'INLAND', 'NEAR BAY', 'ISLAND'], dtype='object')

In [77]:
# Convert to one-hot vectors
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1)) 
# We use reshape because fit_transform() wants a 2D array, and we only have 1D

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [78]:
# Creating our own custom transformer to combine the attributes as we did earlier automatically
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [79]:
# Let's make a Pipleline to speed up the data traformation process
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('attrib_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler())
])

housing_num_tr = numerical_pipeline.fit_transform(housing_tr)

In [80]:
# "There is nothing in Scikit-Learn to handle Pandas DataFrames,
# but we can write a custom transformer for this task"

from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

# "Our DataFrameSelector will transform the data by selecting the desired attributes, dropping the rest, 
# "and converting the resulting DataFrame to a NumPy array" This way, we can write a pipeline that only
# takes the numerical attributes and does what it needs, and a pipeline that only takes the categorical
# attributes and does what it respectively needs.

In [81]:
# CategoricalEncoder isn't working so let's make our own transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

class CatEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, sparse=True):
        self.sparse = sparse
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        print(X.shape)
        X, categories = pd.factorize(X.tolist())
        self.categories = categories
        return OneHotEncoder(sparse=self.sparse).fit_transform(X.reshape(-1,1))
    

In [90]:
num_attribs = list(housing_numerical)
cat_attribs = ['ocean_proximity']

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', OneHotEncoder(categories='auto'))
])
num_attribs

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value',
 'income_cat',
 'rooms_per_household',
 'bedrooms_per_room',
 'population_per_household']

In [91]:
# Now let's merge these two pipelines into one with FeatureUnion
from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline)
])

In [92]:
# Now, finally
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared.shape

(16512, 21)

In [93]:
# Linear Regression model

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

ValueError: Input contains NaN