In [2]:
import import_ipynb
from GettingData import load_housing_data, stratified_split_train_test

In [3]:
housing = load_housing_data()
housing, _ = stratified_split_train_test(housing)
# Let's add some attributes
housing['rooms_per_household'] = housing['total_rooms']/housing['households']
housing['bedrooms_per_room'] = housing['total_bedrooms']/housing['total_rooms']
housing['population_per_household'] = housing['population']/housing['households']

In [5]:
# Removing missing features
option = -1
# Option 1: dropna(), removes districts that have designated missing attribute
if option==1:
    housing.dropna(subset=['total_bedrooms'])
# Option 2: drop(), dropts the whole attribute all together
if option==2:
    housing.drop('total_bedrooms', axis=1)
# Option 3: fillna(), fills any missing features in attribute with designated value, in this case, the median total_bedrooms
if option==3:
    median = housing['total_bedrooms'].median()
    housing['total_bedrooms'].fillna(median, inplace=True)

In [8]:
# Using Imputer to fill missing values
from sklearn.preprocessing import Imputer

imputer = Imputer(strategy='median')

# Median can only be calculated on numerical data, so lets make a copy of data without any text attributes
housing_numerical = housing.drop('ocean_proximity', axis=1)
imputer.fit(housing_numerical) # Just finds the median of all attributes

imputer.statistics_

array([-1.18510000e+02,  3.42600000e+01,  2.90000000e+01,  2.11950000e+03,
        4.33000000e+02,  1.16400000e+03,  4.08000000e+02,  3.54090000e+00,
        1.79500000e+05,  3.00000000e+00,  5.23228423e+00,  2.03031374e-01,
        2.81765270e+00])

In [14]:
import pandas as pd

x = imputer.transform(housing_numerical)

# x is a numpy array, so let's convert back to pd.DataFrame
housing_tr = pd.DataFrame(x, columns=housing_numerical.columns)

In [19]:
# Handling text and categorical attributes
housing_cat = housing['ocean_proximity']
housing_cat.head(10)

17606     <1H OCEAN
18632     <1H OCEAN
14650    NEAR OCEAN
3230         INLAND
3555      <1H OCEAN
19480        INLAND
8879      <1H OCEAN
13685        INLAND
4937      <1H OCEAN
4861      <1H OCEAN
Name: ocean_proximity, dtype: object

In [21]:
# Let's convert to numerical with pd.DataFrame.factorize()
housing_cat_encoded, housing_categories = housing_cat.factorize()
housing_cat_encoded[:10]
housing_categories

Index(['<1H OCEAN', 'NEAR OCEAN', 'INLAND', 'NEAR BAY', 'ISLAND'], dtype='object')

In [23]:
# Convert to one-hot vectors
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1)) 
# We use reshape because fit_transform() wants a 2D array, and we only have 1D

In [24]:
# Alternatively we can go from text -> one-hot instead of text -> integer -> one-hot by using CategoricalEncoder
from sklearn.preprocessing import CategoricalEncoder
cat_encoder = CategoricalEncoder()
housing_cat_reshaped = housing_cat.values.reshape(-1,1)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshaped)

ImportError: cannot import name 'CategoricalEncoder'