In [5]:
# From here on, operate on train_set only.
# Later, we will automate all processing so it can be repeated on test_set.
# This page represents exploration of the ideas.

import os

import pandas as pd
from sklearn.model_selection import train_test_split

# Location of the housing CSV. Set the HOUSING_CSV environment variable to run
# this notebook on another machine; without it, the original local path is used.
datapath = os.environ.get(
    "HOUSING_CSV",
    "/Users/jasonmiller/Source/MachineLearning/datasets/housing/housing.csv",
)
all_data = pd.read_csv(datapath)

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
train_set, test_set = train_test_split(all_data, test_size=0.2, random_state=42)

# Split the training rows into predictors and the label column.
train_predictors = train_set.drop(columns=["median_house_value"])
train_labels = train_set["median_house_value"].copy()
train_set.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,103000.0,NEAR OCEAN
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,382100.0,NEAR OCEAN
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,172600.0,NEAR OCEAN
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,93400.0,NEAR OCEAN
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,96500.0,INLAND


In [8]:
# Return to issue of missing data.
# Recall there were 207 null bedroom counts 
# By chance, all of them ended up in test_set, but we will process train_set anyway.

# Generalize the fix to all columns since real data could have NaN anywhere.
# Choice 1 = remove whole rows i.e. data points: df.dropna(subset=["total_bedrooms"])
# Choice 2 = remove whole columns i.e. features: df.drop("total_bedrooms",axis=1)
# Choice 3 = change NaN to 3000 i.e. df["total_bedrooms"].fillna(3000,inplace=True)
# Choice 4 = use an imputer. Hits every column for us. 
#           Unfortunately, requires us to put aside non-numeric columns.
#           Using median impute will perhaps have the least effect on outcome.
from sklearn.impute import SimpleImputer
def cleanse_NaN(df):
    """Return a copy of ``df`` with NaNs in numeric columns replaced by medians.

    Each numeric column is imputed with its own median, learned from ``df``
    itself. Non-numeric columns (``ocean_proximity`` in the housing data) cannot
    go through the imputer, so they are set aside and re-attached unchanged
    after the numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame sharing ``df``'s index: the imputed numeric columns first,
        followed by the non-numeric columns.

    Notes
    -----
    NOTE(review): a numeric column that is entirely NaN has no median; confirm
    how the installed SimpleImputer version treats such a column.
    """
    numeric_df = df.select_dtypes(include="number")
    other_columns = [name for name in df.columns if name not in numeric_df.columns]

    # Nothing to impute (no numeric columns, or no rows): skip the imputer and
    # return a copy laid out in the same column order as the normal path.
    if numeric_df.empty:
        return df[list(numeric_df.columns) + other_columns].copy()

    imputer = SimpleImputer(strategy="median")
    filled = imputer.fit_transform(numeric_df)  # NaN -> column median, every numeric column
    transformed_df = pd.DataFrame(filled, columns=numeric_df.columns,
                                  index=numeric_df.index)
    # Re-attach the columns that were set aside; assignment aligns on the shared index.
    for name in other_columns:
        transformed_df[name] = df[name]
    return transformed_df
# Rebind train_set to the imputed frame; cleanse_NaN builds a new frame and
# leaves the one passed in untouched.
train_set = cleanse_NaN(train_set)
# Preview the first rows of the cleaned frame.
train_set.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,103000.0,NEAR OCEAN
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,382100.0,NEAR OCEAN
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,172600.0,NEAR OCEAN
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,93400.0,NEAR OCEAN
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,96500.0,INLAND


In [22]:
# Return to issue of categorical data.
# Seek to replace the quirky text in ocean_proximity.
# Start by listing the distinct labels that occur in the column.
train_set["ocean_proximity"].unique()

array(['NEAR OCEAN', 'INLAND', '<1H OCEAN', 'NEAR BAY', 'ISLAND'],
      dtype=object)

In [13]:
# Single brackets around a column name extract that column as a pandas Series.
type(train_set["ocean_proximity"])

pandas.core.series.Series

In [23]:
# In pandas, single brackets extract a Series; double brackets extract a DataFrame.
type(train_set[["ocean_proximity"]])

pandas.core.frame.DataFrame

In [31]:
# First try: Convert text to ordinal.
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

# Double brackets keep the column as a one-column DataFrame for the encoder.
proximity = train_set[["ocean_proximity"]]

# Map each text label to a numeric code.
encoder = OrdinalEncoder()
numpy_array = encoder.fit_transform(proximity)

# Drawback: the codes imply an ordering, so a model may treat categories
# 0 and 1 as more alike than 0 and 4, which is not what we want.
np.unique(numpy_array)

array([0., 1., 2., 3., 4.])

In [35]:
# Second try: One Hot Encoding.
from sklearn.preprocessing import OneHotEncoder
# Reuses `proximity`, the one-column DataFrame built for the ordinal attempt.
encoder = OneHotEncoder()
scipi_sparse_matrix = encoder.fit_transform(proximity)
# Convert to a dense array so the 0/1 indicator columns can be displayed.
scipi_sparse_matrix.toarray()
# This is good: each category gets its own column, so no ordering is implied.
# Luckily, sklearn can deal with this data type.

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

In [44]:
# Return to issue of feature scaling.
# Most ML won't work with different ranges and different skew.

# First try: min-max scaling (i.e. normalization). 
# All values mapped to range (0,1): new=(old-min)/(max-min).
# Requires all columns are numeric.
from sklearn.preprocessing import MinMaxScaler

# The scaler needs numeric input, so set the text column aside first.
numeric_only_df = train_set.drop(columns=["ocean_proximity"])

# Learn each column's min and max and rescale in a single call.
scaler = MinMaxScaler()
numpy_array = scaler.fit_transform(numeric_only_df)
numpy_array

array([[0.72908367, 0.01702128, 0.62745098, ..., 0.10228581, 0.19032151,
        0.18144461],
       [0.61653386, 0.12978723, 0.94117647, ..., 0.12415721, 0.22845202,
        0.75690616],
       [0.38545817, 0.22446809, 0.05882353, ..., 0.05508962, 0.25216204,
        0.32494918],
       ...,
       [0.59462151, 0.15744681, 0.68627451, ..., 0.08649893, 0.16789424,
        0.42701061],
       [0.23804781, 0.53510638, 0.2745098 , ..., 0.09176122, 0.35994676,
        0.55360803],
       [0.19223108, 0.55531915, 1.        , ..., 0.20407828, 0.14314285,
        0.63917468]])

In [45]:
# Second try: standardization (i.e. z-scores with unit variance)
from sklearn.preprocessing import StandardScaler

# Work on the numeric columns only; the text column is set aside.
numeric_only_df = train_set.drop(columns=["ocean_proximity"])

# Learn each column's mean and spread and convert to z-scores in a single call.
scaler = StandardScaler()
numpy_array = scaler.fit_transform(numeric_only_df)
numpy_array

array([[ 1.27258656, -1.3728112 ,  0.34849025, ...,  0.32290591,
        -0.326196  , -0.90118909],
       [ 0.70916212, -0.87669601,  1.61811813, ...,  0.6720272 ,
        -0.03584338,  1.5127714 ],
       [-0.44760309, -0.46014647, -1.95271028, ..., -0.43046109,
         0.14470145, -0.29921255],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ...,  0.07090859,
        -0.49697313,  0.12891731],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.15490769,
         0.96545045,  0.65997132],
       [-1.41489815,  0.99543676,  1.85617335, ...,  1.94776365,
        -0.68544764,  1.01890847]])