In [47]:
from __future__ import print_function
import math
from IPython import display
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.python.data import Dataset

In [4]:
df = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv",sep=',')

In [6]:
# shuffle
df = df.reindex(np.random.permutation(df.index))
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
9031,-118.96,34.23,14.0,15207.0,2924.0,6301.0,2829.0,3.9699,217000.0
12136,-121.44,38.5,27.0,2527.0,439.0,1089.0,415.0,4.088,96800.0
2189,-117.38,34.08,11.0,5684.0,1139.0,3095.0,1036.0,3.6875,112600.0
14889,-122.23,40.32,10.0,2336.0,426.0,1003.0,368.0,3.0833,81300.0
2034,-117.32,34.02,17.0,1779.0,292.0,1006.0,293.0,4.6708,123100.0


In [7]:
def preprocess_features(df):
    # input: pandas.DataFrame
    # output: DataFrame contains features to be used for the model, including synthetic features
    selected_features = df[
        [
            'latitude',
            'longitude',
            'housing_median_age',
            'total_rooms',
            'total_bedrooms',
            'population',
            'households',
            'median_income'
        ]
    ]
    processed_features = selected_features.copy()
    # Create a synthetic feature
    processed_features['rooms_per_person'] = processed_features['total_rooms']/processed_features['population']
    return processed_features

In [44]:
def preprocess_targets(df):
    # input: DataFrame
    # output: DataFrame contains the target feature
    output_targets = pd.DataFrame()
    # scale the target to be in units of thousands of dollars
    output_targets['median_house_value'] = df['median_house_value']/1000
    return output_targets

In [45]:
# split dataset
training_examples = preprocess_features(df.head(12000))
training_targets = preprocess_targets(df.head(12000))

validation_examples = preprocess_features(df.head(5000))
validation_targets = preprocess_targets(df.head(5000))

print('Traning examples summary:')
display.display(training_examples.describe())
print('Validation examples summary:')
display.display(validation_examples.describe())

Traning examples summary:


Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_person
count,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0
mean,35.616659,-119.553757,28.568083,2653.093917,541.549917,1434.033917,503.283167,3.866351,1.969137
std,2.138949,2.008624,12.612855,2219.611931,427.108088,1120.280905,388.963263,1.902078,1.027615
min,32.54,-124.35,1.0,2.0,1.0,3.0,1.0,0.4999,0.018065
25%,33.93,-121.7925,18.0,1462.0,297.0,792.0,282.0,2.5625,1.516524
50%,34.24,-118.47,29.0,2127.0,434.0,1166.0,410.0,3.5234,1.940994
75%,37.71,-118.0,37.0,3150.0,651.0,1728.0,607.0,4.74655,2.296943
max,41.95,-114.47,52.0,37937.0,5471.0,16122.0,5189.0,15.0001,52.033333


Validation examples summary:


Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_person
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,35.6033,-119.531652,28.6984,2617.6684,536.1922,1421.9592,497.4612,3.823746,1.964006
std,2.134405,2.014432,12.512328,2154.498005,423.945049,1080.904343,381.039872,1.854667,1.11063
min,32.54,-124.23,2.0,2.0,1.0,3.0,1.0,0.4999,0.018065
25%,33.92,-121.77,18.0,1458.75,296.0,792.0,282.0,2.55665,1.490332
50%,34.22,-118.46,29.0,2112.5,430.0,1164.0,407.0,3.51905,1.933702
75%,37.71,-117.98,37.0,3134.25,642.0,1717.25,598.0,4.6902,2.296752
max,41.95,-114.47,52.0,26322.0,4457.0,10329.0,4204.0,15.0001,52.033333


In [46]:
# find strongly correlated features
cor = training_examples.copy()
cor['target'] = training_targets['median_house_value']
cor.corr()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_person,target
latitude,1.0,-0.924868,0.010342,-0.033795,-0.066408,-0.110046,-0.07009,-0.067216,0.150086,-0.134158
longitude,-0.924868,1.0,-0.108649,0.044104,0.070333,0.102931,0.057366,-0.029839,-0.087623,-0.057176
housing_median_age,0.010342,-0.108649,1.0,-0.359935,-0.31975,-0.303728,-0.301876,-0.115298,-0.114136,0.109566
total_rooms,-0.033795,0.044104,-0.359935,1.0,0.92931,0.872926,0.91958,0.196973,0.145506,0.131347
total_bedrooms,-0.066408,0.070333,-0.31975,0.92931,1.0,0.891162,0.980915,-0.007151,0.064126,0.050352
population,-0.110046,0.102931,-0.303728,0.872926,0.891162,1.0,0.919922,0.00467,-0.148304,-0.027504
households,-0.07009,0.057366,-0.301876,0.91958,0.980915,0.919922,1.0,0.011847,-0.022458,0.063316
median_income,-0.067216,-0.029839,-0.115298,0.196973,-0.007151,0.00467,0.011847,1.0,0.26816,0.687152
rooms_per_person,0.150086,-0.087623,-0.114136,0.145506,0.064126,-0.148304,-0.022458,0.26816,1.0,0.24995
target,-0.134158,-0.057176,0.109566,0.131347,0.050352,-0.027504,0.063316,0.687152,0.24995,1.0


In [48]:
def construct_feature_columns(input_features):
    # input: numerical input features to use
    # output: a set of feature columns to use
    return set([tf.feature_column.numeric_column(feature) for feature in input_features])

In [49]:
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """ input:
            features : DataFrame of features,
            targets: DataFrame of targets
            batch_size: size of batches to be passed to the model
            shuffle: weather to shuffle the data
            num_epochs: num of epochs which the data should be repeated
        output:
            tuple of (features, labels) for next data batch 
    """
    # convert pandas data into a dict of numpy arrays
    features = {key:np.array(value) for key,value in dict(features).items()}
    
    # construct a dataset, and configure batching and repeating
    ds = Dataset.from_tensor_slices((features, targets))
    ds = ds.batch(batch_size).repeat(num_epochs)
    
    # shuffle the data if specified
    if shuffle:
        ds = ds.shuffle(10000)
    
    # return the next batch of the data
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, lebels

In [50]:
def train_model():
    """
        training a linear regression model
        
        input: 
            learning_rate: controls how much we adjust the weights with respect of loss gradient 
            steps: num of training steps
            batch_size,
            traning_examples,
            traning_targets,
            validation_examples,
            validation_targets
        
        output:
            a 'LinearRegressor' object trained on the training data
    """
    periods = 10
    steps_per_period = steps / periods
    
    # create a linear regressor object
    my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer,5.0)
    linear_regressor = tf.estimator.LinearRegressor(
    feature_columns = construct_feature_columns(training_examples),
    optimizer = my_optimizer
    )
    
    # create input function
    training_input_fn = lambda: my_input_fn(training_examples,
                                            training_targets['median_house_value'],
                                            batch_size = batch_size)
    predict_training_input_fn = lambda : my_input_fn(training_examples,
                                                    training_targets['median_house_value'],
                                                    num_epochs = 1,
                                                    shuffle = True)
    predict_validation_input_fn = lambda : my_input_fn(validation_examples,
                                                      validation_targets['median_house_value'],
                                                      num_epochs = 1,
                                                      shuffle = False)
    # train the model
    print('training models ...')
    print('RMSE (on traning data):')
    traning_rmse, validation_rmse = [], []
    
    for period in range(0, periods):
        # train the model, start from the prior state
        linear_regressor.train(
            input_fn = training_input_fn,
            steps = steps_per_period,
        )
        
        # take a break and compute predictions
        training_predictions = linear_regressor.predict(input_fn=predict_training_input_fn)
        training_predictions = np.array([item['predictions'][0] for item in training_predictions])
        
        