<a href="https://colab.research.google.com/github/Tristan07999/TensorFlow/blob/master/train_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from __future__ import print_function

import math
from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

# Only surface TensorFlow messages at ERROR level to keep cell output readable.
tf.logging.set_verbosity(tf.logging.ERROR)

# Keep rendered DataFrames compact: at most 10 rows, one decimal place for floats.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", "{:.1f}".format)

# Load the California housing training set straight from the hosted CSV.
california_housing_dataframe = pd.read_csv(
    "https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv",
    sep=",",
)



In [3]:
# Put the rows in a random order so that any ordering present in the CSV
# cannot influence training.
shuffled_index = np.random.permutation(california_housing_dataframe.index)
california_housing_dataframe = california_housing_dataframe.reindex(shuffled_index)

# Divide the target by 1000 so it is expressed in thousands.
# NOTE: this rescales the column of the existing frame, so re-running this cell
# without re-running the loading cell divides the values by 1000 again.
california_housing_dataframe["median_house_value"] = (
    california_housing_dataframe["median_house_value"] / 1000.0
)
california_housing_dataframe

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
8172,-118.4,34.0,16.0,14891.0,3984.0,6270.0,3595.0,5.1,283.2
2577,-117.7,33.6,15.0,3485.0,519.0,1740.0,485.0,6.8,251.9
1688,-117.2,32.8,18.0,2539.0,616.0,964.0,526.0,3.4,275.0
11304,-121.2,38.8,20.0,2104.0,370.0,745.0,314.0,4.2,217.5
2722,-117.7,33.8,5.0,3178.0,631.0,1467.0,581.0,5.3,237.1
...,...,...,...,...,...,...,...,...,...
12859,-121.8,37.4,24.0,2298.0,575.0,2409.0,569.0,3.5,182.4
15509,-122.3,37.6,26.0,2339.0,704.0,1283.0,654.0,3.2,415.0
10488,-120.4,37.3,28.0,1401.0,292.0,967.0,257.0,1.6,89.4
10177,-119.8,36.8,14.0,1876.0,324.0,1031.0,311.0,3.7,88.8


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
california_housing_dataframe.describe()


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.6,35.6,28.6,2643.7,539.4,1429.6,501.2,3.9,207.3
std,2.0,2.1,12.6,2179.9,421.5,1147.9,384.5,1.9,116.0
min,-124.3,32.5,1.0,2.0,1.0,3.0,1.0,0.5,15.0
25%,-121.8,33.9,18.0,1462.0,297.0,790.0,282.0,2.6,119.4
50%,-118.5,34.2,29.0,2127.0,434.0,1167.0,409.0,3.5,180.4
75%,-118.0,37.7,37.0,3151.2,648.2,1721.0,605.2,4.8,265.0
max,-114.3,42.0,52.0,37937.0,6445.0,35682.0,6082.0,15.0,500.0


**Steps 1–2: Define the feature, configure the feature columns, and define the target**


In [0]:
# Target (label): the column the model learns to predict.
targets = california_housing_dataframe["median_house_value"]

# Input feature: total_rooms, selected with double brackets so the result is a
# one-column DataFrame rather than a Series.
my_feature = california_housing_dataframe[["total_rooms"]]

# Describe total_rooms to the estimator as a numeric feature column.
feature_columns = [tf.feature_column.numeric_column(key="total_rooms")]



**Step 3: Configure the LinearRegressor**

In [0]:
# Optimizer: plain gradient descent with a very small learning rate, wrapped so
# that gradients are clipped to a norm of 5.0 before they are applied.
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(
    tf.train.GradientDescentOptimizer(learning_rate=1e-7),
    5.0,
)

# Linear regression model built from our feature columns and the optimizer above.
linear_regressor = tf.estimator.LinearRegressor(
    feature_columns=feature_columns,
    optimizer=my_optimizer,
)

**Step 4: Define the input function**

In [0]:
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Build the (features, labels) tensors an estimator consumes.

    Args:
      features: pandas DataFrame of input features (anything `dict()` can turn
        into a column-name -> values mapping).
      targets: the labels, one per row of `features`.
      batch_size: number of examples per batch.
      shuffle: whether to shuffle the examples.
      num_epochs: how many times to repeat the data; passed straight to
        `Dataset.repeat`, where None means repeat indefinitely.

    Returns:
      A `(features, labels)` tuple for the next batch of data.
    """
    # Convert pandas data into a dict of np arrays.
    features = {key: np.array(value) for key, value in dict(features).items()}

    # Construct a dataset of individual (features, target) examples.
    ds = Dataset.from_tensor_slices((features, targets))  # warning: 2GB limit

    # Shuffle BEFORE batching and repeating: shuffling afterwards would only
    # reorder whole batches and would mix examples across epoch boundaries.
    if shuffle:
        ds = ds.shuffle(buffer_size=10000)

    # Configure batching/repeating.
    ds = ds.batch(batch_size).repeat(num_epochs)

    # Return the next batch of data.
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels

**Step 5: Train the model**

In [0]:
# Train for 100 steps. The lambda defers dataset construction until the estimator
# calls it; my_input_fn runs with its defaults (batch_size=1, shuffle=True,
# num_epochs=None).
_ = linear_regressor.train(
    input_fn=lambda: my_input_fn(my_feature, targets),
    steps=100,
)

**Step 6: Evaluate the Model**

In [11]:
# Create an input function for prediction: one unshuffled pass over the data, so
# predictions come back in the same row order as `targets`.
prediction_input_fn = lambda: my_input_fn(my_feature, targets, num_epochs=1, shuffle=False)

# Call predict() on the linear_regressor to make predictions.
predictions = linear_regressor.predict(input_fn=prediction_input_fn)

# Format predictions as a NumPy array, so we can calculate error metrics.
predictions = np.array([item['predictions'][0] for item in predictions])

# Print Mean Squared Error and Root Mean Squared Error.
# scikit-learn's signature is mean_squared_error(y_true, y_pred); pass the
# ground truth first so the call stays correct if the metric is ever swapped
# for one that is not symmetric.
mean_squared_error = metrics.mean_squared_error(targets, predictions)
root_mean_squared_error = math.sqrt(mean_squared_error)
print("Mean Squared Error (on training data): %0.3f" % mean_squared_error)
print("Root Mean Squared Error (on training data): %0.3f" % root_mean_squared_error)

Mean Squared Error (on training data): 56367.025
Root Mean Squared Error (on training data): 237.417


In [12]:
# Put the RMSE in context by comparing it with the spread (max - min) of the target.
house_values = california_housing_dataframe["median_house_value"]
min_house_value = house_values.min()
max_house_value = house_values.max()
min_max_difference = max_house_value - min_house_value

print("Min. Median House Value: %0.3f" % min_house_value)
print("Max. Median House Value: %0.3f" % max_house_value)
print("Difference between Min. and Max.: %0.3f" % min_max_difference)
print("Root Mean Squared Error: %0.3f" % root_mean_squared_error)

Min. Median House Value: 14.999
Max. Median House Value: 500.001
Difference between Min. and Max.: 485.002
Root Mean Squared Error: 237.417


In [14]:
# The RMSE is almost half of the target's range -- can we do better?
# Below are some basic strategies for reducing the model error.

# First, look at how well our predictions match our targets.

# `predictions` is a positional NumPy array, whereas `targets` is a Series that
# still carries the shuffled row labels of the DataFrame. Assigning that Series
# as a column would align it by label and pair each prediction with a different
# row's target, so combine the two positionally via the raw values instead.
calibration_data = pd.DataFrame({
    "predictions": predictions,
    "targets": targets.values,
})
calibration_data.describe()

Unnamed: 0,predictions,targets
count,17000.0,17000.0
mean,0.1,207.3
std,0.1,116.0
min,0.0,15.0
25%,0.1,119.4
50%,0.1,180.4
75%,0.2,265.0
max,1.9,500.0
