In [5]:
import math
from matplotlib import pyplot as plt
from matplotlib import cm
from matplotlib import gridspec
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

In [6]:
# Only surface TensorFlow log records at ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)

# Keep rendered DataFrames compact: at most 10 rows, floats with one decimal.
pd.set_option('display.max_rows', 10)
pd.set_option('display.float_format', '{:.1f}'.format)

In [13]:
# Remote CSV holding the California housing data used throughout the notebook.
CALIFORNIA_HOUSING_CSV_URL = (
    'https://storage.googleapis.com/mledu-datasets/'
    'california_housing_train.csv'
)
california_housing_dataframe = pd.read_csv(CALIFORNIA_HOUSING_CSV_URL, sep=',')

In [14]:
# Put the rows in a random order. The permutation comes from a dedicated,
# seeded generator so that a fresh Restart & Run All yields the same row order
# every time, without touching NumPy's global random state.
# NOTE: re-running only this cell permutes the already-shuffled index again,
# so the order is reproducible per full run, not per cell execution.
SHUFFLE_SEED = 42
california_housing_dataframe = california_housing_dataframe.reindex(
    np.random.RandomState(SHUFFLE_SEED).permutation(california_housing_dataframe.index)
)

In [16]:
# Rescale the target column by 1/1000 (presumably dollars -> thousands of
# dollars; confirm against the dataset description).
# NOTE: the column is overwritten, so executing this cell a second time
# divides the values by 1000 again.
scaled_house_value = california_housing_dataframe['median_house_value'] / 1000.0
california_housing_dataframe['median_house_value'] = scaled_house_value

# Last expression: render the frame (truncated by the display options).
california_housing_dataframe

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
7242,-118.3,34.2,36.0,1834.0,316.0,864.0,309.0,4.8,302.2
12104,-121.4,37.7,52.0,994.0,258.0,623.0,264.0,1.7,111.5
6501,-118.3,34.0,44.0,1722.0,457.0,2177.0,401.0,2.1,92.5
10829,-120.8,37.7,16.0,1343.0,241.0,732.0,195.0,3.6,187.5
14474,-122.1,38.1,31.0,3401.0,616.0,1750.0,602.0,4.7,143.1
...,...,...,...,...,...,...,...,...,...
8581,-118.5,34.4,5.0,4222.0,712.0,2024.0,646.0,5.9,500.0
1135,-117.1,32.8,33.0,2279.0,591.0,1250.0,576.0,2.4,139.0
8819,-118.7,34.3,10.0,3753.0,678.0,1859.0,660.0,5.0,204.6
5839,-118.2,34.0,40.0,1082.0,318.0,1085.0,273.0,1.7,117.2


In [17]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the
# frame, rendered as the cell output.
california_housing_dataframe.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.6,35.6,28.6,2643.7,539.4,1429.6,501.2,3.9,207.3
std,2.0,2.1,12.6,2179.9,421.5,1147.9,384.5,1.9,116.0
min,-124.3,32.5,1.0,2.0,1.0,3.0,1.0,0.5,15.0
25%,-121.8,33.9,18.0,1462.0,297.0,790.0,282.0,2.6,119.4
50%,-118.5,34.2,29.0,2127.0,434.0,1167.0,409.0,3.5,180.4
75%,-118.0,37.7,37.0,3151.2,648.2,1721.0,605.2,4.8,265.0
max,-114.3,42.0,52.0,37937.0,6445.0,35682.0,6082.0,15.0,500.0


In [19]:
# Label: the (already rescaled) median house value column.
target = california_housing_dataframe['median_house_value']

# Single input feature, kept as a one-column DataFrame (double brackets) and
# declared to the estimator as a numeric column of the same name.
first_feature = california_housing_dataframe[['total_rooms']]
feature_columns = [tf.feature_column.numeric_column('total_rooms')]

# Plain gradient descent with a very small step, wrapped so that gradients are
# clipped by norm (clip value 5.0) before being applied.
optimizer = tf.contrib.estimator.clip_gradients_by_norm(
    tf.train.GradientDescentOptimizer(learning_rate=1e-7),
    5.0,
)

linear_regressor = tf.estimator.LinearRegressor(
    optimizer=optimizer,
    feature_columns=feature_columns,
)

In [20]:
def input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Build the (features, labels) tensors that feed the estimator.

    Args:
        features: Mapping-like of column name -> values; each column is
            converted to a NumPy array. The notebook passes a pandas
            DataFrame.
        targets: Label values aligned with ``features``. The notebook passes
            a pandas Series.
        batch_size: Number of examples per batch.
        shuffle: Whether to shuffle the examples.
        num_epochs: Forwarded to ``Dataset.repeat``; per the tf.data
            documentation ``None`` repeats the data indefinitely.

    Returns:
        Tuple ``(features, labels)`` taken from ``get_next()`` of a one-shot
        iterator over the dataset.
    """
    feature_arrays = {name: np.array(column) for name, column in dict(features).items()}

    ds = Dataset.from_tensor_slices((feature_arrays, targets))

    # Shuffle individual examples BEFORE batching and repeating. Shuffling
    # after batch()/repeat() would reorder whole batches instead of examples
    # and would let elements from different epochs mix in the buffer.
    if shuffle:
        ds = ds.shuffle(buffer_size=10000)

    ds = ds.batch(batch_size).repeat(num_epochs)

    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels

In [21]:
def _training_input_fn():
    """Zero-argument input function: default batching, shuffled, repeated."""
    return input_fn(first_feature, target)


# Run 100 training steps; the returned estimator is discarded.
_ = linear_regressor.train(input_fn=_training_input_fn, steps=100)

In [24]:
# Predict over the same rows used for training, in frame order and for a
# single pass (shuffle=False, num_epochs=1), so that the i-th prediction
# corresponds to the i-th entry of `target`.
prediction_input_fn = lambda: input_fn(first_feature, target, num_epochs=1, shuffle=False)

# `predict` yields one dict per example; keep the first value under the
# 'predictions' key of each and collect them into a NumPy array.
prediction_dicts = linear_regressor.predict(input_fn=prediction_input_fn)
predictions = np.array([item['predictions'][0] for item in prediction_dicts])

# scikit-learn's documented argument order is (y_true, y_pred).
mean_squared_error = metrics.mean_squared_error(target, predictions)
root_mean_squared_error = math.sqrt(mean_squared_error)
print("Error cuadratico medio : %0.3f" % mean_squared_error)
print("Raiz error cuadratico medio: %0.3f" % root_mean_squared_error)

Error cuadratico medio : 56367.025
Raiz error cuadratico medio: 237.417


In [25]:
# Put the RMSE in context by printing it next to the spread of the target.
house_values = california_housing_dataframe['median_house_value']
min_house_value, max_house_value = house_values.min(), house_values.max()
min_max_difference = max_house_value - min_house_value

# (template, value) pairs, printed in order with %-formatting.
for report_template, report_value in (
    ("Min Median House value: %0.3f", min_house_value),
    ("Max Median House value: %0.3f", max_house_value),
    ("Diferencia entre min y max: %0.3f", min_max_difference),
    ("Raiz error cuadratico medio: %0.3f", root_mean_squared_error),
):
    print(report_template % report_value)

Min Median House value: 14.999
Max Median House value: 500.001
Diferencia entre min y max: 485.002
Raiz error cuadratico medio: 237.417
