# Boston Housing Prices: Project 1 

        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
        - LSTAT    % lower status of the population
        - MEDV     Median value of owner-occupied homes in $1000's

In [None]:
%matplotlib inline
"""Load the Boston dataset and examine its target (label) distribution."""
# Load libraries
import os
import numpy as np
import pylab as pl
import matplotlib.pyplot as pl

from sklearn import datasets
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LinearRegression

import seaborn as sns
sns.set(style="ticks", color_codes=True)
sns.set_context('notebook')
import pandas as pd

def load_data():
    """Load the Boston housing dataset bundled with scikit-learn.

    Returns:
        The object returned by ``datasets.load_boston()``. The rest of this
        file reads its ``data``, ``target`` and ``feature_names`` attributes.
    """
    return datasets.load_boston()


# Module-level copy of the dataset, used by the plotting helpers and the
# notebook cells further down. It is assigned *after* the definition so that a
# fresh top-to-bottom run does not raise NameError on load_data.
boston = load_data()

def histogram(city_data):
    """Show a 20-bin histogram of the housing prices (the dataset target).

    Args:
        city_data: dataset object exposing the prices as ``target``.
    """
    housing_prices = city_data.target
    pl.hist(housing_prices, bins=20, color='green')
    pl.suptitle('Boston Housing Prices', fontsize=24)
    # The target is in units of $1000 (explore_city_data multiplies it by
    # 1000 to print dollars), so label the axis accordingly.
    pl.xlabel('Housing Prices [$1000s]', fontsize=16)
    pl.ylabel('Frequency', fontsize=16)
    pl.show()
    
def dataframe(city_data):
    """Combine features and target into one DataFrame, print it and return it.

    Args:
        city_data: dataset object exposing ``data`` (2-D feature array),
            ``target`` (1-D price array) and ``feature_names``.

    Returns:
        A DataFrame with one column per feature followed by a ``MEDV`` column
        holding the target values.
    """
    # Column names come from the argument itself rather than from the
    # module-level ``boston`` object, so the function works for any dataset
    # passed in.
    df_data = pd.DataFrame(city_data.data, columns=city_data.feature_names)
    df_target = pd.DataFrame(city_data.target, columns=['MEDV'])
    df_boston = pd.concat([df_data, df_target], axis=1)
    print(df_boston)
    return df_boston
    
def scatter_plots(city_data, image_dir=None):
    """Scatter every feature against MEDV on a 4x4 grid and save the figure.

    Args:
        city_data: dataset object exposing ``data``, ``target`` and
            ``feature_names``.
        image_dir: directory the image is written to. Defaults to the
            original author's project image folder.
    """
    if image_dir is None:
        image_dir = '/Users/tracesmith/Desktop/Udacity/class/machine_learning/Project_1/images'

    # pl.subplots creates its own figure, so no separate pl.figure() call is
    # needed (it would only leave an empty extra figure behind).
    fig, axes = pl.subplots(4, 4, figsize=(14, 18))
    fig.subplots_adjust(wspace=.4, hspace=.4)

    # The 4x4 grid has room for at most 16 features; unused cells stay empty.
    for i, name in enumerate(city_data.feature_names):
        ax = axes[i // 4][i % 4]
        ax.scatter(city_data.data[:, i], city_data.target)
        ax.set_title(name + ' and MEDV')
        ax.set_xlabel(name)
        ax.set_ylabel('MEDV')

    fig.savefig(os.path.join(image_dir, "MEDV vs Features"))
    
def explore_city_data(city_data):
    """Print summary statistics for the Boston housing data.

    Prints the number of houses and features, then the minimum, maximum,
    mean, median and standard deviation of the prices in dollars.

    Args:
        city_data: dataset object exposing ``data`` (2-D feature array) and
            ``target`` (1-D price array).
    """
    housing_prices = city_data.target
    housing_features = city_data.data

    # Rows are houses, columns are features.
    print("Number of houses: {}".format(housing_features.shape[0]))
    print("Number of features: {}".format(housing_features.shape[1]))

    # Each statistic is scaled by 1000 before printing, as the original did,
    # so the output reads as a dollar amount.
    statistics = (
        ("Minimum Housing Price", np.min),
        ("Maximum Housing Price", np.max),
        ("Mean Housing Price", np.mean),
        ("Median Housing Price", np.median),
        ("Standard Deviation", np.std),
    )
    for label, stat in statistics:
        print("{}: ${:,.0f}".format(label, stat(housing_prices) * 1000))


def split_data(city_data):
    """Split the dataset into 70% training and 30% testing data.

    The split is delegated to ``train_test_split`` with ``random_state=None``.

    Args:
        city_data: dataset object exposing ``data`` and ``target``.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)``.
    """
    features = city_data.data
    labels = city_data.target

    split = train_test_split(features, labels, test_size=0.30, random_state=None)

    # train_test_split yields (X_train, X_test, y_train, y_test); callers of
    # this function expect the training pair first, then the testing pair.
    train_features, test_features, train_labels, test_labels = split
    return train_features, train_labels, test_features, test_labels


def performance_metric(label, prediction):
    """Return the mean squared error between true labels and predictions.

    Args:
        label: the true target values.
        prediction: the values predicted by the model.

    Returns:
        The value computed by ``mean_squared_error(label, prediction)``.
    """
    # Table of the scoring functions available in sklearn:
    # http://scikit-learn.org/stable/modules/classes.html#sklearn-metrics-metrics
    return mean_squared_error(label, prediction)

def learning_curve(depth, X_train, y_train, X_test, y_test):
    """Measure model error as the training set grows, then plot the result.

    A decision tree of the given depth is fitted on the first ``s`` training
    samples for 50 different values of ``s``. Training error is measured on
    those ``s`` samples and test error on the full test set.

    Args:
        depth: ``max_depth`` passed to ``DecisionTreeRegressor``.
        X_train, y_train: training features and labels.
        X_test, y_test: testing features and labels.
    """
    # 50 training-set sizes from 1 up to the full training set. They are cast
    # to int because they are used as slice bounds below, and float slice
    # bounds are rejected.
    sizes = np.round(np.linspace(1, len(X_train), 50)).astype(int)
    train_err = np.zeros(len(sizes))
    test_err = np.zeros(len(sizes))

    print("Decision Tree with Max Depth: ")
    print(depth)

    for i, s in enumerate(sizes):
        # Create and fit the decision tree regressor on the first s samples
        regressor = DecisionTreeRegressor(max_depth=depth)
        regressor.fit(X_train[:s], y_train[:s])

        # Find the performance on the training and testing set
        train_err[i] = performance_metric(y_train[:s], regressor.predict(X_train[:s]))
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))

    # Plot learning curve graph; the depth is passed explicitly so the output
    # file name does not depend on a global variable.
    learning_curve_graph(sizes, train_err, test_err, depth=depth)


def learning_curve_graph(sizes, train_err, test_err, depth=None, image_dir=None):
    """Plot training and test error against training size and save the figure.

    Args:
        sizes: training-set sizes (x axis).
        train_err: training error for each size.
        test_err: test error for each size.
        depth: tree depth used for the curve. When given, the file is named
            ``learning_curve_<depth>.png``, otherwise ``learning_curve.png``.
        image_dir: directory the image is written to. Defaults to the
            original author's project image folder.
    """
    if image_dir is None:
        image_dir = '/Users/tracesmith/Desktop/Udacity/class/machine_learning/Project_1/images'

    pl.figure()
    pl.title('Decision Trees: Performance vs Training Size', fontsize=20)
    pl.plot(sizes, test_err, lw=2, label='test error')
    pl.plot(sizes, train_err, lw=2, label='training error')
    pl.legend()
    pl.xlabel('Training Size', fontsize=14)
    pl.ylabel('Error', fontsize=14)

    if depth is None:
        filename = "learning_curve.png"
    else:
        filename = "learning_curve" + "_" + str(depth) + ".png"
    pl.savefig(os.path.join(image_dir, filename))

def model_complexity(X_train, y_train, X_test, y_test):
    """Measure training and test error as the tree depth grows, then plot it.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: testing features and labels.
    """
    print("Model Complexity: ")

    # Tree depths 1 through 24 (np.arange excludes the upper bound).
    max_depth = np.arange(1, 25)
    train_err = np.zeros(len(max_depth))
    test_err = np.zeros(len(max_depth))

    for i, d in enumerate(max_depth):
        # Fit a decision tree limited to depth d on the training data
        regressor = DecisionTreeRegressor(max_depth=d)
        regressor.fit(X_train, y_train)

        # Find the performance on the training set and on the testing set
        train_err[i] = performance_metric(y_train, regressor.predict(X_train))
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))

    # Plot the model complexity graph
    model_complexity_graph(max_depth, train_err, test_err)


def model_complexity_graph(max_depth, train_err, test_err, image_dir=None):
    """Plot training and test error against tree depth and save the figure.

    Args:
        max_depth: the tree depths evaluated (x axis).
        train_err: training error for each depth.
        test_err: test error for each depth.
        image_dir: directory the image is written to. Defaults to the
            original author's project image folder.
    """
    if image_dir is None:
        image_dir = '/Users/tracesmith/Desktop/Udacity/class/machine_learning/Project_1/images'

    pl.figure()
    pl.title('Decision Trees: Performance vs Max Depth', fontsize=20)
    pl.plot(max_depth, test_err, lw=2, label='test error')
    pl.plot(max_depth, train_err, lw=2, label='training error')
    pl.legend()
    pl.xlabel('Max Depth', fontsize=14)
    pl.ylabel('Error', fontsize=14)
    pl.savefig(os.path.join(image_dir, "Model Complexity"))

    
def fit_predict_model(city_data):
    """Tune a decision tree with grid search and predict one sample's price.

    Runs ``GridSearchCV`` over ``max_depth`` values 1 to 10 using a scorer
    built from ``mean_squared_error``, prints the fitted search object, the
    best parameters and estimator, and the prediction for a hard-coded
    13-feature sample.

    Args:
        city_data: dataset object exposing ``data`` and ``target``.
    """
    # Get the features and labels from the Boston housing data
    X, y = city_data.data, city_data.target

    regressor = DecisionTreeRegressor()
    parameters = {'max_depth': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}

    # greater_is_better=False marks the metric as a loss to be minimised.
    mse_scoring = make_scorer(mean_squared_error, greater_is_better=False)

    # Fit the search exactly once. The original fitted it a second time inside
    # the print call, which repeated the whole search just to display it.
    reg = GridSearchCV(regressor, parameters, scoring=mse_scoring)
    reg.fit(X, y)

    print("Final Model: ")
    print(reg)

    # Use the model to predict the output of a particular sample; predict()
    # expects a 2-D array, hence the reshape to a single row.
    x = np.array([11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24,
                  680.0, 20.20, 332.09, 12.13]).reshape(1, -1)
    prediction = reg.predict(x)

    # The double space reproduces the original Python 2 print output exactly.
    print("Best Parameters:  {}".format(reg.best_params_))
    print("Best Estimator: {}".format(reg.best_estimator_))

    print("House: " + str(x))
    print("Predicted: " + str(prediction))
    
def main():
    """Load the Boston housing data and split it for training and testing.

    The remaining analysis steps (data exploration, learning curves, model
    complexity, model tuning) are run from separate notebook cells rather
    than from here.

    Returns:
        A tuple ``(city_data, X_train, y_train, X_test, y_test)``.
    """
    # Load data
    city_data = load_data()

    # Training/Test dataset split
    X_train, y_train, X_test, y_test = split_data(city_data)

    return city_data, X_train, y_train, X_test, y_test


if __name__ == "__main__":
    # Bind the results at module scope: the notebook cells below refer to
    # city_data, X_train, y_train, X_test and y_test as global names.
    city_data, X_train, y_train, X_test, y_test = main()

In [None]:
# Fit a linear regression on all features and tabulate the coefficient
# estimated for each feature.
regression = LinearRegression()
# X and y are not defined at module scope; use the module-level dataset.
regression.fit(boston.data, boston.target)
# list(...) keeps this working where zip returns an iterator (Python 3).
pd.DataFrame(list(zip(boston.feature_names, regression.coef_)), columns = ['Features', 'Estimated Coef.'])


In [None]:
# DataFrame: print and return the combined feature/MEDV table.
# Expects `city_data` to already be defined in the notebook session.
dataframe(city_data)

In [709]:
# Statistics: number of houses and features, plus min, max, mean, median and
# standard deviation of the prices.
# Expects `city_data` to already be defined in the notebook session.
explore_city_data(city_data)

Number of houses: 506
Number of features: 13
Minimum Housing Price: $5,000
Maximum Housing Price: $50,000
Mean Housing Price: $22,533
Median Housing Price: $21,200
Standard Deviation: $9,188


In [None]:
# Histogram of the housing prices (the dataset target).
# Expects `city_data` to already be defined in the notebook session.
histogram(city_data)

In [None]:
# Scatter plots: each feature against MEDV; the figure is saved to disk.
# Expects `city_data` to already be defined in the notebook session.
scatter_plots(city_data)

In [None]:
# Learning Curve Graphs: one learning curve per tree depth from 1 to 10.
# Expects X_train, y_train, X_test and y_test to already be defined in the
# notebook session.
# NOTE: the loop variable `max_depth` stays defined at module scope after the
# loop and is referred to by name elsewhere in this notebook, so it should
# not be renamed.
max_depths = [1,2,3,4,5,6,7,8,9,10]
for max_depth in max_depths:
    learning_curve(max_depth, X_train, y_train, X_test, y_test)

In [None]:
# Learning curve for a single depth: `max_depth` keeps the last value it was
# given by the loop in the previous cell.
learning_curve(max_depth, X_train, y_train, X_test, y_test)
# learning_curve() already draws and saves its graph. Its sizes, train_err and
# test_err arrays are local to that function, so a separate
# learning_curve_graph(sizes, train_err, test_err) call here would raise
# NameError and has been removed.


In [None]:
# Model Complexity Graph: training and test error for each tree depth.
# Expects X_train, y_train, X_test and y_test to already be defined in the
# notebook session.
model_complexity(X_train, y_train, X_test, y_test)

In [None]:
# Fit Model: grid-search the tree depth and predict the price of the
# hard-coded sample inside fit_predict_model.
# Expects `city_data` to already be defined in the notebook session.
fit_predict_model(city_data)

In [None]:
# DataFrame of Client_Features: pair each feature name with the client's value
# (the same 13 values used as the sample in fit_predict_model).
x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]
# list(...) keeps this working where zip returns an iterator (Python 3).
pd.DataFrame(list(zip(boston.feature_names, x)), columns = ['Features', 'Client_Features'])