In [80]:
#!/usr/bin/env python

import numpy as np

## ================ Part 1: Feature Normalization ================

def featureNormalize(X):
    """Standardize every column of X to zero mean and unit standard deviation.

    The input is never modified: the result is computed on a floating-point
    copy, so integer inputs are not truncated and the caller's array keeps
    its original values.

    Parameters
    ----------
    X : array_like, shape (n_samples, n_features)
        Feature matrix; one row per example, one column per feature.

    Returns
    -------
    X_norm : ndarray, shape (n_samples, n_features)
        Column-wise standardized copy of X.
    mu : ndarray, shape (1, n_features)
        Mean of each column.
    sigma : ndarray, shape (1, n_features)
        Population standard deviation (ddof=0) of each column.
    """
    X = np.asarray(X, dtype=float)

    mu = np.mean(X, axis=0, keepdims=True)
    sigma = np.std(X, axis=0, keepdims=True)

    # A constant column has sigma == 0; divide by 1 instead so that column
    # becomes all zeros rather than NaN. The reported sigma stays the true
    # standard deviation.
    divisor = np.where(sigma == 0, 1.0, sigma)
    X_norm = (X - mu) / divisor

    return X_norm, mu, sigma
        
# Read the comma-separated data set from disk
data = np.loadtxt('ex1data2.txt', delimiter=",")

# The first two columns are the features; the third column is the target
X, y = data[:, :2], data[:, 2]

# Normalize the features and keep the per-column mean / standard deviation
X_norm, mu, sigma = featureNormalize(X)

# from sklearn.linear_model import LinearRegression
# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# `train_test_split` now lives in `sklearn.model_selection`. The old path is
# kept only as a fallback for legacy installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Hold out 30% of the examples for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# number of training examples
m = len(y_train)

# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# Reference baseline (disabled):
# lr = LinearRegression()
# lr.fit(X_train_std, y_train)
# y_pred = lr.predict(X_test_std)
# for f in range(y_pred.shape[0]):
#     print("%2d) %-*s %2f" % (f + 1, 30, y_pred[f], y_test[f]))

# Add intercept term to X: prepend a column of ones to the training features
X_padded = np.column_stack((np.ones((m, 1)), X_train_std))

## ================ Part 2: Gradient Descent ================
def computeCost(X, y, theta):
    """Return the halved mean squared error of the linear model ``X.dot(theta)``.

    ``y`` is wrapped in a list and transposed, which yields a column vector
    when ``y`` is one-dimensional. The squared residuals are summed along
    axis 0, so the result is an array with one entry per column of ``theta``
    rather than a bare scalar.
    """
    n_examples = len(y)
    targets = np.array([y]).T
    residuals = X.dot(theta) - targets
    squared = np.power(residuals, 2)
    scale = 1.0 / (2 * n_examples)
    return scale * squared.sum(axis=0)
    
def gradientDescentMulti(X, y, theta, alpha, iters):
    """Fit linear-regression parameters with batch gradient descent.

    Parameters
    ----------
    X : ndarray, shape (m, n)
        Design matrix, used as given (no intercept column is added here).
    y : sequence of length m
        Targets; wrapped in a list and transposed, which yields a column
        vector when ``y`` is one-dimensional.
    theta : ndarray, shape (n, 1)
        Initial parameters. Not modified; each step builds a new array.
    alpha : float
        Learning rate.
    iters : int
        Number of gradient steps to take.

    Returns
    -------
    theta : ndarray, shape (n, 1)
        Parameters after ``iters`` updates.
    J_history : ndarray, shape (iters, 1)
        Cost recorded after each update.
    """
    m = len(y)  # number of training examples

    # Loop-invariant terms, computed once.
    y_col = np.transpose([y])
    X_t = np.transpose(X)
    step = alpha * (1.0 / m)

    # Size the history from the `iters` argument, not from a module-level
    # global, so the function works for any iteration count.
    J_history = np.zeros((iters, 1))
    for i in range(iters):
        theta = theta - step * X_t.dot(X.dot(theta) - y_col)
        J_history[i] = computeCost(X, y, theta)

    return theta, J_history
    
# Gradient-descent hyperparameters: learning rate and number of iterations
alpha = 0.01
num_iters = 400

# Start from an all-zero 3x1 parameter column and run gradient descent on
# the padded training set
theta = np.zeros(shape=(3, 1))
theta, J_history = gradientDescentMulti(
    X_padded, y_train, theta, alpha=alpha, iters=num_iters
)

# # Plot the convergence graph
# plt.plot(range(J_history.size), J_history, "-b", linewidth=2 )
# plt.xlabel('Number of iterations')
# plt.ylabel('Cost J')
# plt.show(block=False)

# (Currently disabled) Estimate the price of a 1650 sq-ft, 3 br house.
# Recall that the first column of X is all-ones (the intercept term), so
# it does not need to be normalized.
# area_norm = (1650 - float(mu[:,0])) / float(sigma[:,0])
# br_norm = (3 - float(mu[:,1]))/float(sigma[:,1])
# house_norm_padded = np.array([1, area_norm, br_norm])
# Prepend the column of ones to the standardized test features and predict
n = y_test.size
X_test_padded = np.column_stack((np.ones((n, 1)), X_test_std))
y_pred = X_test_padded.dot(theta)

# One numbered line per test example: the prediction, then the actual target
for idx, (predicted, actual) in enumerate(zip(y_pred, y_test), 1):
    print("%2d) %-*s %2f" % (idx, 30, predicted, actual))

# # # ============================================================

# print("Predicted price of a 1650 sq-ft, 3 br house (using gradient descent):\n ${:,.2f}".format(price[0]))

 1) [ 353344.55392053]             475000.000000
 2) [ 464153.94718709]             579900.000000
 3) [ 299644.71659601]             349900.000000
 4) [ 458960.42501669]             539900.000000
 5) [ 412720.85963011]             499998.000000
 6) [ 329692.60231615]             347000.000000
 7) [ 407753.94979319]             314000.000000
 8) [ 189416.714937]               169900.000000
 9) [ 282861.73813946]             242900.000000
10) [ 333605.08151344]             239999.000000
11) [ 391897.23503568]             469000.000000
12) [ 345431.05731563]             314900.000000
13) [ 405024.06817612]             299900.000000
14) [ 235084.7958986]              299000.000000
15) [ 376996.50552492]             369000.000000
