In [362]:
# Echo the module docstring; in an interactive IPython session this prints the
# auto-generated placeholder text for the interactive module.
print(__doc__)

# Code source: Vaibhav Satam
# License: MachineLearn

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from __future__ import print_function

%matplotlib inline

class LinearRegression(object):
    """Single-target linear regression trained with batch gradient descent.

    The model does not add an intercept term itself: callers that want a
    bias must prepend a column of ones to the design matrix passed to
    ``fit`` and ``predict``.

    Parameters
    ----------
    alpha : float
        Learning rate applied to every gradient step.
    n_iters : int
        Number of gradient-descent iterations run by ``fit``.

    Attributes
    ----------
    coef_ : numpy.ndarray of shape (n_columns, 1), or None before ``fit``
        Learned weights, one per column of the design matrix.
    """

    def __init__(self, alpha=0.1, n_iters=100):
        self.n_iters = n_iters
        self.alpha = alpha
        # The weight vector is sized in fit() from the design matrix it is
        # given, so the estimator no longer depends on a module-level
        # ``X_train`` existing at construction time.
        self.coef_ = None

    def fit(self, X, y):
        """Fit the weights to ``X`` / ``y`` by batch gradient descent.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_columns)
            Design matrix (including the bias column, if one is wanted).
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Target values.

        Returns
        -------
        coef_ : numpy.ndarray of shape (n_columns, 1)
            The learned weights (also stored on the instance).
        J_history : numpy.ndarray of shape (n_iters, 1)
            Halved mean squared error measured before each update.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or has a different number of
            rows than ``y`` has values.
        """
        X = np.asarray(X, dtype=float)
        # Force a column vector: a 1-D y would broadcast against the (n, 1)
        # predictions into an (n, n) residual matrix and corrupt the update.
        y = np.asarray(y, dtype=float).reshape(-1, 1)

        if X.ndim != 2:
            raise ValueError("X must be a 2-D array, got %d dimension(s)" % X.ndim)
        # num of training examples
        train_size = X.shape[0]
        if train_size == 0:
            raise ValueError("cannot fit on an empty training set")
        if y.shape[0] != train_size:
            raise ValueError(
                "X has %d rows but y has %d values" % (train_size, y.shape[0])
            )

        # Start from zeros on the first fit (or when the column count
        # changed); otherwise continue from the current weights.
        if self.coef_ is None or self.coef_.shape != (X.shape[1], 1):
            self.coef_ = np.zeros((X.shape[1], 1))

        J_history = np.zeros((self.n_iters, 1))

        for i in range(self.n_iters):
            residual = np.dot(X, self.coef_) - y
            # Cost is recorded before the step, so J_history[0] is the cost
            # of the starting weights.
            J_history[i] = (1. / (2 * train_size)) * np.power(residual, 2).sum()
            # Step against the gradient of the halved mean squared error.
            self.coef_ = self.coef_ - self.alpha * (
                (1. / train_size) * np.dot(X.T, residual)
            )

        return self.coef_, J_history

    def predict(self, X):
        """Return ``X`` multiplied by the learned weights.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_columns)
            Design matrix laid out like the one given to ``fit``.

        Returns
        -------
        numpy.ndarray of shape (n_samples, 1)

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.coef_ is None:
            raise RuntimeError("fit must be called before predict")
        return np.dot(X, self.coef_)
    
    

# Load the two-column, header-less CSV and name its columns.
df = pd.read_csv('ex1data1.txt', sep=',', header=None)
df.columns = ['input', 'prices']

# Split into feature matrix and target; both slices stay 2-D (n_samples, 1).
X = df.iloc[:, :1].values
y = df.iloc[:, 1:].values

# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Feature scaling for bringing all features onto the same scale.
# Not strictly needed with a single feature.
# The scaler is fitted on the training split only and reused for the test split.
sc = StandardScaler()
X_train_norm = sc.fit_transform(X_train)
X_test_norm = sc.transform(X_test)

# Number of rows in the train and test splits
train_size = X_train_norm.shape[0]
test_size = X_test_norm.shape[0]

# Prepend a column of ones so the first learned weight acts as the bias
X_train_padded = np.column_stack((np.ones((train_size, 1)), X_train_norm))
X_test_padded = np.column_stack((np.ones((test_size, 1)), X_test_norm))

# # Plot to check the standardized features
# plt.plot(X_train_norm,y_train,'rx', label='Training data')
# plt.show()

# from sklearn.linear_model import LinearRegression
lr = LinearRegression()
# Keep the per-iteration cost so the convergence plot below can be enabled.
_, cost = lr.fit(X_train_padded, y_train)

y_pred = lr.predict(X_test_padded)

# # Plot for convergence per iterations
# plt.plot(range(cost.size), cost, "-b", linewidth=2 )
# plt.xlabel('Number of iterations')
# plt.ylabel('Cost J')
# plt.show(block=False)

# # Plot outputs 
# plt.scatter(X_test, y_test,  color='black')
# plt.xlabel('X')
# plt.ylabel('Y')
# plt.plot(X_test, y_pred, color='blue', linewidth=3)
# plt.show()

# Print "index) prediction  actual" per test row. Both columns are passed as
# scalars: formatting a length-1 array with %f relies on NumPy's deprecated
# implicit array-to-scalar conversion.
for f, (predicted, actual) in enumerate(zip(y_pred[:, 0], y_test[:, 0]), start=1):
    print("%2d) %-*s %2f" % (f, 30, predicted, actual))


Automatically created module for IPython interactive environment
 1) 11.514126835                   13.501000
 2) 3.94802773772                  5.343600
 3) 6.34859595782                  13.662000
 4) 3.75983718969                  -1.421100
 5) 2.72958383913                  1.017300
 6) 2.35679874078                  0.204210
 7) 3.76762851811                  5.304800
 8) 8.45273409811                  6.752600
 9) 3.72855200941                  1.423300
10) 6.0806941267                   5.744200
11) 5.25121731624                  4.998100
12) 2.20217083826                  2.821400
13) 4.92925565252                  4.025900
14) 6.21650297442                  7.225800
15) 6.41991657953                  12.000000
16) 2.43351335909                  -1.220000
17) 2.71843624615                  4.263000
18) 2.8080964563                   3.082500
19) 3.19994034263                  1.849500
20) 3.91242736017                  6.598700
21) 3.05837789853                  2.440600
22) 21