In [27]:
# Print the module-level docstring; in the captured run below this is the
# IPython-generated "Automatically created module ..." line.
print(__doc__)

# Code source: Vaibhav Satam
# License: MachineLearn

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from __future__ import print_function

%matplotlib inline

class LinearRegression(object):
    """Linear regression trained with batch gradient descent.

    The model is the plain product ``np.dot(X, theta)``; no intercept column
    is added by this class, so callers that want a bias term must include a
    column of ones in ``X`` themselves.

    Parameters
    ----------
    alpha : float
        Learning rate applied to every gradient-descent update.
    n_iters : int
        Number of gradient-descent iterations performed by ``fit``.
    """

    def __init__(self, alpha=0.1, n_iters=100):
        self.n_iters = n_iters
        self.alpha = alpha

    def fit(self, X, y, theta=None):
        """Run ``n_iters`` gradient-descent steps on the squared-error cost.

        Parameters
        ----------
        X : array-like, one row per training example
            Design matrix (already including any bias column).
        y : array-like
            Targets, one per row of ``X``. A 1-D vector and an ``(m, 1)``
            column are both accepted and are aligned with the shape of
            ``np.dot(X, theta)`` before training.
        theta : array-like or None
            Initial parameters. When omitted, zeros are used with one row
            per column of ``X`` and the trailing dimensions of ``y``.

        Returns
        -------
        theta : ndarray
            Parameters after the final update (the input is not modified).
        J_history : ndarray of shape (n_iters, 1)
            Cost evaluated *before* each update.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if theta is None:
            theta = np.zeros((X.shape[1],) + y.shape[1:])
        else:
            theta = np.asarray(theta)

        # Without this, a 1-D ``y`` combined with a column ``theta`` (or the
        # reverse) makes ``np.dot(X, theta) - y`` broadcast to an (m, m)
        # matrix, silently corrupting both the cost and the shape of theta.
        initial_prediction = np.dot(X, theta)
        if (y.shape != initial_prediction.shape
                and y.size == initial_prediction.size):
            y = y.reshape(initial_prediction.shape)

        # num of training examples (rows of X, not the total element count of y)
        train_size = X.shape[0]
        J_history = np.zeros((self.n_iters, 1))

        for i in range(self.n_iters):
            # The residual is shared by the cost and the gradient, so it is
            # computed once per iteration.
            residual = np.dot(X, theta) - y
            # Calculate mean square error (halved)
            J_history[i] = (1. / (2 * train_size)) * np.power(residual, 2).sum()
            # update weights to fit training data well
            theta = theta - self.alpha * ((1. / train_size) * np.dot(X.T, residual))

        return theta, J_history

    def predict(self, X, theta):
        """Return ``np.dot(X, theta)``: predictions for ``X`` under ``theta``."""
        # calculate dot product of test set with learned parameters
        return np.dot(X, theta)

# Read the headerless, comma-separated data file and label its two columns.
df = pd.read_csv('ex1data1.txt', header=None, sep=',')
df.columns = ['input', 'prices']

# Separate feature and target; selecting with a list of labels keeps each
# one a 2-D (n_samples, 1) array.
X = df[['input']].values
y = df[['prices']].values

# Hold out 30% of the rows for testing, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.3)

# Standardize the feature: the scaler is fitted on the training split only
# and the same transform is then applied to the test split.
# (Original author's note: not strictly needed with a single feature.)
sc = StandardScaler()
sc.fit(X_train)
X_train_norm = sc.transform(X_train)
X_test_norm = sc.transform(X_test)

# Number of rows in each split
train_size, test_size = X_train_norm.shape[0], X_test_norm.shape[0]

# Prepend a column of ones so the first parameter acts as the bias term
X_train_padded = np.hstack((np.ones((train_size, 1)), X_train_norm))
X_test_padded = np.hstack((np.ones((test_size, 1)), X_test_norm))

# # Plot to check that the features are standardized
# plt.plot(X_train_norm,y_train,'rx', label='Training data')
# plt.show()

# Start from all-zero parameters: one row per column of the padded design
# matrix (bias column plus the original features).
theta = np.zeros((X_train_padded.shape[1], 1))

# This is the LinearRegression class defined in this file, not
# sklearn.linear_model.LinearRegression.
lr = LinearRegression()
theta, J = lr.fit(X_train_padded, y_train, theta)

# Show the shapes of the test design matrix and the learned parameters
print(X_test_padded.shape, theta.shape)

# Predictions for the held-out rows
y_pred = lr.predict(X_test_padded, theta)

# # Plot for convergence per iterations
# plt.plot(range(J.size), J, "-b", linewidth=2 )
# plt.xlabel('Number of iterations')
# plt.ylabel('Cost J')
# plt.show(block=False)

# # Plot outputs 
# plt.scatter(X_test, y_test,  color='black')
# plt.xlabel('X')
# plt.ylabel('Y')
# plt.plot(X_test, y_pred, color='blue', linewidth=3)
# plt.show()

# Print one numbered row per test sample: the prediction left-justified in a
# 30-character column, followed by the actual target value.
# Both values are read from column 0 so that "%f" receives a scalar rather
# than a length-1 array (implicit array-to-scalar conversion is deprecated
# in recent NumPy releases). The printed text is unchanged.
# Note: "%2f" sets a minimum width of 2, not a precision of 2 decimals.
for f, (predicted, actual) in enumerate(zip(y_pred[:, 0], y_test[:, 0]), 1):
    print("%2d) %-*s %2f" % (f, 30, predicted, actual))


Automatically created module for IPython interactive environment
 1) 11.5065126171                  13.501000
 2) 3.95148870854                  5.343600
 3) 6.34854299817                  13.662000
 4) 3.76357363216                  -1.421100
 5) 2.73482835731                  1.017300
 6) 2.36258893849                  0.204210
 7) 3.7713535557                   5.304800
 8) 8.44960111972                  6.752600
 9) 3.73233424685                  1.423300
10) 6.08103331937                  5.744200
11) 5.25277068971                  4.998100
12) 2.20818737892                  2.821400
13) 4.93128031063                  4.025900
14) 6.21664337131                  7.225800
15) 6.41975922139                  12.000000
16) 2.43919126262                  -1.220000
17) 2.72369708209                  4.263000
18) 2.81322604842                  3.082500
19) 3.20449635714                  1.849500
20) 3.9159404425                   6.598700
21) 3.06314113089                  2.440600
22) 21