#### Problem 1: Linear Regression Model

In [1]:
# Import necessary packages to the jupyter notebook
# Implement a Linear Regression model using both Normal Equation Method and SGD
import pandas as pd
import numpy as np
from pandas import read_csv

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score

# read and load the csv data file
filename = "Dataset/AMZN.csv"
data = read_csv ( filename )

# Get the Adjusted Close Price
data_select = data [['Adj Close']]

# converting the dataset to a numpy array
values = data_select.values

In [2]:
from pandas import DataFrame
from pandas import concat

"""
Frame a time series as a supervised learning dataset .
Arguments :
data : Sequence of observations as a list or NumPy array .
n_in : Number of lag observations as input (X).
n_out : Number of observations as output (y).
dropnan : Boolean whether or not to drop rows with NaN values .
Returns :
Pandas DataFrame of series framed for supervised learning .
"""

def series_to_supervised ( data , n_in =1 , n_out =1 , dropnan = True ):
    n_vars = 1 if type ( data ) is list else data . shape [1]
    df = DataFrame ( data )
    cols , names = list () , list ()
    # input sequence (t-n, ... t-1)
    for i in range ( n_in , 0 , -1 ):
        cols . append ( df . shift ( i ) )
        names += [('var %d(t-%d)' % ( j+1 , i ) ) for j in range ( n_vars )]
    # forecast sequence (t, t+1, ... t+n)
    for i in range (0 , n_out ):
        cols.append ( df . shift (-i ) )
        if i == 0:
            names += [('var%d(t)' % ( j+1 ) ) for j in range ( n_vars )]
        else :
            names += [('var%d(t+%d)' % ( j+1 , i ) ) for j in range ( n_vars )]
    # put it all together
    agg = concat ( cols , axis =1 )
    agg.columns = names
    # drop rows with NaN values
    if dropnan :
        agg.dropna( inplace = True )
    return agg

###### (a) 
Use the Python function named series_to_supervised() that takes a univariate or multivariate time series and frames it as a supervised learning dataset.

In [3]:
series_to_supervised(data_select, n_in=10, n_out=1, dropnan=True)

Unnamed: 0,var 1(t-10),var 1(t-9),var 1(t-8),var 1(t-7),var 1(t-6),var 1(t-5),var 1(t-4),var 1(t-3),var 1(t-2),var 1(t-1),var1(t)
10,1.958333,1.729167,1.708333,1.635417,1.427083,1.395833,1.500000,1.583333,1.531250,1.505208,1.500000
11,1.729167,1.708333,1.635417,1.427083,1.395833,1.500000,1.583333,1.531250,1.505208,1.500000,1.510417
12,1.708333,1.635417,1.427083,1.395833,1.500000,1.583333,1.531250,1.505208,1.500000,1.510417,1.479167
13,1.635417,1.427083,1.395833,1.500000,1.583333,1.531250,1.505208,1.500000,1.510417,1.479167,1.416667
14,1.427083,1.395833,1.500000,1.583333,1.531250,1.505208,1.500000,1.510417,1.479167,1.416667,1.541667
...,...,...,...,...,...,...,...,...,...,...,...
5753,1676.609985,1785.000000,1689.150024,1807.839966,1830.000000,1880.930054,1846.089966,1902.829956,1940.099976,1885.839966,1955.489990
5754,1785.000000,1689.150024,1807.839966,1830.000000,1880.930054,1846.089966,1902.829956,1940.099976,1885.839966,1955.489990,1900.099976
5755,1689.150024,1807.839966,1830.000000,1880.930054,1846.089966,1902.829956,1940.099976,1885.839966,1955.489990,1900.099976,1963.949951
5756,1807.839966,1830.000000,1880.930054,1846.089966,1902.829956,1940.099976,1885.839966,1955.489990,1900.099976,1963.949951,1949.719971


In [4]:
supervised_data = series_to_supervised(data_select, n_in=10, n_out=1, dropnan=True)
supervised_data

Unnamed: 0,var 1(t-10),var 1(t-9),var 1(t-8),var 1(t-7),var 1(t-6),var 1(t-5),var 1(t-4),var 1(t-3),var 1(t-2),var 1(t-1),var1(t)
10,1.958333,1.729167,1.708333,1.635417,1.427083,1.395833,1.500000,1.583333,1.531250,1.505208,1.500000
11,1.729167,1.708333,1.635417,1.427083,1.395833,1.500000,1.583333,1.531250,1.505208,1.500000,1.510417
12,1.708333,1.635417,1.427083,1.395833,1.500000,1.583333,1.531250,1.505208,1.500000,1.510417,1.479167
13,1.635417,1.427083,1.395833,1.500000,1.583333,1.531250,1.505208,1.500000,1.510417,1.479167,1.416667
14,1.427083,1.395833,1.500000,1.583333,1.531250,1.505208,1.500000,1.510417,1.479167,1.416667,1.541667
...,...,...,...,...,...,...,...,...,...,...,...
5753,1676.609985,1785.000000,1689.150024,1807.839966,1830.000000,1880.930054,1846.089966,1902.829956,1940.099976,1885.839966,1955.489990
5754,1785.000000,1689.150024,1807.839966,1830.000000,1880.930054,1846.089966,1902.829956,1940.099976,1885.839966,1955.489990,1900.099976
5755,1689.150024,1807.839966,1830.000000,1880.930054,1846.089966,1902.829956,1940.099976,1885.839966,1955.489990,1900.099976,1963.949951
5756,1807.839966,1830.000000,1880.930054,1846.089966,1902.829956,1940.099976,1885.839966,1955.489990,1900.099976,1963.949951,1949.719971


###### (b) 
Use MinMaxScaler to scale your data

In [5]:
scaler = MinMaxScaler()
supervised_data = scaler.fit_transform(supervised_data)

###### (c)
Use the Normal Equation Method to find the linear regression coefficients (w). To perform this you may want to take the following steps first: Split your data to X and Y by taking the columns var1(t-10),...,var(t-1) as your 10 features in X, and take the last column var1(t) as your target (Y). Expand your matrix X with a bias vector of ones as the first column (to accomplish this, you may want to use the numpy operations np.ones , np.reshape and np.append). Use the train test split with ‘random state=1’ to split your data to 70% training, and 30% test data. Solve the Normal Equation Method in (2) to find the coefficients w.

In [7]:
Y = supervised_data[:,-1]
X = supervised_data[:,0:-1]

In [12]:
print(X.shape)
print(Y.shape)

(5748, 10)
(5748,)


In [15]:
def normalEquation(X, Y):
    m = int(np.size(data_select[:, 1]))

    # This is the feature / parameter (2x2) vector that will
    # contain my minimized values
    theta = []

    # I create a bias_vector to add to my newly created X vector
    bias_vector = np.ones((m, 1))

    # I need to reshape my original X(m,) vector so that I can
    # manipulate it with my bias_vector; they need to share the same
    # dimensions.
    X = np.reshape(X, (m, 1))

    # I combine these two vectors together to get a (m, 2) matrix
    X = np.append(bias_vector, X, axis=1)

    # Normal Equation:
    # theta = inv(X^T * X) * X^T * y

    # For convenience I create a new, tranposed X matrix
    X_transpose = np.transpose(X)

    # Calculating theta
    theta = np.linalg.inv(X_transpose.dot(X))
    theta = theta.dot(X_transpose)
    theta = theta.dot(y)

    return theta

p = normalEquation(X, Y)

TypeError: '(slice(None, None, None), 1)' is an invalid key