In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Collecting Data

In [2]:
train_data_frame = pd.read_csv("C:/Users/piyus/Data_Science/Projects/Linear_Regression_Newyorkstock/prices-split-adjusted.csv")

In [3]:
train_data_frame

Unnamed: 0,date,symbol,open,close,low,high,volume
0,2016-01-05,WLTW,123.430000,125.839996,122.309998,126.250000,2163600.0
1,2016-01-06,WLTW,125.239998,119.980003,119.940002,125.540001,2386400.0
2,2016-01-07,WLTW,116.379997,114.949997,114.930000,119.739998,2489500.0
3,2016-01-08,WLTW,115.480003,116.620003,113.500000,117.440002,2006300.0
4,2016-01-11,WLTW,117.010002,114.970001,114.089996,117.330002,1408600.0
...,...,...,...,...,...,...,...
851259,2016-12-30,ZBH,103.309998,103.199997,102.849998,103.930000,973800.0
851260,2016-12-30,ZION,43.070000,43.040001,42.689999,43.310001,1938100.0
851261,2016-12-30,ZTS,53.639999,53.529999,53.270000,53.740002,1701200.0
851262,2016-12-30,AIV,44.730000,45.450001,44.410000,45.590000,1380900.0


# Cleaning Data

In [4]:
train_data_frame.isnull().sum()

date      0
symbol    0
open      0
close     0
low       0
high      0
volume    0
dtype: int64

In [5]:
train_data_frame.drop('date' , axis = 1 , inplace = True)

In [6]:
train_data_frame.drop('symbol' , axis = 1, inplace = True)

In [7]:
train_data_frame

Unnamed: 0,open,close,low,high,volume
0,123.430000,125.839996,122.309998,126.250000,2163600.0
1,125.239998,119.980003,119.940002,125.540001,2386400.0
2,116.379997,114.949997,114.930000,119.739998,2489500.0
3,115.480003,116.620003,113.500000,117.440002,2006300.0
4,117.010002,114.970001,114.089996,117.330002,1408600.0
...,...,...,...,...,...
851259,103.309998,103.199997,102.849998,103.930000,973800.0
851260,43.070000,43.040001,42.689999,43.310001,1938100.0
851261,53.639999,53.529999,53.270000,53.740002,1701200.0
851262,44.730000,45.450001,44.410000,45.590000,1380900.0


In [8]:
cols = ['open' , 'low' , 'high' , 'volume']
train_Y = train_data_frame['close']
train_X = train_data_frame[cols]

In [9]:
print(type(train_X) , type(train_Y))
train_X = train_X.values
train_Y = train_Y.values
print(type(train_X) , type(train_Y))

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


# Normalizing the data

In [10]:
u = np.mean(train_X, axis = 0)
std = np.std(train_X , axis = 0)

In [11]:
train_X = (train_X - u)/std

# Training the Model

In [12]:
split = 0.8
X_train = train_X[:int(split*train_X.shape[0]),:]
Y_train = train_Y[:int(split*train_Y.shape[0])]
X_test = train_X[int(split*train_X.shape[0]):,:]
Y_test = train_Y[int(split*train_Y.shape[0]):]

In [13]:
ones = np.ones((X_train.shape[0],1))
X_train = np.hstack((ones,X_train))
ones = np.ones((X_test.shape[0],1))
X_test = np.hstack((ones,X_test))

In [14]:
print(X_train.shape, Y_train.shape,X_test.shape,Y_test.shape)

(681011, 5) (681011,) (170253, 5) (170253,)


In [15]:
def hypothesis(X , weights):
    return np.dot(X,weights)    

In [16]:
def loss(X,Y,weights):
    J = 0
    y_pred = hypothesis(X,weights)
    J = np.sum((y_pred - Y)**2)
    return J/X.shape[0]

In [17]:
def gradient(X,Y,weights):
    y_pred = hypothesis(X,weights)
    grad = np.dot(X.T , (y_pred - Y))
    return grad/X.shape[0]

In [18]:
def gradient_descent(X,Y):
    weights = np.zeros(X.shape[1],)
    list_loss = []
    learn_rate = 0.03
    epochs = 1000
    
    for i in range(epochs):
        grad = gradient(X,Y,weights)
        list_loss.append(loss(X,Y,weights))
        weights = weights - (learn_rate*grad)
    return weights,list_loss

In [19]:
weights , list_loss = gradient_descent(X_train,Y_train)

On:999

# Loss Visualization

In [20]:
list_loss[1:20]
list_loss[-20:-1]

[0.6582272886394095,
 0.6582260254016408,
 0.6582247621681359,
 0.6582234989388955,
 0.6582222357139195,
 0.658220972493208,
 0.6582197092767604,
 0.6582184460645776,
 0.6582171828566588,
 0.6582159196530044,
 0.6582146564536143,
 0.6582133932584884,
 0.6582121300676267,
 0.658210866881029,
 0.6582096036986959,
 0.6582083405206272,
 0.6582070773468224,
 0.6582058141772816,
 0.658204551012005]

# Model Testing

In [21]:
pred = []
for i in range(X_test.shape[0]):
    pred.append(hypothesis(X_test[i] , weights))

In [22]:
for i in range(Y_test.shape[0]):
    print(pred[i] , '  ' , Y_test[i])

99.65503104486734    97.75
84.90581682897057    83.040001
40.54489129744416    39.75
91.30632607140144    88.68
43.91780129707029    42.630001
71.35995486317513    69.690002
45.28136324468584    43.52
39.01194204815069    37.0
89.14528886117277    87.019997
53.814119559401554    52.099998
32.079079292492885    31.139999
134.90223488285852    132.179993
160.33227016707326    153.259995
112.46803895799043    109.980003
32.59275853586909    31.82
83.65416786212172    80.559998
96.6319399791437    94.459999
29.175266188175616    28.82
62.69758940978183    60.830002
41.30489501837645    39.759998
91.93420068330703    88.800003
70.02564299832666    66.970001
79.46224031278055    77.099998
70.57687699862471    69.129997
39.472834468002226    38.580002
59.76513510432286    57.610001
89.47937789969073    86.010002
87.81577693623119    84.800003
72.76824750940061    70.769997
67.0953947813087    65.919998
124.81448272871629    121.169998
57.22413529745207    54.959999
45.00589081386997    43.5
1

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 47.450001
36.648452722945144    36.52
106.08688508660855    104.860001
27.642736904317537    27.66
271.32086648587165    269.619995
62.732795297323094    62.139999
19.861532401041284    19.639999
68.30421061509324    67.620003
73.41240545591383    72.830002
73.8224910598062    73.18
63.65072398144406    63.34
151.2753287189111    149.220001
62.07185238538982    61.849998
65.75213334466832    63.830002
108.8891862285528    108.019997
62.85755479863911    62.07
68.6582699460824    68.910004
84.02192517976137    83.209999
50.74158363956991    50.09
88.61759738813488    88.300003
71.34339808392451    70.730003
39.86736434432595    39.57
31.59503586512813    31.4
54.37952537536068    54.419998
33.82751225501112    33.560001
120.58350911967838    122.349998
244.4887388374581    239.860001
142.53619791307045    141.940002
36.60111214780646    36.189999
89.07992766338057    88.82
108.13366317789652    108.080002
33.89185371021983    33.5
76.27544357202001    76.269997
44.66471596691328    44.

# Error Estimation

In [23]:
from sklearn import metrics

In [24]:
print(f'MAE : {metrics.mean_absolute_error(Y_test,pred)}')
print(f'MSE : {metrics.mean_squared_error(Y_test,pred)}')
print(f'RMSE : {metrics.mean_squared_error(Y_test,pred)**0.5}')

MAE : 0.5760518426578495
MSE : 1.3891854461441107
RMSE : 1.1786371138497678
