# STOCK PRICE PREDICTION USING LINEAR REGRESSION

## Reading the dataset

In [1]:
!pip install quandl 



In [2]:
import math
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [3]:
import os

import quandl

# The Quandl API key is read from the QUANDL_API_KEY environment variable so
# that no credential is stored in the notebook. A missing variable fails fast
# with a KeyError instead of sending an unauthenticated request.
# NOTE(review): a key was previously committed in this cell; it should be rotated.
df = quandl.get("EOD/AAPL", authtoken=os.environ["QUANDL_API_KEY"])
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividend,Split,Adj_Open,Adj_High,Adj_Low,Adj_Close,Adj_Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2013-09-03,493.1,500.6,487.35,488.58,11854600.0,0.0,1.0,15.61965,15.857223,15.43751,15.476472,331928800.0
2013-09-04,499.56,502.24,496.28,498.691,12322600.0,0.0,1.0,15.82428,15.909173,15.720381,15.796753,345032800.0
2013-09-05,500.25,500.68,493.64,495.27,8441700.0,0.0,1.0,15.846136,15.859757,15.636755,15.688388,236367600.0
2013-09-06,498.44,499.38,489.95,498.22,12840200.0,0.0,1.0,15.788802,15.818578,15.519869,15.781833,359525600.0
2013-09-09,505.0,507.92,503.48,506.17,12167400.0,0.0,1.0,15.9966,16.089095,15.948451,16.033661,340687200.0


## Data Analysis

Calculating the high–low percentage (the daily range relative to the close) and the percentage change (the close relative to the open):

In [4]:
# Keep only the split/dividend-adjusted columns. The explicit .copy() makes the
# result an independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
df = df[['Adj_Open', 'Adj_High', 'Adj_Low', 'Adj_Close', 'Adj_Volume']].copy()

In [5]:
# High-low spread of the day, expressed as a percentage of the adjusted close.
df['HL_PCT'] = (
    (df['Adj_High'] - df['Adj_Low'])
    / df['Adj_Close']
    * 100.0
)

In [6]:
# Open-to-close move of the day, expressed as a percentage of the adjusted open.
df['PCT_change'] = (
    (df['Adj_Close'] - df['Adj_Open'])
    / df['Adj_Open']
    * 100.0
)

In [7]:
# Keep the four columns used as model features. The explicit .copy() makes the
# result an independent frame, so the later fill and label assignment do not
# raise SettingWithCopyWarning.
df = df[['Adj_Close', 'HL_PCT', 'PCT_change', 'Adj_Volume']].copy()
df.head()

Unnamed: 0_level_0,Adj_Close,HL_PCT,PCT_change,Adj_Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-09-03,15.476472,2.711941,-0.91665,331928800.0
2013-09-04,15.796753,1.195129,-0.173953,345032800.0
2013-09-05,15.688388,1.421447,-0.995502,236367600.0
2013-09-06,15.781833,1.892738,-0.044138,359525600.0
2013-09-09,16.033661,0.877176,0.231683,340687200.0


## Data Preprocessing

NaN values are replaced with -99999 because the regression model cannot accept NaN data points; the replacement value is so extreme that it will be treated as an outlier.

In [8]:
# Column whose future value the model will be trained to predict.
forecast_col = 'Adj_Close'

# Replace missing values with a sentinel. Reassigning (instead of inplace=True)
# avoids mutating a frame that was derived from an earlier selection.
# NOTE(review): if any NaN is actually present, a sentinel this large will
# distort the feature scaling and the linear fit that follow - consider
# dropping or imputing those rows instead.
df = df.fillna(value=-99999)

Here, the features are the current values and the label is the price at a future date. The forecast horizon is set to 1% of the length of the dataset (rounded up), i.e. each row's label is the price that many rows ahead.

In [9]:
# Forecast horizon in rows: 1% of the dataset length, rounded up to a whole row.
n_rows = len(df)
forecast_out = int(math.ceil(0.01 * n_rows))

In [10]:
# The label of each row is the forecast column's value forecast_out rows later
# (a negative shift moves future values up onto the current row).
df['label'] = df[forecast_col].shift(periods=-forecast_out)

Dropping the rows with NaN values — after the shift, the last `forecast_out` rows have no future price and therefore no label.

In [11]:
# Drop every row that still contains a NaN. Reassigning (instead of
# inplace=True) avoids mutating a frame derived from an earlier selection.
df = df.dropna()

In [12]:
# Feature matrix: every column except the label. The axis is given by keyword
# (columns=...) because passing it positionally, as in drop(['label'], 1), is
# rejected by pandas >= 2.0.
X = np.array(df.drop(columns=['label']))
# Target vector: the future price stored in the label column.
y = np.array(df['label'])

Scaling the features

In [13]:
# Standardise the features with sklearn's preprocessing.scale (default settings).
# NOTE(review): the scaling statistics are computed on all rows before the
# train/test split, so the test rows influence them - confirm this is acceptable.
X = preprocessing.scale(X=X)

The target variable is `y`, i.e. the price in the future.

In [14]:
# Target vector rebuilt from the label column (same values as the earlier
# assignment to y; df has not changed in between).
y = np.array(df.loc[:, 'label'])

## Training with Linear Regression

In [15]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so
# the split is reproducible.
# NOTE(review): the rows are time-ordered and the split is random, so days
# adjacent to test rows end up in the training set - confirm this is intended.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [16]:
# Linear regression estimator with default settings.
model = LinearRegression()

In [17]:
# Fit on the training split; the fitted estimator is the cell's displayed value.
model.fit(X=train_X, y=train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [18]:
# Predicted future prices for the held-out rows.
val_predictions = model.predict(X=test_X)

## Model Evaluation

In [19]:
# Mean squared error between the held-out labels and the model's predictions.
print("Mean Squared Error: ")
print(mean_squared_error(y_true=test_y, y_pred=val_predictions))

Mean Squared Error: 
1.5674780454025352


Thus, the MSE of the model is 1.57 (approx.).

In [20]:
# Mean absolute error between the held-out labels and the model's predictions.
print("Mean Absolute Error: ")
print(mean_absolute_error(y_true=test_y, y_pred=val_predictions))

Mean Absolute Error: 
0.9494078395192512


Thus, the MAE of the model is approximately 0.95.

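The model appears to perform well, as both errors are quite low. Note, however, that the test set was drawn by a random split of time-ordered data, so days adjacent to the test rows were seen during training; these figures are therefore likely optimistic as an estimate of genuine forecasting error.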