# Importing libraries

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import tensorflow as tf
import torch as t
import keras
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import MinMaxScaler


## Importing data

In [2]:
stock_data = pd.read_csv('https://raw.githubusercontent.com/Varun2063/stock-price-prediction/main/ABT.csv')
stock_data.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
10093,2020-03-26,71.080002,75.970001,71.0,75.809998,75.809998,10530500
10094,2020-03-27,77.5,77.5,73.610001,74.559998,74.559998,8360600
10095,2020-03-30,82.830002,84.400002,78.470001,79.339996,79.339996,47540200
10096,2020-03-31,81.43,81.489998,78.559998,78.910004,78.910004,23792300
10097,2020-04-01,77.0,77.379997,75.349998,76.57,76.57,12198700


# Data Preprocessing

In [3]:
stock_data.Date = pd.to_datetime(stock_data["Date"])

In [4]:
stock_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1980-03-17,0.0,0.512028,0.497999,0.501506,0.009856,7513300
1,1980-03-18,0.0,0.512028,0.494492,0.505013,0.009925,5303400
2,1980-03-19,0.0,0.515535,0.50326,0.512028,0.010062,2523300
3,1980-03-20,0.0,0.513781,0.499753,0.499753,0.009821,4654800
4,1980-03-21,0.0,0.506767,0.499753,0.505013,0.009925,1332800


In [5]:
print((stock_data.Date[len(stock_data)-1] - stock_data.Date[0]).days/365.25,"years")

40.04106776180698 years


we have **40 years** worth of data but we are only using **last 10 years** of data to use in the LSTM model.

In [6]:
tenyrs = stock_data['Date'].max() - pd.DateOffset(years=10)

stock_data_1 = stock_data[stock_data['Date'] >= tenyrs]

In [7]:
stock_data_1.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
7580,2010-04-01,25.333441,25.415007,25.227884,25.410208,17.999283,12903500
7581,2010-04-05,25.424603,25.549351,25.28546,25.395815,17.989082,11130400
7582,2010-04-06,25.367025,25.486977,25.227884,25.352633,17.958494,15102300
7583,2010-04-07,25.28546,25.395815,25.131924,25.189501,17.842943,20577100
7584,2010-04-08,25.232681,25.280663,25.045561,25.083944,17.768171,15111200


In [8]:
print("Distribution of OPEN values in STOCK DATA")
fig = px.histogram(stock_data_1 , x = "Open")
fig.show()

Distribution of OPEN values in STOCK DATA


In [9]:
print("Distribution of CLOSE values in STOCK DATA")
fig = px.histogram(stock_data_1, x = "Close")
fig.show()

Distribution of CLOSE values in STOCK DATA


We have noticed that the data doesnot follow a normal distribution. So we are going to transform it into a normal distribution with min-max transform.

In [10]:
print("data before transformation")
fig = px.line(stock_data_1, x=stock_data_1.columns[0], y=stock_data_1.columns[1:6])
fig.show()

data before transformation


**Scaling Data**

In [11]:
data_2 = stock_data_1[[ 'Open']]
scaler = MinMaxScaler(feature_range=(0,1))
data_2 = scaler.fit_transform(data_2)
data_2 = pd.DataFrame(data_2, columns = ['Open'], index = [i for i in range(7580, 7580+len(stock_data_1))])
data_2.insert(0, "Date", stock_data_1['Date'])

In [12]:
data_2.head()

Unnamed: 0,Date,Open
7580,2010-04-01,0.052131
7581,2010-04-05,0.053431
7582,2010-04-06,0.05261
7583,2010-04-07,0.051447
7584,2010-04-08,0.050694


In [13]:
fig = px.histogram(data_2, x = "Open")
fig.show()

In [14]:
print("data after transformation")
fig = px.line(data_2, x=data_2.columns[0], y=data_2.columns[1:6])
fig.show()

data after transformation


# Splitting the data

We are splitting the data into 3 parts  ( TRAINING, VALIDATION, TESTING )

In [15]:
len(data_2)

2518

In [16]:
data_3 = data_2.iloc[:,1:2]
data_3.head()

Unnamed: 0,Open
7580,0.052131
7581,0.053431
7582,0.05261
7583,0.051447
7584,0.050694


In [17]:
train = data_3[:round(len(data_3)*0.80)].values
test  = data_3[round(len(data_3)*0.80):].values

In [18]:
len(train)

2014

In [19]:
len(test)

504

In [20]:
X_train = []
y_train = []
for i in range(60, len(train)):
    X_train.append(train[i-60:i, 0])
    y_train.append(train[i, 0])

X_train, y_train = np.array(X_train), np.array(y_train)
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

In [21]:
y_train.shape

(1954,)

In [22]:
X_test = []
y_test = []
for i in range(60, len(test)):
    X_test.append(test[i-60:i, 0])
    y_test.append(test[i, 0])

X_test, y_test = np.array(X_test), np.array(y_test)
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

In [23]:
X_test.shape

(444, 60, 1)

In [24]:
y_test.shape

(444,)

# Modelling the lstm model


we first use the simple **vanilla lstm model** with a dropout layer

In [25]:
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Dropout
from keras.models import Sequential

# Modelling the network

# Model 1 ( loss = 4.0271 )

In [26]:
model = keras.models.Sequential()
model.add(LSTM(50, input_shape = (X_train.shape[1],1)))
model.add(Dropout(0.1))
model.add(Dense(units = 1))
model.compile( optimizer = "adam", loss = "mean_squared_error")

In [27]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 50)                10400     
                                                                 
 dropout (Dropout)           (None, 50)                0         
                                                                 
 dense (Dense)               (None, 1)                 51        
                                                                 
Total params: 10451 (40.82 KB)
Trainable params: 10451 (40.82 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [28]:
history = model.fit(X_train, y_train, epochs =50, batch_size = 32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [29]:
y_test = np.reshape(y_test,(-1,1))
real_stock_price = scaler.inverse_transform(y_test)

In [30]:
pre_price = model.predict(X_test)
pre_price = scaler.inverse_transform(pre_price)
plot_df = pd.DataFrame(pre_price,columns = ["predicted"])
plot_df.insert(0, "real", real_stock_price)



In [31]:
fig = px.line(plot_df, y = plot_df.columns[:2])
fig.show()

In [32]:
mse(plot_df["real"], plot_df["predicted"])

3.1389841290381497

In [33]:
loss_value = history.history['loss']
epoch = range(1, len(loss_value)+1)
px.line(x = epoch, y = loss_value, labels = {'x':'epochs',"y":"loss_value"}, title = "training loss")

the MSE during training was **loss: 0.00017439** but the MSE during testing is 4.02711 as found. This shows that the model has been **overfitted severely**. now we will try to overcome this fitting by


*   reducing the number of epochs
*   implementing more drouput



# model 2 ( loss = 6.3589 )

In [34]:
model1 = keras.models.Sequential()
model1.add(LSTM(30, input_shape = (X_train.shape[1],1)))
model1.add(Dropout(0.2))
model1.add(Dense(units = 1))
model1.compile( optimizer = "adam", loss = "mean_squared_error")

In [35]:
history = model1.fit(X_train, y_train, epochs = 10, batch_size = 32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [36]:
predicted_1 = model1.predict(X_test)
plot_df["predicted_1"] = scaler.inverse_transform(predicted_1)
plot_df



Unnamed: 0,real,predicted,predicted_1
0,61.209999,61.282494,61.136673
1,60.459999,61.047230,60.924629
2,61.490002,60.702251,60.691616
3,60.630001,60.711048,60.480301
4,60.970001,60.587082,60.281643
...,...,...,...
439,71.080002,68.429031,70.260132
440,77.500000,68.138535,69.532730
441,82.830002,70.320663,69.222221
442,81.430000,74.443733,69.521194


In [37]:
fig = px.line(plot_df, y = plot_df.columns)
fig.show()

In [38]:
mse(plot_df["real"], plot_df["predicted_1"])

14.812490351431103

simplifying the model has made the output more deviant from the target output. so we are going to add a few more layers to test it out.

# Model 3 ( loss = 2.9866 )

In [39]:
model_3 = Sequential()
model_3.add(LSTM(50, return_sequences = True, input_shape = (X_train.shape[1],1)))
model_3.add(Dropout(0.1))
model_3.add(LSTM(50, return_sequences = True))
model_3.add(Dropout(0.2))
model_3.add(LSTM(30))
model_3.add(Dropout(0.2))
model_3.add(Dense(units = 1))
model_3.compile(optimizer = "adam", loss = "mean_squared_error")

In [40]:
history = model_3.fit(X_train, y_train, epochs = 10, batch_size = 32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [41]:
predicted_2 = model.predict(X_test)
predicted_2 = scaler.inverse_transform(predicted_2)
plot_df["predicted_2"] = predicted_2



In [42]:
fig = px.line( y=plot_df['real'], labels={'x': 'date', 'y': 'price'}, title='predicted vs target')
fig.update_traces(line=dict(color='blue'), name='terget')
fig.add_scatter(y=plot_df['predicted_2'], mode='lines', line=dict(color='red'), name='predicted')
fig.show()

In [43]:
mse(plot_df['real'], plot_df['predicted_2'])

3.1389841290381497

# Model 4 ( loss = 19.3456 )

In [44]:
model_4 = Sequential()
model_4.add(LSTM(units=30,return_sequences=True,input_shape=(X_train.shape[1], 1)))
model_4.add(Dropout(0.1))
model_4.add(LSTM(units=20,return_sequences=True))
model_4.add(Dropout(0.1))
model_4.add(LSTM(units=10,return_sequences=True))
model_4.add(Dropout(0.1))
model_4.add(LSTM(units=10))
model_4.add(Dropout(0.1))
model_4.add(Dense(units=1))
model_4.compile(optimizer='adam',loss='mean_squared_error')

model_4.fit(X_train,y_train,epochs=10,batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7a6cec210fa0>

In [45]:
predicted_3 = model_3.predict(X_test)
predicted_3 = scaler.inverse_transform(predicted_3)
plot_df["predicted_3"] = predicted_3



In [46]:
fig = px.line( y=plot_df['real'], labels={'x': 'date', 'y': 'price'}, title='predicted vs target')
fig.update_traces(line=dict(color='blue'), name='terget')
fig.add_scatter(y=plot_df['predicted_3'], mode='lines', line=dict(color='red'), name='predicted')
fig.show()

In [47]:
mse(plot_df['real'], plot_df['predicted_3'])

6.193427947574418

# Outcomes:



*   **Out of the 4 models developed we see that the "model 3" has the lowest loss value.**
*   **we see that the 4th model has only captured the general trend than giving the specific outputs.**

