In [1]:
!pip install yfinance #installs yfinance library in the colab environment



Loading main data

In [2]:
import yfinance as yf #for fetching historical stock market data
import numpy as np #for fast numerical computation
import pandas as pd #for manipulating row and column data
import tensorflow as tf #for deep learning pipelines

In [3]:
data = yf.download("GOOGL" , start = "2018-01-01" , interval = '1d')
#the first argument is a ticker that is a shorthand for a company 
#It is a unique series of letters assigned to US security for the purpose of trading.
#for microsoft it is msft for apple it is aapl.
#here we use ticker for google 

#second argument is the date from which we need the market data, we can have end argument aswell for the last date of the data.
#if left none in end then it will take one day before current date data.

#The interval argument is the frequency at which we fetch our data.
#If the interval is 1D, we get one row of data representing each day.

[*********************100%***********************]  1 of 1 completed


In [4]:
data.shape
#6 columns 1042 rows

(1042, 6)

In [5]:
data.head(3)
#the indicators that determine the highest and least values of a particular stock traded that day are high and low, respectively.

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,1053.02002,1075.97998,1053.02002,1073.209961,1073.209961,1588300
2018-01-03,1073.930054,1096.099976,1073.430054,1091.52002,1091.52002,1565900
2018-01-04,1097.089966,1104.079956,1094.26001,1095.76001,1095.76001,1302600


In [6]:
# Sort the data points based on indexes just for confirmation 
# Understanding trends in the dataset
data.sort_index(inplace = True)

In [7]:
# Remove any duplicate index 
data = data.loc[~data.index.duplicated(keep='first')]

In [8]:
#we see that the dates are till yesterday
data.tail(3)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-02-16,2732.929932,2761.719971,2698.47998,2754.76001,2754.76001,1251100
2022-02-17,2724.860107,2742.27002,2649.570068,2650.780029,2650.780029,1947500
2022-02-18,2669.340088,2681.399902,2603.929932,2608.060059,2608.060059,2277400


In [9]:
# Check for missing values 
data.isnull().sum()

Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [10]:
# Get the statistics of the data
data.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,1042.0,1042.0,1042.0,1042.0,1042.0,1042.0
mean,1607.384136,1623.706293,1590.721183,1607.717658,1607.717658,1784143.0
std,612.17258,616.460158,606.349262,611.287642,611.287642,837257.2
min,984.320007,1012.119995,977.659973,984.669983,984.669983,465600.0
25%,1146.302521,1156.570007,1135.825012,1148.272461,1148.272461,1251600.0
50%,1315.235046,1327.304993,1306.755005,1316.134949,1316.134949,1571700.0
75%,2028.695038,2048.992493,2002.450043,2021.767517,2021.767517,2057150.0
max,3025.0,3030.929932,2977.97998,2996.77002,2996.77002,6658900.0


Now, one interesting point that we see in the statistics is the difference between Q3 and Max of volume.

The difference between Q3 and Max is very high.

This means that there are a lot of spikes in volume.

Understanding Trends with in the Data

In [11]:
# Check the trend in Closing Values 
import plotly.graph_objects as go
fig = go.Figure()

fig.add_trace(go.Scatter(x = data.index , y = data['Close'] , mode = 'lines'))
fig.update_layout(height = 500 , width = 900, 
                  xaxis_title='Date' , yaxis_title='Close')
fig.show()

Well, the closing value seems to be doing a lot of zigzags.

The crash we see here is due to Covid panic.

But right after that we see that the market has recovered quickly and the stock price reached all time
highs again.

In [12]:
# Check the trend in Volume Traded
fig = go.Figure()

fig.add_trace(go.Scatter(x = data.index , y = data['Volume'] , mode = 'lines'))
fig.update_layout(height = 500 , width = 900, 
                  xaxis_title='Date' , yaxis_title='Volume')
fig.show()

Next, let us visualize the volume graph.

OK, so here we see a lot of spikes as expected from the statistics of the data.

A sudden spike determines the sudden interest in stock.
It could be either in long or short.

If you keep track of the data point where there is a spike in volume, we would see a sudden bump or dump in the price of the stock value.

This is a common pattern due to panic selling or panic buying from people when the price is decreasing or increasing respectively.

The pattern in open high low would also be the same as close as an exercise.

Data Preparation

In [13]:
from sklearn.preprocessing import MinMaxScaler  #to scale the dataset
import pickle #to save and retrieve any python objects
from tqdm.notebook import tnrange #is a graphical progress bar which can be used to track the progress of our preprocessing.

In [14]:
# Filter only required data 
data = data[['Close' , 'Volume']]
data.head(3)
#Volume plays a crucial role in finding an increase or decrease in stock price.
#We will have to keep volume in our feature vector. moving with close and volume as final features.

Unnamed: 0_level_0,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-02,1073.209961,1588300
2018-01-03,1091.52002,1565900
2018-01-04,1095.76001,1302600


Scrapping extra information

Relative Strength Index (RSI) is a momentum indicator used in technical analysis that measures the magnitude of recent 
price changes to evaluate overbought or oversold conditions in the price of a stock or other asset.

In [15]:
#the RSI trend scrapping
import requests 
#using get method and function = rsi and symbol=googl as we are doing for google, interval set to daily and key is already there
response = requests.get('https://www.alphavantage.co/query?function=RSI&symbol=GOOGL&interval=daily&time_period=5&series_type=close&apikey=43T9T17VCV2ME4SM') 
response = response.json() #converting data into json format which is a python dictionary

In [16]:
response.keys() #print keys of the response that we just scrapped

dict_keys(['Meta Data', 'Technical Analysis: RSI'])

In [17]:
#converting into pandas dataframe.
rsi_data = pd.DataFrame.from_dict(response['Technical Analysis: RSI'] , orient='index')

In [18]:
rsi_data.head() # we see the data available

Unnamed: 0,RSI
2004-08-26,72.7683
2004-08-27,64.2686
2004-08-30,47.8388
2004-08-31,49.249
2004-09-01,41.0745


In [19]:
rsi_data = rsi_data[rsi_data.index >= '2018-01-01'] #WE SEE ONLY RSI DATA AFTER 2018 as our data is from 2018 onwards

In [20]:
rsi_data['RSI'] = rsi_data['RSI'].astype(np.float64) # RSI indicator should be a floating type value, so we typecast the column to a float

In [21]:
rsi_data.head() #final RSI data that we scrapped

Unnamed: 0,RSI
2018-01-02,65.431
2018-01-03,78.3686
2018-01-04,80.483
2018-01-05,86.2429
2018-01-08,87.4882


In [22]:
data = data.merge(rsi_data, left_index=True, right_index=True, how='inner')
#We are modifying the previous data and the current RSI data into one data frame.

In [24]:
data.head() #how the final data table looks like

Unnamed: 0,Close,Volume,RSI


In [25]:
# Confirm the Testing Set length 
test_length = data[(data.index >= '2020-09-01')].shape[0]
#As we have collected past three years of data, it is better to keep at least 10 percent 
#Or three to four months of recent data as the test set in the next cell.

In [26]:
# a data frame and feature length and returns the features and targets.
def CreateFeatures_and_Targets(data, feature_length):
    X = []  #The first argument here is our feature data with columns close and volume.
    Y = []  #The second argument, feature length, is the number of past data points, the model looks to predict

#Suppose if the feature length is 30, then we are looking at the close and volume values for the past
#30 days to predict the close of the thirty first day inside the function.

##we have to empty lists X and Y, which are the features and targets respectively.

    for i in tnrange(len(data) - feature_length): #iterate on the index and select close and volume values from current index to feature length
    #We append these values to feature Vector X, we select the close of feature length as the target and append it to the target Vector Y
        X.append(data.iloc[i : i + feature_length,:].values)
        Y.append(data["Close"].values[i+feature_length])

    X = np.array(X) #We convert this lists into numbers and return them
    Y = np.array(Y)

    return X , Y

The first dimension is the number of records in our features.

Second is feature length selected.

Third is the number of columns in the filtered data.

In [27]:
#Let us now execute this function by passing feature length as 32.
X , Y = CreateFeatures_and_Targets(data , 32)
#we can try out with different feature lengths

0it [00:00, ?it/s]

In [28]:
# Check the shapes
X.shape , Y.shape

((0,), (0,))

In [29]:
# Check the shapes of our features and targets.
# we see that our features are three dimensional vectors and the target is 1D.
X.shape , Y.shape

((0,), (0,))

In [30]:
#Splitting our data into train and split testing set.
Xtrain , Xtest , Ytrain , Ytest = X[:-test_length] , X[-test_length:] , Y[:-test_length] , Y[-test_length:]

In [31]:
# Check Training Dataset Shape 
Xtrain.shape , Ytrain.shape

((0,), (0,))

In [32]:
# Check Testing Dataset Shape
Xtest.shape , Ytest.shape

((0,), (0,))

Train shape +test shape= total records

In [33]:
# Create a Scaler to Scale Vectors with Multiple Dimensions 
# We do this because our features are in three dimensions and scalers provided by sk learn only supports two dimensions
class MultiDimensionScaler():
    def __init__(self):
        self.scalers = [] #in constructor we initialise an empty list of sscalers
                          #this objects keeps track of all the scalers in the dataset 
#in the fit transform method. We are looping over the third dimension of our data and at each loop we create a new scalar and fit it over that dimension.

    def fit_transform(self , X):
        total_dims = X.shape[2]
        for i in range(total_dims):
            Scaler = MinMaxScaler()
            X[:, :, i] = Scaler.fit_transform(X[:, :, i])
            self.scalers.append(Scaler)
        return X

    def transform(self , X):  #in the transformed method.
                              #We again look over the third dimension, but here we applied are fitted scalars on this data.
        for i in range(X.shape[2]):
            X[:, :, i] = self.scalers[i].transform(X[:,:,i])
        return X 

In [34]:
Feature_Scaler = MultiDimensionScaler()
Xtrain = Feature_Scaler.fit_transform(Xtrain)
Xtest = Feature_Scaler.transform(Xtest)
#Let us now use this to scale our features here.We create an object to our class.
#Next, we call the fit transform method on Xtrain and transform on Xtest.

IndexError: ignored

In [35]:
Target_Scaler = MinMaxScaler()
Ytrain = Target_Scaler.fit_transform(Ytrain.reshape(-1,1))
Ytest = Target_Scaler.transform(Ytest.reshape(-1,1))
#Scale the targets as targets are one dimensional vectors.We convert them into two dimension 
#by using reshape (-1,1)and apply scaling similar to what we did with features.

ValueError: ignored

In [None]:
#two functions save object and load object. These functions are used to save and load python objects using Pickle.
#So let us save our feature and target scalar objects if in case we want to perform any real time predictions.

def save_object(obj , name : str):
    pickle_out = open(f"{name}.pck","wb")
    pickle.dump(obj, pickle_out)
    pickle_out.close()

def load_object(name : str):
    pickle_in = open(f"{name}.pck","rb")
    data = pickle.load(pickle_in)
    return data

In [None]:
# Save your objects for future purposes 
save_object(Feature_Scaler , "Feature_Scaler")
save_object(Target_Scaler , "Target_Scaler")

Model Building

In [None]:
#first callback is model checkpoints 
from tensorflow.keras.callbacks import ModelCheckpoint , ReduceLROnPlateau

save_best = ModelCheckpoint("best_weights.h5", monitor='val_loss', save_best_only=True, save_weights_only=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.25,patience=5, min_lr=0.00001,verbose = 1) #reducing the learning rate to a factor of 4 when we see no improvement in the validation loss
#The next callback is reducelearningrateonPlateau.
#This callback is used to reduce learning rate when there is no improvement in the validation loss.

In [None]:
#building deep learning model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense , Dropout , LSTM , Bidirectional , BatchNormalization

model = Sequential() #keeps track of all the layered objects

model.add(Bidirectional(LSTM(512 ,return_sequences=True , recurrent_dropout=0.1, input_shape=(32, 3)))) #input features is 32 initiating with bidirectional LSTM with 512 units(first layer)
model.add(LSTM(256 ,recurrent_dropout=0.1)) #dropouts in between recurrent steps
#LSTM takes sequence data as input so we set return sequence argument to True 
#this will return a 3D or sequence output from the LSTM layer.
model.add(Dropout(0.3))
model.add(Dense(64 , activation='elu')) #adding more dense layers and dropouts, elu is an activation function
model.add(Dropout(0.3)) 
model.add(Dense(32 , activation='elu')) #ELU is a modified version of leaky relu which looks like Relu and Tanh activation merged together.
model.add(Dense(1 , activation='linear')) #final layer with a neuron which outputs a continuous value 

If you are planning to use 2LSTM layers one after the other then you should set the "return sequence" argument to True in the 1st LSTM layer

In [None]:
optimizer = tf.keras.optimizers.SGD(learning_rate = 0.002) #We compile our model with the optimizer set to "SGD"
model.compile(loss='mse', optimizer=optimizer) #loss here is mean squared error

The reason we chose SGD over adam is due to the problem statement and batch size.

As we are trying to predict a complex pattern, we need frequent updates, that means the batch size has to be low.

####SGD works better than adma when batch size is low

In [None]:
history = model.fit(Xtrain, Ytrain, # we apply fit function by passing training and validation sets
            epochs=10,     #model is training 
            batch_size = 1,
            verbose=1,
            shuffle=False ,
            validation_data=(Xtest , Ytest),
            callbacks=[reduce_lr , save_best]) #passing callbacks as earlier

In [None]:
# Load the best weights that were saved by model checkpoints 
model.load_weights("best_weights.h5")

Visualize prediction on Test Set

In [None]:
#We use the predict method provided by the tensorflow model to do the predictions over the test set.
Predictions = model.predict(Xtest)

In [None]:
#So let us inverse transform the predictions and actual values for comparison.
Predictions = Target_Scaler.inverse_transform(Predictions)
Actual = Target_Scaler.inverse_transform(Ytest)

In [None]:
Predictions.shape
# Here we see that inverse transformed objects have two dimensions where the second dimension is one,
# To convert them into lists. We need to remove the dimension on the first axis.

In [None]:
#Np.Squeeze is used to remove any unwanted axis with value 1
Predictions = np.squeeze(Predictions , axis = 1)
Actual = np.squeeze(Actual , axis = 1)

In [None]:
# Check the Predictions vs Actual
fig = go.Figure()
#plotting a line graph comparing both
fig.add_trace(go.Scatter(x = data.index[-test_length:] , y = Actual , mode = 'lines' , name='Actual'))
fig.add_trace(go.Scatter(x = data.index[-test_length:] , y = Predictions , mode = 'lines' , name='Predicted'))
fig.show()

###We see that the predictions on the test set are following the pattern of actual, but they are not very strongly correlated.

Visualize Prediction on whole data

In [None]:
#performing same exercise on overall dataset
Total_features = np.concatenate((Xtrain , Xtest) , axis = 0)

In [None]:
#First, we create the total features and targets by concatenating, training and testing features and targets, respectively.
Total_Targets = np.concatenate((Ytrain , Ytest) , axis = 0)

In [None]:
#Next, we do the predictions followed by inverse scale and dimension removal.
Predictions = model.predict(Total_features)

In [None]:
Predictions = Target_Scaler.inverse_transform(Predictions)
Actual = Target_Scaler.inverse_transform(Total_Targets)

In [None]:
Predictions = np.squeeze(Predictions , axis = 1)
Actual = np.squeeze(Actual , axis = 1)

In [None]:
# Check the trend in Volume Traded
fig = go.Figure()
#plotting line graph
fig.add_trace(go.Scatter(x = data.index , y = Actual , mode = 'lines' , name='Actual'))
fig.add_trace(go.Scatter(x = data.index , y = Predictions , mode = 'lines' , name='Predicted'))
fig.show()

predictions are quite smooth and they follow the structure of actual values, but they do not seem to capture the granular patterns.

### For better prediction we scrap some extra features to improve the models performance.

In [None]:
# Save and Load the whole model
model.save("Model.h5")
loaded_model = tf.keras.models.load_model("Model.h5")

Realtime Prediction

In [None]:
#we load the model that we saved
def PredictStockPrice(Model , DataFrame , PreviousDate , feature_length = 32):
    idx_location = DataFrame.index.get_loc(PreviousDate)
    Features = DataFrame.iloc[idx_location - feature_length : idx_location,:].values
    Features = np.expand_dims(Features , axis = 0)
    Features = Feature_Scaler.transform(Features)
    Prediction = Model.predict(Features)
    Prediction = Target_Scaler.inverse_transform(Prediction)
    return Prediction[0][0]

In [None]:
#REAL TIME PREDICTION USING LOADED MODEL 
PredictStockPrice(loaded_model , data , '2021-01-14')
#Next, let us try to perform Real-Time prediction using our loaded model here.
#The real time is a scenario where we have already trained our model and now we need to get predictions from our model based on customers input.
#To achieve this, we designed a function that takes in the trained model, total data frame and previous date as argument and 
#returns the next day Predictions as the output.