In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime 
from datetime import date
import math
import pandas_datareader as web
import os

In [80]:
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
import plotly.express as px
import plotly.graph_objects as go
import plotly.express as px

In [81]:
df = pd.read_csv('NFLX.csv')
df = df.sort_index(ascending=True, axis=0)
df.head()



Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2002-05-23,1.156429,1.242857,1.145714,1.196429,1.196429,104790000
1,2002-05-24,1.214286,1.225,1.197143,1.21,1.21,11104800
2,2002-05-28,1.213571,1.232143,1.157143,1.157143,1.157143,6609400
3,2002-05-29,1.164286,1.164286,1.085714,1.103571,1.103571,6757800
4,2002-05-30,1.107857,1.107857,1.071429,1.071429,1.071429,10154200


In [82]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5044 entries, 0 to 5043
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       5044 non-null   object 
 1   Open       5044 non-null   float64
 2   High       5044 non-null   float64
 3   Low        5044 non-null   float64
 4   Close      5044 non-null   float64
 5   Adj Close  5044 non-null   float64
 6   Volume     5044 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 276.0+ KB


In [83]:
df.columns


Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')

In [84]:
df.describe()


Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,5044.0,5044.0,5044.0,5044.0,5044.0,5044.0
mean,116.467981,118.237392,114.578719,116.456338,116.456338,16530680.0
std,168.740876,171.140864,166.119837,168.668232,168.668232,19244870.0
min,0.377857,0.410714,0.346429,0.372857,0.372857,285600.0
25%,3.955357,4.031072,3.885357,3.958571,3.958571,6168225.0
50%,25.819285,26.342143,25.455,25.802856,25.802856,10591350.0
75%,155.762497,157.654998,153.944996,155.869999,155.869999,19865520.0
max,692.349976,700.98999,686.090027,691.690002,691.690002,323414000.0


In [85]:
df.shape

(5044, 7)

In [86]:
df.isnull().any()

Date         False
Open         False
High         False
Low          False
Close        False
Adj Close    False
Volume       False
dtype: bool

# Time series graph of data

In [87]:
fig = px.line(df, x='Date', y='Close')
fig.show()

In [88]:
# Taking diff indicators for prediction
# ohlc_avg is the average of open, high, low, close values
# hlc_avgs is the average of high, low, close value
# we will take only ohlc_avg data only in whole nb 
ohlc_data = df.iloc[:, 1:5]
ohlc_avg = ohlc_data.mean(axis=1)
hlc_avg = df[['High', 'Low', 'Close']].mean(axis=1)
close = df.Close

In [89]:
fig1 = go.Figure()

fig1.add_trace(go.Scatter(x = df.index, y = ohlc_avg,
                  name='OHLC avg'))
fig1.add_trace(go.Scatter(x = df.index, y = hlc_avg,
                  name='HLC avg'))
fig1.add_trace(go.Scatter(x = df.index, y = close,
                  name='close column data'))
fig1.show()
     

In [119]:
if not os.path.exists("images"):
    os.mkdir("images")
     

In [120]:
!pip install -U kaleido




In [94]:
new_data = pd.DataFrame(index=range(0,len(df)), columns=['Date', 'ohlc_avg'])
for i in range(0, len(df)):
  new_data['Date'][i] = df['Date'][i]
  new_data['ohlc_avg'][i] = ohlc_avg[i]

In [96]:
new_data.head()

Unnamed: 0,Date,ohlc_avg
0,2002-05-23,1.185357
1,2002-05-24,1.211607
2,2002-05-28,1.19
3,2002-05-29,1.129464
4,2002-05-30,1.089643


In [97]:
new_data.index = new_data.Date
new_data.drop('Date', axis=1, inplace=True)

In [98]:
print(len(new_data))

5044


In [99]:
ds = new_data.values

In [100]:
train = int(len(new_data)*0.8)
test = len(new_data) - train
train, test = new_data.iloc[0:train,:], new_data.iloc[train:len(new_data),:]

In [101]:
train.shape

(4035, 1)

In [102]:
test.shape

(1009, 1)

# Normalizing data

In [103]:
# we have normalize the data cuz data is like 149...., 488..something like that
# so we have to normalize betwwen 0 and 1
scalar = MinMaxScaler(feature_range=(0, 1))
scaled_data = scalar.fit_transform(ds)

# splitting the data into x_train, y_train

In [104]:
# splitting the data to x_train, y_train
# we will first train upto 60 and then predict on 61 and then 
# we will train from 61 to 120 then predict on 121 likewise we will go
x_train, y_train = [], []
for i in range(60, len(train)):
  x_train.append(scaled_data[i-60:i,0])
  y_train.append(scaled_data[i,0])

x_train, y_train = np.array(x_train), np.array(y_train)

In [105]:
# now we have reshape the array to 3-d to pass the data into lstm [number of samples, time steps/batch_size, features] 
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))

In [106]:
x_train.shape

(3975, 60, 1)

# Modelling

In [108]:
from keras.layers import Activation
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation

In [109]:
# create and fit the lstm network
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1], 1)))
model.add(Dropout(0.25))
model.add(LSTM(units=50))
model.add(Dense(1))
model.add(Activation('linear'))
model.compile(loss='mean_squared_error', optimizer='adam')

In [110]:
model.fit(x_train, y_train, epochs=50, batch_size=32, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x288ee22a150>

# Prediction

In [111]:
# predicting 920 values, using past 60 from the train data
inputs = new_data[len(new_data)-len(test) - 60:].values
inputs = inputs.reshape(-1,1)
inputs = scalar.transform(inputs)
     

In [112]:
inputs.shape

(1069, 1)

In [113]:
x_test = []
for i in range(60,inputs.shape[0]):
    x_test.append(inputs[i-60:i,0])
x_test = np.array(x_test)
     

In [114]:
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))

In [115]:
predicted_price = model.predict(x_test)
# inverse transform for getting back all normal values from scaled values
predicted_price = scalar.inverse_transform(predicted_price)
     



In [116]:
rms=np.sqrt(np.mean(np.power((test-predicted_price),2)))
rms

19.114745452742014

In [117]:
# create a new column of predicted values
test['Prediction'] = predicted_price
test.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0_level_0,ohlc_avg,Prediction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-06-04,360.75,359.396851
2018-06-05,365.089996,361.685974
2018-06-06,367.059998,365.425385
2018-06-07,364.110001,369.048676
2018-06-08,359.317505,370.066101


In [118]:
# Graph for comparing the results of model predicted and original value
fig2 = go.Figure()

fig2.add_trace(go.Scatter(x = train.index, y = train.ohlc_avg,
                  name='train'))
fig2.add_trace(go.Scatter(x = test.index, y = test.ohlc_avg,
                  name='test_ohlc_avg'))
fig2.add_trace(go.Scatter(x = test.index, y = test.Prediction,
                  name='test'))
fig2.show()

In [122]:
fig3 = px.line(df, x='Date', y='Close')
fig3.show()

As we can see from the above graph that there is very less rms loss is there between predicted and original data