In [None]:
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

* The following kernel is based on the dataset available from [data](https://www.kaggle.com/rohanrao/nifty50-stock-market-data)
* This dataset is an extensive collection of Stock Market data of Nifty Stocks aggregated from (2000 - 2019) [Nifty 50](https://en.wikipedia.org/wiki/NIFTY_50)  
* The NIFTY 50 index is the National Stock Exchange of India's benchmark broad-based stock market index for the Indian equity market. The full form of NIFTY is National Index Fifty.


***What is the purpose of this Kernel***

* This kernel is an attempt to perform analysis on how the stock data has been behaving over the years across various stocks
* We will also look at a few visualizations, which make the data easier to understand as graphs than as raw numbers

In [None]:
!pip install pmdarima
#Make sure you have enabled internet while running this inside Kaggle Kernel

In [None]:
##Importing the most frequent libraries used
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np  # linear algebra
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress every warning category for the rest of the session. This keeps cell output
# tidy, but it also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# Default figure size (width, height) in inches for every plot in this notebook.
# rcParams is imported from the top-level matplotlib package: matplotlib.pylab is a
# legacy convenience namespace whose use is discouraged.
from matplotlib import rcParams
rcParams['figure.figsize'] = (20, 10)

In [None]:
# Scaler that rescales each feature into the range [0, 1] before modelling.
# NOTE(review): an identical scaler is created again in the KNN section before this
# one is ever used.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))

**Understanding the columns available in the dataset**

* Date - The date on which the data is recorded; weekends and national/public holidays are typically absent
* Symbol - The short text by which the Company/Stock is identified in Nifty (basically kind of primary key)
* Series - This indicates which series it belongs to [Series](https://help.tradesmartonline.in/what-does-eq-and-be-series-stand-for-in-nse/)
* Prev Close - The closing price of the preceding time period
* Open - The opening price of the stock 
* High - The highest price of the stock on that particular time period 
* Low - The lowest price of the stock on that particular time period
* Last - [What is Last](https://www.sapling.com/8101485/last-mean-stocks)
* Close - The closing price of the stock
* Vwap - The volume weighted average price (VWAP) is a trading benchmark used by traders that gives the average price a security has traded at throughout the day, based on both volume and price. [What is Vwap](https://www.investopedia.com/terms/v/vwap.asp)
* Volume - Volume is the number of shares or contracts traded in a security or an entire market during a given period of time. [What is Volume](https://www.investopedia.com/terms/v/volume.asp)
* Turnover - Share turnover is a measure of stock liquidity calculated by dividing the total number of shares traded over a period by the average number of shares outstanding for the period [What is Turnover](https://www.investopedia.com/terms/s/shareturnover.asp)
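* Trades - The number of trades executed in the period; a trade in the stock market is the transfer of shares from a seller to a buyer in exchange for money
* Deliverable Volume - The number of shares that were actually delivered (transferred to the buyer's account) rather than bought and sold again within the same day
* % Deliverable - Deliverable Volume expressed as a proportion of the total Volume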

1. (**Please let me know if I have misunderstood any of the columns; all of these descriptions were gathered from the Internet, as I am not very familiar with the world of stocks**)

***How to analyse this data ?***

As seen from the data, there is a time-dependent column, so this dataset is best suited to time series analysis: the time period will be the primary dimension of our analysis.

Inspiration for this kernel (https://www.kaggle.com/rohanrao/a-modern-time-series-tutorial) and (https://www.kaggle.com/parulpandey/getting-started-with-time-series-using-pandas)

### **Data Preparation**

In [None]:
# Load the price history of a single stock (MARUTI); the whole analysis below uses it.
data = pd.read_csv("/kaggle/input/nifty50-stock-market-data/MARUTI.csv")
data.head()

In [None]:
# Create an empty dataframe that will collect the actual values and every model's forecast
plot_df = pd.DataFrame()

In [None]:
data.shape
# (rows, columns) of the raw file: 4098 rows and 15 columns when this kernel was written

In [None]:
# Show the available columns, their data types and their non-null counts
data.info()

In [None]:
# Count the missing values in each column: there are not many, but some are present.
data.isna().sum()

We can take the Vwap as our target variable 

In [None]:
# 'Trades' has a considerable number of missing values and is not used below, so it is
# dropped first; any remaining rows that still contain nulls are then discarded.
# Written without inplace=True and with errors='ignore' so the cell can be re-run:
# an in-place drop raises KeyError on a second run because 'Trades' is already gone.
data = data.drop(columns=['Trades'], errors='ignore').dropna()

In [None]:
# Index the frame by trading date; drop=False keeps 'Date' available as a column too.
data = data.set_index("Date", drop=False)
data.head()

In [None]:
# Shape after removing the 'Trades' column and the rows containing nulls.
data.shape

* Throughout this kernel we will use the same y-axis ticks, `np.arange(100, 10000, 1000)` (from 100 up to 10000 in steps of 1000), so that the plots can be compared with one another

In [None]:
# VWAP over the full history; the series shows an increasing trend over time.
data["VWAP"].plot()
plt.yticks(np.arange(100, 10000, 1000))

In [None]:
# Overlay the daily price columns on one chart, using the common y-axis ticks.
data.plot(y=['Open', 'Close', 'VWAP', 'High', 'Low'])
plt.yticks(np.arange(100, 10000, 1000))

In [None]:
# Visualise the pairwise correlation between the numeric columns.
# The frame also holds text columns (Symbol, Series and, at this point, the still
# unparsed Date). Recent pandas versions raise on those instead of silently skipping
# them, so the numeric columns are selected explicitly before calling corr().
corr = data.select_dtypes(include="number").corr()
sns.heatmap(corr)

In [None]:
# Parse the date strings and derive calendar features from them.
data["Date"] = pd.to_datetime(data["Date"], format="%Y-%m-%d")
data["month"] = data["Date"].dt.month
# Series.dt.week was deprecated and later removed from pandas; isocalendar().week is the
# replacement. It returns a nullable UInt32 column, so cast back to the plain int64
# dtype that .dt.week used to produce (keeps the downstream scalers/models happy).
data["week"] = data["Date"].dt.isocalendar().week.astype("int64")
data["day"] = data["Date"].dt.day
data["day_of_week"] = data["Date"].dt.dayofweek
data.head()

In [None]:
# Chronological (not random) split, because the analysis depends on time:
# everything before 2019 is used for training, 2019 onwards for validation.
data_train = data.loc[data["Date"] < "2019"]
data_valid = data.loc[data["Date"] >= "2019"]

# **ARIMA**
https://www.analyticsvidhya.com/blog/2018/08/auto-arima-time-series-modeling-python-r/

In [None]:
from pmdarima import auto_arima

# Search for a seasonal ARIMA model of the training VWAP: p and q start at 1 and go up
# to 3, with first-order differencing (d=1), seasonal differencing (D=1) and a seasonal
# period of m=12. trace=True prints each candidate that is tried.
model_ARIMA = auto_arima(
    data_train.VWAP,
    start_p=1, start_q=1, max_p=3, max_q=3,
    d=1, D=1, start_P=0,
    seasonal=True, m=12,
    trace=True, error_action='ignore', suppress_warnings=True,
)
# auto_arima returns the selected model already fitted on data_train.VWAP, so no
# separate fit() call is made here; refitting would only repeat the expensive estimation.

# Forecast one value per row of the validation period.
forecast_ARIMA = model_ARIMA.predict(n_periods=len(data_valid))


In [None]:
# The actual validation VWAP defines plot_df's index (the trading dates).
plot_df['VWAP'] = data_valid['VWAP']
# Assign the forecast by position. Depending on the pmdarima version, predict() returns
# either a NumPy array or a pandas Series with its own index; a Series would be aligned
# on index labels and, since they do not match the dates, become a column of NaN.
plot_df['Forecast_ARIMAX'] = np.asarray(forecast_ARIMA)
plot_df[["VWAP", "Forecast_ARIMAX"]].plot()
plt.yticks(np.arange(100, 10000, 1000))

# **KNN**
https://www.analyticsvidhya.com/blog/2018/08/k-nearest-neighbor-introduction-regression-python/

In [None]:
# Libraries for the KNN model: the regressor itself and a grid search over its parameters
from sklearn import neighbors
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
# Fresh scaler mapping every feature into [0, 1]; replaces the one created earlier.
scaler = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Features: every column except the target (VWAP) and the non-numeric identifiers.
# Target: VWAP.
x_train = data_train.drop(columns=['VWAP', 'Date', 'Symbol', 'Series'])
y_train = data_train.VWAP
x_valid = data_valid.drop(columns=['VWAP', 'Date', 'Symbol', 'Series'])
y_valid = data_valid.VWAP

In [None]:
# Scale the features to [0, 1].
# The scaler is fitted on the training data only and the same fitted transform is then
# applied to the validation data. Re-fitting on the validation set (fit_transform) would
# map the two sets onto different scales and leak validation statistics into the
# evaluation. As a consequence, validation values may fall outside [0, 1].
x_train_scaled = scaler.fit_transform(x_train)
x_train = pd.DataFrame(x_train_scaled)
x_valid_scaled = scaler.transform(x_valid)
x_valid = pd.DataFrame(x_valid_scaled)

# Grid search (5-fold cross-validation) over the number of neighbours
params = {'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9]}
knn = neighbors.KNeighborsRegressor()
model_knn = GridSearchCV(knn, params, cv=5)

# Fit the search on the training data and predict the validation period
model_knn.fit(x_train, y_train)
forecast_knn = model_knn.predict(x_valid)

In [None]:
# Store the KNN forecast next to the actual values and compare them on one chart.
plot_df["Forecast_KNN"] = forecast_knn
plot_df.plot(y=["VWAP", "Forecast_KNN"])
plt.yticks(np.arange(100, 10000, 1000))

# **Linear Regression**
https://www.analyticsvidhya.com/blog/2017/06/a-comprehensive-guide-for-linear-ridge-and-lasso-regression/

In [None]:
# Ordinary least-squares regression on the (scaled) training features.
from sklearn.linear_model import LinearRegression

model_lin = LinearRegression().fit(x_train, y_train)
model_lin

In [None]:
# Predict the validation-period VWAP with the fitted linear model.
preds_lin = model_lin.predict(x_valid)

In [None]:
# Store the linear-regression forecast and compare it with the actual VWAP.
plot_df["Forecast_lin"] = preds_lin
plot_df[["VWAP", "Forecast_lin"]].plot()
# Use the same y-axis ticks as every other plot in the notebook so that the charts
# can be compared directly (this call was missing for this plot only).
plt.yticks(np.arange(100, 10000, 1000))

# **Prophet**
https://facebook.github.io/prophet/docs/quick_start.html#python-api

In [None]:
# The library was renamed from `fbprophet` to `prophet` in its 1.0 release; try the
# current name first and fall back to the old one so either installation works.
try:
    from prophet import Prophet
except ImportError:
    from fbprophet import Prophet

In [None]:
# Reload the raw MARUTI file so Prophet starts from the unmodified columns.
# NOTE(review): unlike `data`, this copy is not passed through dropna(), so its rows
# may not match plot_df one-to-one; confirm before assigning forecasts by position.
data_prophet = pd.read_csv("/kaggle/input/nifty50-stock-market-data/MARUTI.csv")
data_prophet.head()

In [None]:
# Same chronological split as before: rows dated before 2019 train the model,
# rows from 2019 onwards are held out for validation.
data_train_p = data_prophet.loc[data_prophet["Date"] < "2019"]
data_valid_p = data_prophet.loc[data_prophet["Date"] >= "2019"]

In [None]:
# Fit Prophet on the training period; it expects the columns to be called ds and y.
model_fbp = Prophet()
model_fbp.fit(
    data_train_p.loc[:, ["Date", "VWAP"]].rename(columns={"Date": "ds", "VWAP": "y"})
)

In [None]:
# Forecast the validation dates and keep only the point prediction (yhat) as an array.
forecast_prophet = model_fbp.predict(
    data_valid_p.loc[:, ["Date", "VWAP"]].rename(columns={"Date": "ds"})
)
preds_prophet = forecast_prophet["yhat"].to_numpy()

In [None]:
# Attach the Prophet forecast to plot_df by date rather than by position.
# data_prophet was reloaded from the CSV without the null-row filtering applied to
# `data`, so the two validation sets are not guaranteed to contain the same rows; a
# positional assignment would then fail with a length mismatch. The date strings of
# data_valid_p match plot_df's index labels, so pandas aligns the values on them.
plot_df["Forecast_prophet"] = pd.Series(preds_prophet, index=data_valid_p["Date"].values)
plot_df[["VWAP", "Forecast_prophet"]].plot()
plt.yticks(np.arange(100, 10000, 1000))

In [None]:
# Fresh copy of the raw MARUTI data (still including 'Trades') for the Prophet model
# with engineered lag features.
new_data = pd.read_csv("/kaggle/input/nifty50-stock-market-data/MARUTI.csv")
new_data.head()

* Below feature engineering I have learned from the kernel (https://www.kaggle.com/rohanrao/a-modern-time-series-tutorial)
* Credits to : Vopani

In [None]:
new_data.reset_index(drop=True, inplace=True)
lag_features = ["High", "Low", "Volume", "Turnover", "Trades"]
window1 = 3
window2 = 7
window3 = 30

# For each window size, compute the rolling mean and standard deviation of the lag
# features, then shift the result down by one row so that a row's features are built
# only from the rows before it.
lag_means = {}
lag_stds = {}
for window in (window1, window2, window3):
    rolled = new_data[lag_features].rolling(window=window, min_periods=0)
    lag_means[window] = rolled.mean().shift(1).astype(np.float32)
    lag_stds[window] = rolled.std().shift(1).astype(np.float32)

# Add the columns as <feature>_mean_lag<window> / <feature>_std_lag<window>.
for feature in lag_features:
    for window in (window1, window2, window3):
        new_data[f"{feature}_mean_lag{window}"] = lag_means[window][feature]
    for window in (window1, window2, window3):
        new_data[f"{feature}_std_lag{window}"] = lag_stds[window][feature]

# Fill the remaining gaps (raw nulls and the undefined first rows of each window) with
# the column mean. numeric_only=True is required because the frame also contains text
# columns, for which recent pandas versions raise instead of silently skipping them.
new_data.fillna(new_data.mean(numeric_only=True), inplace=True)

# Index by the (still string-typed) date, keeping 'Date' as a regular column as well.
new_data.set_index("Date", drop=False, inplace=True)
new_data.head()

In [None]:
# Parse the date strings and derive the calendar features used as extra regressors.
new_data["Date"] = pd.to_datetime(new_data["Date"], format="%Y-%m-%d")
new_data["month"] = new_data["Date"].dt.month
# Series.dt.week was deprecated and later removed from pandas; isocalendar().week is the
# replacement. It returns a nullable UInt32 column, so cast back to the plain int64
# dtype that .dt.week used to produce.
new_data["week"] = new_data["Date"].dt.isocalendar().week.astype("int64")
new_data["day"] = new_data["Date"].dt.day
new_data["day_of_week"] = new_data["Date"].dt.dayofweek
new_data.head()

In [None]:
# Chronological split of the feature-enriched data: before 2019 for training,
# 2019 onwards for validation.
new_data_train = new_data.loc[new_data["Date"] < "2019"]
new_data_valid = new_data.loc[new_data["Date"] >= "2019"]

# Names of the extra regressors: for each window (3, 7, 30) the rolling mean and std of
# every lag feature, followed by the four calendar features.
exogenous_features = [
    f"{column}_{stat}_lag{window}"
    for window in (3, 7, 30)
    for column in ("High", "Low", "Volume", "Turnover", "Trades")
    for stat in ("mean", "std")
] + ["month", "week", "day", "day_of_week"]

In [None]:
# Prophet model that also uses the lag and calendar features as extra regressors.
# Each regressor has to be registered before the model is fitted.
model_fbp_features = Prophet()
for feature in exogenous_features:
    model_fbp_features.add_regressor(feature)

# Prophet expects the date column to be called ds and the target y.
model_fbp_features.fit(new_data_train[["Date", "VWAP"] + exogenous_features].rename(columns={"Date": "ds", "VWAP": "y"}))

forecast_prophet_features = model_fbp_features.predict(new_data_valid[["Date", "VWAP"] + exogenous_features].rename(columns={"Date": "ds"}))
# Attach the forecast to plot_df by date rather than by position: new_data was reloaded
# without the null-row filtering applied to `data`, so the two validation sets are not
# guaranteed to contain the same rows. new_data_valid is indexed by the same date
# strings as plot_df, so pandas aligns the values on them.
plot_df["Forecast_Prophet_features"] = pd.Series(
    forecast_prophet_features.yhat.values, index=new_data_valid.index
)

In [None]:
# Actual VWAP against the forecast of the Prophet model with extra regressors.
plot_df.plot(y=["VWAP", "Forecast_Prophet_features"])
plt.yticks(np.arange(100, 10000, 1000))

In [None]:
# Overall comparison: the actual VWAP together with the forecast of every model.
plot_df.plot(
    y=[
        "VWAP",
        "Forecast_ARIMAX",
        "Forecast_KNN",
        "Forecast_lin",
        "Forecast_prophet",
        "Forecast_Prophet_features",
    ]
)
plt.yticks(np.arange(100, 10000, 1000))

# Learned from Kernels 
* https://www.kaggle.com/rohanrao/a-modern-time-series-tutorial
* Blog : https://www.analyticsvidhya.com/blog/2018/10/predicting-stock-price-machine-learningnd-deep-learning-techniques-python/
* Like this kernel? An upvote would help keep me motivated
* Have a suggestion, found a mistake in the code or in any of the terms, or know a better way to do this?
* Please let me know in the comments