In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime as dt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)
        
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Introduction 
The purpose of this notebook is to create models to predict Amazon stock prices up to 20 business days (~1 month) into the future. 

The data was pulled from Yahoo Finance, and all AMZN stock prices are relative to the June 2022 20:1 stock split. Data source: https://ca.finance.yahoo.com/quote/AMZN/history?p=AMZN


In this notebook, I build two models and compare their accuracy:
- Rolling Regressive XGBoost model 
- Facebook Prophet model

As shown below, the rolling regressive model achieves the lower error. Note, however, that Prophet is used here in a deliberately limited configuration so that the comparison is fair (matching the 120 training days used by the XGBoost model).

### Loading Data and Set-Up

In [None]:
# Read the raw AMZN price history from the attached Kaggle dataset.
amzn_csv_path = '/kaggle/input/amazon-stock-prices-may-2017-to-may-2022/AMZN.csv'
data = pd.read_csv(amzn_csv_path)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the loaded frame.
data.describe()

## Feature Engineering
- Adding date and time based features
- Adding rolling means for close, high, low, volume
- Adding lag features for close 
- Removing columns from data set to avoid leakage - the purpose of the model is to predict the given day's stock close, and so data from that day (High, Low, etc.) would not be available yet for predictions. 

In [None]:

# Feature engineering for predicting a given day's Close.
#
# Every price/volume-derived feature is computed from *previous* rows only:
# the current row's Close/High/Low/Volume are not known when that day's close
# is being predicted, so including them (even inside a rolling mean) would
# leak the target. Assumes rows are sorted by date ascending -- TODO confirm.

# Window lengths in rows (trading days), keyed by the label used in the column
# names. Labels are unchanged from the original notebook: 5/20/60 trading days
# approximate 7/30/90 calendar days, while '120d' is a 120-trading-day window.
ROLLING_WINDOWS = {'7d': 5, '30d': 20, '90d': 60, '120d': 120}

# Lags of Close, in rows (trading days).
CLOSE_LAGS = (1, 2, 3, 4, 5, 7, 14, 21, 30, 60, 90)


def add_rolling_means(frame, source_col):
    """Add '<label> Mean <source_col>' columns to ``frame`` in place.

    Each column is the mean of ``source_col`` over the ``window`` rows that
    precede the current row (the current row itself is excluded via shift(1)).
    """
    prior_values = frame[source_col].shift(1)
    for label, window in ROLLING_WINDOWS.items():
        frame[f'{label} Mean {source_col}'] = prior_values.rolling(window=window).mean()


# Date based features: day of week, day of month, month, year
data['Date'] = pd.to_datetime(data['Date'])
data['Day of week'] = data['Date'].dt.weekday
data['Day of month'] = data['Date'].dt.day
data['Month'] = data['Date'].dt.month
data['Year'] = data['Date'].dt.year

# Rolling means of Close, then Close lags, then rolling means of High / Low /
# Volume (column order kept as in the original notebook).
add_rolling_means(data, 'Close')

for lag in CLOSE_LAGS:
    data[f'Close-{lag}'] = data['Close'].shift(periods=lag)

for source_col in ('High', 'Low', 'Volume'):
    add_rolling_means(data, source_col)

# Only rows with every feature populated are used for modelling. A shifted
# rolling mean of window w is first defined at row w, and a lag of k at row k,
# so the warm-up period is the largest of those values.
warmup_rows = max(max(ROLLING_WINDOWS.values()), max(CLOSE_LAGS))
full_data = data.iloc[warmup_rows:].reset_index(drop=True)

#### Dropping Features to Limit Data Leakage

In [None]:
# Feature matrix: drop the columns that are not available when the day's close
# is being predicted (the target itself and the same-day price/volume fields).
X = full_data.drop(columns = ['Close', 'Adj Close', 'Open', 'High', 'Low', 'Volume'])
# Target: the day's closing price.
Y = full_data['Close']
# Use the row position as a numeric stand-in for the date.
X['index'] = X.index
# XGBoost cannot consume datetime columns, so 'Date' is removed before
# converting to NumPy. Y is a Series holding only Close, so it has no 'Date'
# to drop (the original Series.drop(columns=...) call was a no-op).
X_no_dates = X.drop(columns = ['Date']).to_numpy()
Y_no_dates = Y.to_numpy()


## XG Boost Approach

#### Creating a simple XGBoost Regression Model 
This will have a high error value but will serve as a comparison for the cross-validated regression and Prophet models

In [None]:
## Creating a manual chronological split (indexes are allocated by position so
## that rows are never shuffled): the first TRAIN_ROWS rows train the model and
## every remaining row is held out for testing.
TRAIN_ROWS = 800

# Open-ended test slices: a hard-coded upper bound would silently drop trailing
# rows and break whenever the dataset length changes.
X_train, X_test = X_no_dates[:TRAIN_ROWS], X_no_dates[TRAIN_ROWS:]
Y_train, Y_test = Y_no_dates[:TRAIN_ROWS], Y_no_dates[TRAIN_ROWS:]


In [None]:
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error
# Baseline: a single XGBoost regressor with default hyper-parameters, fitted
# on the manual train split and scored on the held-out rows.
model = XGBRegressor()
model.fit(X_train, Y_train)
preds = model.predict(X_test)

# Arguments passed as (y_true, y_pred), matching the rolling-window cell.
mse_xgboost = mean_squared_error(y_true=Y_test, y_pred=preds)
print(mse_xgboost)


### Adding Cross Validation using a Rolling 120-day Window 
Using a rolling window to train the model. The length of the training series remains constant, with each subsequent fold retaining the most recent 120 business days up to that point. The testing series also remains the same length throughout, predicting 20 business days into the future.

In [None]:
from sklearn.metrics import mean_squared_error
from statistics import mean

# Rolling-window evaluation: train on TRAIN_DAYS consecutive rows, test on the
# TEST_DAYS rows that follow, then slide the whole window forward by TEST_DAYS.
TRAIN_DAYS = 120
TEST_DAYS = 20
WINDOW_DAYS = TRAIN_DAYS + TEST_DAYS

all_mse = []

# The previous version wrapped this loop in `for ... in tscv.split(X)`, but
# `tscv` is never defined in the notebook (NameError on a fresh kernel) and the
# inner loop already visits every window, so the wrapper has been removed.
# NOTE(review): the bound reproduces the original `i + 140 < len(X)` condition,
# which skips a final window ending exactly on the last row; kept so the set of
# folds is unchanged.
for i in range(0, len(X_no_dates) - WINDOW_DAYS, TEST_DAYS):
    train_end = i + TRAIN_DAYS
    test_end = i + WINDOW_DAYS
    X_train, X_test = X_no_dates[i:train_end], X_no_dates[train_end:test_end]
    Y_train, Y_test = Y_no_dates[i:train_end], Y_no_dates[train_end:test_end]

    model = XGBRegressor()
    model.fit(X_train, Y_train)
    preds = model.predict(X_test)

    mse = mean_squared_error(Y_test, preds)
    all_mse.append(mse)

print("Mean Squared Error for Rolling XGBoost: ", mean(all_mse))
    






# Facebook Prophet Approach 

In [None]:
!pip install prophet;

In [None]:
from prophet import Prophet
# Build the Prophet input frame: remove 'Adj Close' and rename the date and
# close columns to 'ds' and 'y'.
full_data_prophet = (
    full_data
    .drop(columns=['Adj Close'])
    .rename(columns={'Date': 'ds', 'Close': 'y'})
)
# Make sure 'ds' is a datetime column.
full_data_prophet['ds'] = pd.to_datetime(full_data_prophet['ds'])


In [None]:
# Results table for the Prophet model: 'Actual' holds the observed closes
# (the 'y' column); 'Predicted' starts out empty and is populated by the
# forecasting loop in a later cell.
from sklearn.metrics import mean_squared_error
preds_vs_actual = pd.DataFrame(columns = ['Actual', 'Predicted'])
preds_vs_actual ['Actual'] = full_data_prophet['y']



In [None]:
import warnings
# Silence warnings raised while the models are being fitted.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

# Rolling-window evaluation for Prophet, using the same scheme as the rolling
# XGBoost model: fit on TRAIN_DAYS consecutive rows, forecast the HORIZON_DAYS
# rows that follow, then slide the window forward by HORIZON_DAYS.
#
# The previous version refit Prophet on the *entire* dataset in every
# iteration and scored in-sample 'trend' values, so the model had already seen
# the prices it was being evaluated on. Here each model only sees its own
# training window and is scored on its out-of-sample 'yhat' forecast.
# NOTE: the Prophet MSE quoted in the conclusion was produced by the earlier
# in-sample approach and must be regenerated after running this cell.
TRAIN_DAYS = 120
HORIZON_DAYS = 20
WINDOW_DAYS = TRAIN_DAYS + HORIZON_DAYS

fold_frames = []

# Same fold boundaries as the rolling XGBoost cell (`i + 140 < len`).
for i in range(0, len(full_data_prophet) - WINDOW_DAYS, HORIZON_DAYS):
    train_end = i + TRAIN_DAYS
    test_end = i + WINDOW_DAYS
    train_window = full_data_prophet.iloc[i:train_end][['ds', 'y']].reset_index(drop=True)
    test_window = full_data_prophet.iloc[train_end:test_end]

    # A fresh model per fold: a Prophet instance can only be fitted once.
    model = Prophet()
    model.fit(train_window)
    # Forecast exactly the held-out trading dates.
    forecast = model.predict(test_window[['ds']])

    fold_frames.append(
        pd.DataFrame(
            {
                'Actual': test_window['y'].to_numpy(),
                'Predicted': forecast['yhat'].to_numpy(),
            },
            index=test_window.index,
        )
    )

if not fold_frames:
    raise ValueError(
        f"Need more than {WINDOW_DAYS} rows to build a rolling Prophet forecast; "
        f"got {len(full_data_prophet)}."
    )

# One row per forecast day; rows that were never forecast (e.g. the first
# training window) are not included, so the frame contains no missing values.
preds_vs_actual = pd.concat(fold_frames)

    

In [None]:
# Display the comparison table with the notebook's rich (HTML) rendering
# instead of plain-text print().
preds_vs_actual

In [None]:
# Mean squared error between the observed closes and the Prophet predictions.
actual_close = preds_vs_actual['Actual']
prophet_predicted = preds_vs_actual['Predicted']
mse = mean_squared_error(actual_close, prophet_predicted)
print(mse)

## Conclusion

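Both models predicted 20 business days into the future, and Facebook Prophet (MSE = 77.25) performed significantly worse than an XGBoost model (MSE = 33.30) with a 120 day rolling training window. Note that the XGBoost features include lagged actual closes (for example the previous day's close) for every row of each 20-day test block, so its predictions benefit from information that a true 20-day-ahead forecast would not have; the comparison should be read with that caveat in mind.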