In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime as dt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amazon-stock-prices-may-2017-to-may-2022/AMZN.csv


### Loading Data and Set-Up

In [2]:
data = pd.read_csv('/kaggle/input/amazon-stock-prices-may-2017-to-may-2022/AMZN.csv')

In [3]:
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2017-06-01,49.929501,49.949501,49.568501,49.797501,49.797501,49096000
1,2017-06-02,49.949501,50.424,49.783501,50.336498,50.336498,75046000
2,2017-06-05,50.3615,50.6605,50.175499,50.567001,50.567001,54398000
3,2017-06-06,50.599998,50.825001,50.0625,50.150002,50.150002,66928000
4,2017-06-07,50.297501,50.512501,50.099998,50.503502,50.503502,56460000


In [4]:
data.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,1259.0,1259.0,1259.0,1259.0,1259.0,1259.0
mean,112.71215,113.96225,111.290146,112.638289,112.638289,86610140.0
std,41.589328,42.096757,41.022137,41.521249,41.521249,41569220.0
min,47.0,47.431499,46.349998,46.93,46.93,17626000.0
25%,82.564251,83.682998,81.300998,82.778748,82.778748,58436000.0
50%,95.899002,96.611504,94.828499,95.449501,95.449501,74752000.0
75%,158.34375,159.924003,156.199997,158.086998,158.086998,102552000.0
max,187.199997,188.654007,184.839493,186.570496,186.570496,331300000.0


## Feature Engineering
- Adding date and time based features
- Adding rolling mean averages for close, high, low, volume
- Adding lag features for close 
- Removing columns from data set to avoid leakage - the purpose of the model is to predict the given day's stock close, and so data from that day (High, Low, etc.) would not be available yet for predictions. 

In [5]:

# adding date based features: day of week, day of month, day of year 

data['Date'] = pd.to_datetime(data['Date'])
data['Day of week'] = data['Date'].dt.weekday
data['Day of month'] = data['Date'].dt.day
data['Month'] = data['Date'].dt.month
data['Year'] = data['Date'].dt.year

# rolling averages for days = 7, 30, 90, 180, 365

data["7d Mean Close"] = data['Close'].rolling(window = 5).mean() # 5 business days = 1 wk
data["30d Mean Close"] = data['Close'].rolling(window = 20).mean() # 20 business days = 1 mo
data["90d Mean Close"] = data['Close'].rolling(window = 60).mean() # 60 business days = 3 mo
data["120d Mean Close"] = data['Close'].rolling(window = 120).mean() #120 business days = 6 mo

# adding lag features: Close on (date - 1), (date-2), 3,4,5,7,14,21,30,60,90,180, 365
data['Close-1'] = data['Close'].shift(periods=1)
data['Close-2'] = data['Close'].shift(periods=2)
data['Close-3'] = data['Close'].shift(periods=3)
data['Close-4'] = data['Close'].shift(periods=4)
data['Close-5'] = data['Close'].shift(periods=5)
data['Close-7'] = data['Close'].shift(periods=7)
data['Close-14'] = data['Close'].shift(periods=14)
data['Close-21'] = data['Close'].shift(periods=21)
data['Close-30'] = data['Close'].shift(periods=30)
data['Close-60'] = data['Close'].shift(periods=60)
data['Close-90'] = data['Close'].shift(periods=90)

# Adding rolling averages for high, low and volume
data["7d Mean High"] = data['High'].rolling(window = 5).mean() # 5 business days = 1 wk
data["30d Mean High"] = data['High'].rolling(window = 20).mean() # 20 business days = 1 mo
data["90d Mean High"] = data['High'].rolling(window = 60).mean() # 60 business days = 3 mo
data["120d Mean High"] = data['High'].rolling(window = 120).mean() #120 business days = 6 mo

data["7d Mean Low"] = data['Low'].rolling(window = 5).mean() # 5 business days = 1 wk
data["30d Mean Low"] = data['Low'].rolling(window = 20).mean() # 20 business days = 1 mo
data["90d Mean Low"] = data['Low'].rolling(window = 60).mean() # 60 business days = 3 mo
data["120d Mean Low"] = data['Low'].rolling(window = 120).mean() #120 business days = 6 mo

data["7d Mean Volume"] = data['Volume'].rolling(window = 5).mean() # 5 business days = 1 wk
data["30d Mean Volume"] = data['Volume'].rolling(window = 20).mean() # 20 business days = 1 mo
data["90d Mean Volume"] = data['Volume'].rolling(window = 60).mean() # 60 business days = 3 mo
data["120d Mean Volume"] = data['Volume'].rolling(window = 120).mean() #120 business days = 6 mo


# Will only train models on data with all features intact, so will exclude the first 120 data points
full_data = data.drop(index = range(119)) 
full_data = full_data.reset_index(drop=True)

#### Dropping Features to Limit Data Leakage

In [6]:
# To avoid leakage must drop features that will be unavailable at time of prediction
X = full_data.drop(columns = ['Close', 'Adj Close', 'Open', 'High', 'Low', 'Volume']) 
X ['index'] = X.index
Y = full_data['Close']


#### Creating a simple XGBoost Regression Model 
This will have a high error value but will serve as a comparison for the cross-validated regression and prophet models

In [7]:
## Creating a sample fold (must allocate indexes manually to prevent shuffling)
X_train = X.iloc[0:800]
X_test = X.iloc[800:1139]
Y_train = Y.iloc[0:800]
Y_test = Y.iloc[800:1139]

# Categorical data must be removed for the XGBregressor, however index will be in place as date (eg. 2017-11-17 = 0)
X_train = X_train.drop(columns = ['Date']).to_numpy()
X_test = X_test.drop(columns = ['Date']).to_numpy()
Y_train = Y_train.drop(columns = ['Date']).to_numpy()
Y_test = Y_test.drop(columns = ['Date']).to_numpy()


In [8]:
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error
model = XGBRegressor()
model.fit(X_train, Y_train)
preds = model.predict(X_test)

mse_xgboost = mean_squared_error (preds, Y_test)
print(mse_xgboost)


76.80376326906087


# Adding Cross Validation using an Expanding Window 
## After creating the folds, will build regressive models
Using an expanding window to train the model - ExpandingWindowSplitter  generates folds across an sliding window. The length of the training series grows with time, with each subsequent fold retaining the full series history up to that point. The testing series remains the same length throughout. 

from sktime.forecasting.model_selection import ExpandingWindowSplitter
cv = ExpandingWindowSplitter(window_length=100, fh=[1, 2, 3])

n_splits = cv.get_n_splits(X)
print(f"Number of Folds = {n_splits}")


# Facebook Prophet Approach 

In [9]:
!pip install prophet;

[0m

In [10]:
from prophet import Prophet
# dropping cols to avoid leakage
full_data_prophet = full_data.drop(columns = ['Adj Close']) 
full_data_prophet.rename(columns={'Date':'ds','Close':'y'},inplace=True)
full_data_prophet['ds'] = pd.to_datetime(full_data_prophet['ds'])


In [11]:
# we will use a rolling 120 day prophet model to predict the following day 
from sklearn.metrics import mean_squared_error
preds_vs_actual = pd.DataFrame(columns = ['Actual', 'Predicted'], index = [0,1,2,3,4,5,6,7,8,9])

for i in range(0,10): #for i in range(0,len(full_data_prophet)):
    rolling_train_data = full_data_prophet.iloc[0+i:240+i,:] #0-120, 1-121, ...
    model = Prophet()
    model.fit(full_data_prophet)
    days_left = len(full_data_prophet) - i 
    future_dates = model.make_future_dataframe(periods=1, freq='D') # MUST CHANGE to get only next day pred
    forecast = model.predict(future_dates)
    preds_vs_actual['Actual'][i] = Y[i]
    preds_vs_actual['Predicted'][i]= forecast['yhat'][0]
    i=i+1
    
preds_vs_actual.head()
mse = mean_squared_error (preds_vs_actual['Actual'], preds_vs_actual['Predicted'])

05:32:55 - cmdstanpy - INFO - Chain [1] start processing
05:32:56 - cmdstanpy - INFO - Chain [1] done processing
05:32:57 - cmdstanpy - INFO - Chain [1] start processing
05:32:57 - cmdstanpy - INFO - Chain [1] done processing
05:32:58 - cmdstanpy - INFO - Chain [1] start processing
05:32:58 - cmdstanpy - INFO - Chain [1] done processing
05:32:59 - cmdstanpy - INFO - Chain [1] start processing
05:32:59 - cmdstanpy - INFO - Chain [1] done processing
05:33:00 - cmdstanpy - INFO - Chain [1] start processing
05:33:01 - cmdstanpy - INFO - Chain [1] done processing
05:33:02 - cmdstanpy - INFO - Chain [1] start processing
05:33:02 - cmdstanpy - INFO - Chain [1] done processing
05:33:03 - cmdstanpy - INFO - Chain [1] start processing
05:33:03 - cmdstanpy - INFO - Chain [1] done processing
05:33:04 - cmdstanpy - INFO - Chain [1] start processing
05:33:05 - cmdstanpy - INFO - Chain [1] done processing
05:33:05 - cmdstanpy - INFO - Chain [1] start processing
05:33:06 - cmdstanpy - INFO - Chain [1]

In [12]:
print(mse)

5.320997580131726
