In [88]:
#This is the implementation of the prophet model on our dataset
import pandas as pd
from prophet import Prophet

# Load data
df = pd.read_csv("archive/Stocks/aapl.us.txt")

# Rename columns to Prophet's expected names
df = df.rename(columns={"Date": "ds", "Close": "y"})

import datetime

# convert dates from string to numerical lowers MAE and MSE
df['ds'] = pd.to_datetime(df['ds'])
df['ds'] = df['ds'].apply(lambda x: x.timestamp())

In [89]:
#Use Random Forest Regression to find most important categories
from sklearn.ensemble import RandomForestRegressor

X = df.drop('y', axis=1)
y = df[:]['y']

# Train model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X, y)

# Determine feature importances
importances = rf.feature_importances_

# Print feature importances
for feature, importance in zip(X.columns, importances):
    print(feature, ':', importance)

ds : 0.008569638252394117
Open : 0.013539909786385751
High : 0.40417756938062516
Low : 0.5737040218033235
Volume : 8.860777271552187e-06
OpenInt : 0.0


In [90]:
#experiment with dropping features of low importance
X = X[['ds', 'Volume']]
print(X)

                ds    Volume
0     4.633632e+08  23220030
1     4.636224e+08  18022532
2     4.637088e+08  42498199
3     4.637952e+08  37125801
4     4.638816e+08  57822062
...            ...       ...
8359  1.509926e+09  34901241
8360  1.510013e+09  24424877
8361  1.510099e+09  24451166
8362  1.510186e+09  29533086
8363  1.510272e+09  25130494

[8364 rows x 2 columns]


In [114]:
#split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=42)

train = X_train
train['y'] = y_train

# Create future dataframe for dates in test data
test = pd.DataFrame({'ds': X_test['ds']})
print(test)

                ds
2412  7.642944e+08
6829  1.317946e+09
6592  1.288310e+09
5786  1.187568e+09
5391  1.138061e+09
...            ...
7844  1.445386e+09
459   5.206464e+08
7790  1.438733e+09
948   5.817312e+08
3700  9.253440e+08

[5019 rows x 1 columns]


In [115]:
# Initialize Prophet model
model = Prophet()

# Fit model to training data
model.fit(train)

# make predictions on the test data
predictions = model.predict(test)

""" # Create future dataframe for predictions
future = model.make_future_dataframe(periods=365)

# Make predictions
forecast = model.predict(future)

# Plot predictions
fig = model.plot(forecast) """

20:28:57 - cmdstanpy - INFO - Chain [1] start processing
20:28:59 - cmdstanpy - INFO - Chain [1] done processing


' # Create future dataframe for predictions\nfuture = model.make_future_dataframe(periods=365)\n\n# Make predictions\nforecast = model.predict(future)\n\n# Plot predictions\nfig = model.plot(forecast) '

In [116]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
# evaluate performance
mae = mean_absolute_error(y_test, predictions['yhat'])
print("MAE:", mae)
mse = mean_squared_error(y_test, predictions['yhat'])
print("Mean Squared Error:", mse)

MAE: 32.3573004082172
Mean Squared Error: 2720.561378365433
