In [1]:
import sys

import numpy as np
import pandas as pd

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import mean_squared_error

sys.path.append("..")

from utils import create_onedrive_directdownload

In [2]:
# Shared OneDrive link to the price CSV.
url = "https://1drv.ms/u/s!AiqdXCxPTydhoocrhOcEwpkHEMcWVw?e=EiL5mL"
# NOTE(review): presumably turns the share link into a direct-download URL -- confirm in utils.
file_url = create_onedrive_directdownload(url)

# Use the "Date" column as the index and parse it as dates.
df = pd.read_csv(
    file_url,
    index_col="Date",
    parse_dates=True,
)

df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,112.370003,113.389999,111.510002,113.330002,92.246048,118944600
2010-01-05,113.260002,113.68,112.849998,113.629997,92.490204,111579900
2010-01-06,113.519997,113.989998,113.43,113.709999,92.555328,116074400
2010-01-07,113.5,114.330002,113.18,114.190002,92.94606,131091100
2010-01-08,113.889999,114.620003,113.660004,114.57,93.255348,126402800


In [3]:
# Naive forecast: predict each close as the previous row's close.
# Shifting by one row leaves the very first row without a prediction (NaN).
previous_close = df["Close"].shift(periods=1)
df["ClosePrediction"] = previous_close
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,ClosePrediction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-04,112.370003,113.389999,111.510002,113.330002,92.246048,118944600,
2010-01-05,113.260002,113.68,112.849998,113.629997,92.490204,111579900,113.330002
2010-01-06,113.519997,113.989998,113.43,113.709999,92.555328,116074400,113.629997
2010-01-07,113.5,114.330002,113.18,114.190002,92.94606,131091100,113.709999
2010-01-08,113.889999,114.620003,113.660004,114.57,93.255348,126402800,114.190002


In [5]:
# Skip the first row: it has no previous close, so its prediction is NaN.
y_true = df["Close"].iloc[1:]
y_pred = df["ClosePrediction"].iloc[1:]

## Metrics

Main idea: get a feel for how the values relate to one another. What's "good"? What's "bad"? If the $R^2$ is "good", will the MAE also be "good"?

In [6]:
## Sum of Squared Error: dot product of the residual vector with itself
residuals = y_true - y_pred
residuals.dot(residuals)

6330.3742894926045

In [7]:
## Mean Squared Error (scikit-learn)
# Average of the squared errors, i.e. the sum of squared errors divided by N.
mean_squared_error(y_true, y_pred)

2.798573956451196

In [8]:
## Mean Squared Error, computed by hand: sum of squared errors divided by N
residuals = y_true - y_pred
residuals.dot(residuals) / len(y_true)

2.7985739564511958

In [9]:
## Root Mean Squared Error
# The `squared=False` argument was deprecated in scikit-learn 1.4 and removed
# in 1.6, so take the square root of the MSE explicitly. This works on every
# scikit-learn version and gives the same value.
np.sqrt(mean_squared_error(y_true, y_pred))

1.672893886787562

In [10]:
## Root Mean Squared Error, computed by hand: square root of (SSE / N)
residuals = y_true - y_pred
np.sqrt(residuals.dot(residuals) / len(y_true))

1.6728938867875618

In [11]:
## Mean Absolute Error: the average of |y_true - y_pred|, in the same units as Close
mean_absolute_error(y_true, y_pred)

1.1457559803120336

In [12]:
## R^2

# Looks like a great prediction -- but be suspicious of a score this high.
# R^2 measures squared error relative to a baseline that always predicts the
# mean of y_true. These prices stray far from their overall mean while the
# day-to-day change is small, so even the naive "repeat the previous close"
# forecast scores close to 1. A high R^2 here says little about forecasting skill.
r2_score(y_true, y_pred)

0.9989603259063914

In [13]:
## Mean Absolute Percentage Error
# Note: scikit-learn returns a fraction, not a percentage (0.0065 means 0.65%).
mean_absolute_percentage_error(y_true, y_pred)

0.006494073151422373

### sMAPE

$$\large E = \frac{1}{N}\sum_{i=1}^{N}\frac{| y_i - \hat{y_i} |}{ (| y_i | + | \hat{y_i} |)/2 }$$

In [16]:
# sMAPE

# scikit-learn does not provide an sMAPE metric, oh well...
# Good thing we are brave and know how to implement it ourselves!
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error (sMAPE).

    Computes ``mean(|y_true - y_pred| / ((|y_true| + |y_pred|) / 2))``.

    Parameters
    ----------
    y_true : array-like
        Observed values (list, NumPy array, pandas Series, ...).
    y_pred : array-like
        Predicted values, matched to ``y_true`` by position (as the
        scikit-learn metrics do), not by pandas index label.

    Returns
    -------
    float
        The sMAPE as a fraction between 0 and 2 (multiply by 100 for a
        percentage). NaN inputs propagate to the result; empty input
        yields NaN.
    """
    # Convert up front so plain Python lists work and the arithmetic is positional.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = np.abs(y_true - y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2

    # The denominator is zero only when both values are zero, i.e. the forecast
    # is exact; count that point as zero error instead of producing 0/0 = NaN.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return ratio.mean()

# Score the naive forecast; the result is a fraction (multiply by 100 for percent).
smape(y_true, y_pred)

0.006491365814068417