In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from prophet import Prophet
from sklearn.metrics import mean_absolute_error, mean_squared_error,mean_absolute_percentage_error,r2_score
from prophet.plot import plot_plotly, plot_components_plotly
from prophet.diagnostics import performance_metrics
from prophet.diagnostics import cross_validation

In [15]:
df = pd.read_csv('datasets/Ibovespa14anos.csv',parse_dates=['Data'])
display(df.head(20))





Unnamed: 0,Data,Último,Abertura,Máxima,Mínima,Vol.,Var%
0,2024-05-24,124.306,124.731,125.257,124.259,"9,21M","-0,34%"
1,2024-05-23,124.729,125.65,125.665,124.431,"9,99M","-0,73%"
2,2024-05-22,125.65,127.412,127.412,125.524,"12,40M","-1,38%"
3,2024-05-21,127.412,127.754,128.272,127.205,"9,14M","-0,27%"
4,2024-05-20,127.751,128.151,128.73,127.488,"9,34M","-0,31%"
5,2024-05-17,128.151,128.28,128.464,127.696,"10,26M","-0,10%"
6,2024-05-16,128.284,128.029,128.965,127.922,"9,86M","0,20%"
7,2024-05-15,128.028,128.514,128.646,127.029,"10,86M","-0,38%"
8,2024-05-14,128.515,128.155,128.965,127.962,"11,89M","0,28%"
9,2024-05-13,128.155,127.6,128.669,127.599,"8,92M","0,44%"


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3566 entries, 0 to 3565
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Data      3566 non-null   datetime64[ns]
 1   Último    3566 non-null   float64       
 2   Abertura  3566 non-null   float64       
 3   Máxima    3566 non-null   float64       
 4   Mínima    3566 non-null   float64       
 5   Vol.      3565 non-null   object        
 6   Var%      3566 non-null   object        
dtypes: datetime64[ns](1), float64(4), object(2)
memory usage: 195.1+ KB


In [17]:
df_data = df[['Data','Último']]

In [18]:
df_data.rename(columns={'Data':'ds','Último':'y'},inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [19]:
df_data.head()

Unnamed: 0,ds,y
0,2024-05-24,124.306
1,2024-05-23,124.729
2,2024-05-22,125.65
3,2024-05-21,127.412
4,2024-05-20,127.751


In [20]:
model = Prophet(interval_width=0.95)

In [21]:
model.fit(df_data)

21:02:49 - cmdstanpy - INFO - Chain [1] start processing
21:02:50 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x2271c6b16d0>

In [22]:
future = model.make_future_dataframe(periods=7, freq='D')
future.head()

Unnamed: 0,ds
0,2010-01-04
1,2010-01-05
2,2010-01-06
3,2010-01-07
4,2010-01-08


In [23]:
forecast = model.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

Unnamed: 0,ds,yhat,yhat_lower,yhat_upper
3568,2024-05-27,118.41607,106.337982,131.396579
3569,2024-05-28,118.425716,106.38365,131.153316
3570,2024-05-29,118.437488,105.322273,131.275631
3571,2024-05-30,118.451362,106.005132,130.450759
3572,2024-05-31,118.502985,105.85457,131.126828


In [24]:
plot_plotly(model, forecast)

In [25]:
df_cv = cross_validation(model, initial='730 days', period='180 days', horizon = '30 days')
df_cv.head()


  0%|          | 0/25 [00:00<?, ?it/s]21:02:51 - cmdstanpy - INFO - Chain [1] start processing
21:02:51 - cmdstanpy - INFO - Chain [1] done processing
  4%|▍         | 1/25 [00:00<00:09,  2.47it/s]21:02:51 - cmdstanpy - INFO - Chain [1] start processing
21:02:52 - cmdstanpy - INFO - Chain [1] done processing
  8%|▊         | 2/25 [00:00<00:08,  2.71it/s]21:02:52 - cmdstanpy - INFO - Chain [1] start processing
21:02:52 - cmdstanpy - INFO - Chain [1] done processing
 12%|█▏        | 3/25 [00:01<00:08,  2.75it/s]21:02:52 - cmdstanpy - INFO - Chain [1] start processing
21:02:52 - cmdstanpy - INFO - Chain [1] done processing
 16%|█▌        | 4/25 [00:01<00:07,  2.63it/s]21:02:52 - cmdstanpy - INFO - Chain [1] start processing
21:02:53 - cmdstanpy - INFO - Chain [1] done processing
 20%|██        | 5/25 [00:01<00:06,  3.02it/s]21:02:53 - cmdstanpy - INFO - Chain [1] start processing
21:02:53 - cmdstanpy - INFO - Chain [1] done processing
 24%|██▍       | 6/25 [00:02<00:06,  2.92it/s]21:02:53

Unnamed: 0,ds,yhat,yhat_lower,yhat_upper,y,cutoff
0,2012-06-27,54.95083,51.917275,58.086488,53.109,2012-06-26
1,2012-06-28,54.841285,52.021593,57.738299,52.652,2012-06-26
2,2012-06-29,54.719375,51.476393,57.653526,54.355,2012-06-26
3,2012-07-02,54.537351,51.288899,57.677183,54.693,2012-06-26
4,2012-07-03,54.461175,51.636587,57.494584,55.78,2012-06-26


In [26]:
df_p = performance_metrics(df_cv)
df_p.tail(30)

Unnamed: 0,horizon,mse,rmse,mae,mape,mdape,smape,coverage
0,4 days,64.722873,8.045053,6.093203,0.080842,0.075657,0.078773,0.6
1,5 days,73.235229,8.557758,6.528238,0.085586,0.078976,0.082731,0.6
2,6 days,70.132106,8.374491,6.361311,0.081714,0.071494,0.078943,0.68
3,7 days,64.009047,8.000565,5.912527,0.07479,0.064572,0.072504,0.770588
4,8 days,52.395438,7.238469,5.345756,0.066431,0.060257,0.065658,0.834118
5,9 days,44.110701,6.641589,5.034479,0.062908,0.049702,0.063405,0.81625
6,10 days,43.810237,6.61893,5.203684,0.065752,0.052365,0.066265,0.742353
7,11 days,56.4485,7.513222,5.971447,0.075914,0.065776,0.075698,0.64
8,12 days,65.224567,8.076173,6.501515,0.081583,0.069292,0.080509,0.587778
9,13 days,70.350649,8.387529,6.902426,0.08535,0.075477,0.08348,0.572222


In [44]:
df_new = df_data.sort_index(ascending=False)

In [45]:
df_new.head()

Unnamed: 0,ds,y
3565,2010-01-04,70.045
3564,2010-01-05,70.24
3563,2010-01-06,70.729
3562,2010-01-07,70.451
3561,2010-01-08,70.263


In [47]:
metric_df = forecast.set_index('ds')[['yhat']].join(df_new.set_index('ds').y).reset_index()

In [48]:
metric_df.dropna(inplace=True)

In [49]:
r2_score(metric_df.y, metric_df.yhat)

0.9404032692185523

In [50]:
mean_squared_error(metric_df.y, metric_df.yhat)

40.66810832836686

In [51]:
mean_absolute_error(metric_df.y, metric_df.yhat)

4.618073597368348