In [1]:
import logging
import numpy as np
import pandas as pd
from fbprophet import Prophet
from fbprophet.plot import plot_plotly, plot_components_plotly
from plotly import graph_objs as go
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Silence pandas' chained-assignment (SettingWithCopy) warning for this session.
pd.set_option('mode.chained_assignment', None)
# Raw sensor readings; the path is relative to the notebook's working directory.
data = pd.read_csv("csv/82000278_Toamnei_2022_05.csv")
data

Unnamed: 0,time,latitude,longitude,altitude,timelocal,temperature,pressure,humidity,voc,noise,co2,ch2o,o3,pm1,pm25,pm10,readable time,day
0,1651363204,45.651464,25.615426,538,914160,6.73,95569,76,215841,43,599,7,20,7,9,10,01-05-22 00:00,01-05-22
1,1651363264,45.651464,25.615426,538,914220,6.71,95569,76,213691,50,601,7,20,7,9,10,01-05-22 00:01,01-05-22
2,1651363324,45.651464,25.615426,538,914280,6.70,95570,76,211822,43,601,7,20,7,9,10,01-05-22 00:02,01-05-22
3,1651363384,45.651464,25.615426,538,914340,6.69,95568,76,206437,42,600,7,20,7,9,10,01-05-22 00:03,01-05-22
4,1651363444,45.651464,25.615426,538,914400,6.67,95568,77,206428,45,602,7,20,7,9,10,01-05-22 00:04,01-05-22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44511,1654041310,45.651464,25.615426,538,1585140,13.53,95191,83,208469,43,590,7,20,8,10,11,31-05-22 23:55,31-05-22
44512,1654041370,45.651464,25.615426,538,1585200,13.52,95188,83,207676,45,595,7,20,8,10,11,31-05-22 23:56,31-05-22
44513,1654041430,45.651464,25.615426,538,1585260,13.52,95190,83,206661,37,590,7,20,7,9,10,31-05-22 23:57,31-05-22
44514,1654041490,45.651464,25.615426,538,1585320,13.52,95192,83,205206,43,592,7,20,7,9,10,31-05-22 23:58,31-05-22


In [3]:
# Remove first the columns, then the rows, that contain nothing but NaN.
data = (
    data
    .dropna(axis='columns', how='all')
    .dropna(axis='index', how='all')
)
data

Unnamed: 0,time,latitude,longitude,altitude,timelocal,temperature,pressure,humidity,voc,noise,co2,ch2o,o3,pm1,pm25,pm10,readable time,day
0,1651363204,45.651464,25.615426,538,914160,6.73,95569,76,215841,43,599,7,20,7,9,10,01-05-22 00:00,01-05-22
1,1651363264,45.651464,25.615426,538,914220,6.71,95569,76,213691,50,601,7,20,7,9,10,01-05-22 00:01,01-05-22
2,1651363324,45.651464,25.615426,538,914280,6.70,95570,76,211822,43,601,7,20,7,9,10,01-05-22 00:02,01-05-22
3,1651363384,45.651464,25.615426,538,914340,6.69,95568,76,206437,42,600,7,20,7,9,10,01-05-22 00:03,01-05-22
4,1651363444,45.651464,25.615426,538,914400,6.67,95568,77,206428,45,602,7,20,7,9,10,01-05-22 00:04,01-05-22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44511,1654041310,45.651464,25.615426,538,1585140,13.53,95191,83,208469,43,590,7,20,8,10,11,31-05-22 23:55,31-05-22
44512,1654041370,45.651464,25.615426,538,1585200,13.52,95188,83,207676,45,595,7,20,8,10,11,31-05-22 23:56,31-05-22
44513,1654041430,45.651464,25.615426,538,1585260,13.52,95190,83,206661,37,590,7,20,7,9,10,31-05-22 23:57,31-05-22
44514,1654041490,45.651464,25.615426,538,1585320,13.52,95192,83,205206,43,592,7,20,7,9,10,31-05-22 23:58,31-05-22


In [4]:
# Parse the 'day' strings (day comes first, e.g. 01-05-22) into datetime values.
data = data.assign(day=pd.to_datetime(data['day'], dayfirst=True))
data

Unnamed: 0,time,latitude,longitude,altitude,timelocal,temperature,pressure,humidity,voc,noise,co2,ch2o,o3,pm1,pm25,pm10,readable time,day
0,1651363204,45.651464,25.615426,538,914160,6.73,95569,76,215841,43,599,7,20,7,9,10,01-05-22 00:00,2022-05-01
1,1651363264,45.651464,25.615426,538,914220,6.71,95569,76,213691,50,601,7,20,7,9,10,01-05-22 00:01,2022-05-01
2,1651363324,45.651464,25.615426,538,914280,6.70,95570,76,211822,43,601,7,20,7,9,10,01-05-22 00:02,2022-05-01
3,1651363384,45.651464,25.615426,538,914340,6.69,95568,76,206437,42,600,7,20,7,9,10,01-05-22 00:03,2022-05-01
4,1651363444,45.651464,25.615426,538,914400,6.67,95568,77,206428,45,602,7,20,7,9,10,01-05-22 00:04,2022-05-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44511,1654041310,45.651464,25.615426,538,1585140,13.53,95191,83,208469,43,590,7,20,8,10,11,31-05-22 23:55,2022-05-31
44512,1654041370,45.651464,25.615426,538,1585200,13.52,95188,83,207676,45,595,7,20,8,10,11,31-05-22 23:56,2022-05-31
44513,1654041430,45.651464,25.615426,538,1585260,13.52,95190,83,206661,37,590,7,20,7,9,10,31-05-22 23:57,2022-05-31
44514,1654041490,45.651464,25.615426,538,1585320,13.52,95192,83,205206,43,592,7,20,7,9,10,31-05-22 23:58,2022-05-31


In [5]:
# Name of the sensor column to analyse. Change it to any other sensor column
# of `data` (for example 'pm10', 'o3' or 'humidity') to run the notebook for
# that sensor instead.
sensor_name = 'co2'

In [6]:
# Order the readings by calendar day and show the resulting 'day' column.
data = data.sort_values('day')
print("sorted days", data['day'])

sorted days 0       2022-05-01
964     2022-05-01
963     2022-05-01
962     2022-05-01
961     2022-05-01
           ...    
43551   2022-05-31
43550   2022-05-31
43549   2022-05-31
43547   2022-05-31
44515   2022-05-31
Name: day, Length: 44516, dtype: datetime64[ns]


In [7]:
# Daily mean of the selected sensor: one row per calendar day, with the columns
# 'day' and <sensor_name>.
# The sensor column is selected *before* aggregating, so only that column is
# averaged. The earlier per-group `group.mean()` averaged every column of every
# group, including the text column 'readable time', which pandas warns about
# and newer pandas versions refuse to do.
group_by_df = data.groupby('day')[sensor_name].mean().reset_index()

group_by_df.columns = ['day', sensor_name]
group_by_df

  [name, group.mean()[sensor_name]] for name, group in data.groupby('day')
  [name, group.mean()[sensor_name]] for name, group in data.groupby('day')


Unnamed: 0,day,co2
0,2022-05-01,530.268056
1,2022-05-02,530.8966
2,2022-05-03,552.383369
3,2022-05-04,559.230428
4,2022-05-05,541.918694
5,2022-05-06,530.807531
6,2022-05-07,525.917361
7,2022-05-08,519.096528
8,2022-05-09,573.782639
9,2022-05-10,609.684503


In [8]:
# group df by day
grp_date = data.groupby('day')
# Mean value of every numeric column for each day. numeric_only=True skips
# text columns such as 'readable time', which cannot be averaged and make
# newer pandas versions raise.
# NOTE: `data` is rebound here from the per-reading frame to the per-day frame
# (indexed by 'day'); the following cells rely on that.
data = grp_date.mean(numeric_only=True)
print("MEAN " + sensor_name + " values by day\n", data[sensor_name])

MEAN co2 values by day
 day
2022-05-01    530.268056
2022-05-02    530.896600
2022-05-03    552.383369
2022-05-04    559.230428
2022-05-05    541.918694
2022-05-06    530.807531
2022-05-07    525.917361
2022-05-08    519.096528
2022-05-09    573.782639
2022-05-10    609.684503
2022-05-11    604.139583
2022-05-12    536.984039
2022-05-13    542.491655
2022-05-14    588.485734
2022-05-15    593.161806
2022-05-16    572.925643
2022-05-17    552.017361
2022-05-18    597.295833
2022-05-19    571.890896
2022-05-20    525.332408
2022-05-21    529.840278
2022-05-22    563.437804
2022-05-23    574.691667
2022-05-24    560.273992
2022-05-25    525.462448
2022-05-26    546.131250
2022-05-27    531.998611
2022-05-28    556.725694
2022-05-29    522.679167
2022-05-30    523.806944
2022-05-31    524.766667
Name: co2, dtype: float64


In [9]:
# Keep only the chosen sensor's column (the result is still a DataFrame).
data = data.loc[:, [sensor_name]]
data

Unnamed: 0_level_0,co2
day,Unnamed: 1_level_1
2022-05-01,530.268056
2022-05-02,530.8966
2022-05-03,552.383369
2022-05-04,559.230428
2022-05-05,541.918694
2022-05-06,530.807531
2022-05-07,525.917361
2022-05-08,519.096528
2022-05-09,573.782639
2022-05-10,609.684503


In [10]:
# Box-plot statistics of the daily values: quartiles, inter-quartile range and
# the whiskers (most extreme observations within 1.5 * IQR of the quartiles).
lower_quartile, upper_quartile = np.percentile(data[sensor_name], [25, 75])
iqr = upper_quartile - lower_quartile
upper_whisker = data.loc[data[sensor_name] <= upper_quartile + 1.5 * iqr, sensor_name].max()
lower_whisker = data.loc[data[sensor_name] >= lower_quartile - 1.5 * iqr, sensor_name].min()

In [11]:
# Show the box-plot statistics, one value per line.
print(upper_quartile, lower_quartile, iqr, upper_whisker, lower_whisker, sep="\n")

572.4082696316887
530.0541666666667
42.35410296502198
609.6845031271716
519.0965277777777


In [12]:
# start using prophet
# Only show errors in the notebook output. Raising the root logger's level is
# not enough for Prophet: its records are emitted through a logger named
# 'fbprophet' (see the "INFO:fbprophet:..." lines printed while fitting), and
# that logger's own level decides what gets through, so it is set explicitly.
logging.getLogger().setLevel(logging.ERROR)
logging.getLogger('fbprophet').setLevel(logging.ERROR)

In [13]:
# Frame in the layout Prophet works with: the 'day' index becomes column 'ds'
# and the sensor values become column 'y'.
df = data.reset_index().set_axis(['ds', 'y'], axis='columns')
df

Unnamed: 0,ds,y
0,2022-05-01,530.268056
1,2022-05-02,530.8966
2,2022-05-03,552.383369
3,2022-05-04,559.230428
4,2022-05-05,541.918694
5,2022-05-06,530.807531
6,2022-05-07,525.917361
7,2022-05-08,519.096528
8,2022-05-09,573.782639
9,2022-05-10,609.684503


In [14]:
# Chronological hold-out: with shuffle=False the last 15% of the days form the
# test set and everything before them the training set.
X = group_by_df[['day']].to_numpy()
y = group_by_df[[sensor_name]].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, test_size=0.15
)

In [15]:
# X
# y
# X_train
# X_test
# y_train
# y_test

In [16]:
# Training frame for Prophet: the first len(y_train) days of group_by_df, with
# the columns renamed to 'ds' (date) and 'y' (value).
dff = (
    group_by_df[['day', sensor_name]]
    .iloc[:len(y_train)]
    .rename(columns={'day': 'ds', sensor_name: 'y'})
)
dff

Unnamed: 0,ds,y
0,2022-05-01,530.268056
1,2022-05-02,530.8966
2,2022-05-03,552.383369
3,2022-05-04,559.230428
4,2022-05-05,541.918694
5,2022-05-06,530.807531
6,2022-05-07,525.917361
7,2022-05-08,519.096528
8,2022-05-09,573.782639
9,2022-05-10,609.684503


In [17]:
# Prophet model with its default settings (no arguments are passed).
m = Prophet()
# fit train values to prophet
# `dff` holds only the training rows. fit() is the last expression of the
# cell, so its return value is what the notebook displays.
m.fit(dff)

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
  components = components.append(new_comp)
INFO:fbprophet:n_changepoints greater than number of observations. Using 19.


<fbprophet.forecaster.Prophet at 0x1d2e4135880>

In [18]:
# predict whole month
# Ask Prophet for len(y_test) additional periods after the training dates, so
# the forecast covers the training days plus the held-out days.
future = m.make_future_dataframe(periods=len(y_test))
# `forecast` gets one row per date in `future`, with yhat, its bounds and the
# trend/seasonality components.
forecast = m.predict(future)
print('forecast', forecast)

  components = components.append(new_comp)
  components = components.append(new_comp)


forecast            ds       trend  yhat_lower  yhat_upper  trend_lower  trend_upper  \
0  2022-05-01  534.033153  508.536623  555.972335   534.033153   534.033153   
1  2022-05-02  536.876823  522.584411  571.040287   536.876823   536.876823   
2  2022-05-03  539.720492  530.422582  579.339508   539.720492   539.720492   
3  2022-05-04  542.564161  534.285670  583.469214   542.564161   542.564161   
4  2022-05-05  545.407830  514.954622  566.224937   545.407830   545.407830   
5  2022-05-06  548.251499  499.687326  545.040446   548.251499   548.251499   
6  2022-05-07  551.095167  514.511054  563.253391   551.095167   551.095167   
7  2022-05-08  553.938836  530.332437  576.621231   553.938836   553.938836   
8  2022-05-09  556.782505  544.016254  589.479605   556.782505   556.782505   
9  2022-05-10  559.626162  549.977580  599.362550   559.626162   559.626162   
10 2022-05-11  562.459950  555.555864  603.636758   562.459950   562.459950   
11 2022-05-12  565.291056  537.540724  585.

In [19]:
# Show just the date, the point forecast and its lower/upper bounds.
print(forecast.loc[:, ['ds', 'yhat', 'yhat_lower', 'yhat_upper']])

           ds        yhat  yhat_lower  yhat_upper
0  2022-05-01  532.802422  508.536623  555.972335
1  2022-05-02  546.863545  522.584411  571.040287
2  2022-05-03  554.858082  530.422582  579.339508
3  2022-05-04  560.282175  534.285670  583.469214
4  2022-05-05  540.464823  514.954622  566.224937
5  2022-05-06  522.907686  499.687326  545.040446
6  2022-05-07  539.770391  514.511054  563.253391
7  2022-05-08  552.708104  530.332437  576.621231
8  2022-05-09  566.769228  544.016254  589.479605
9  2022-05-10  574.763752  549.977580  599.362550
10 2022-05-11  580.177965  555.555864  603.636758
11 2022-05-12  560.348050  537.540724  585.003663
12 2022-05-13  542.778032  518.399338  565.543507
13 2022-05-14  559.623331  535.885619  583.696036
14 2022-05-15  567.624514  543.199296  590.301449
15 2022-05-16  576.731503  553.891606  601.562044
16 2022-05-17  579.769482  556.368381  604.863783
17 2022-05-18  580.235334  555.202637  604.436308
18 2022-05-19  555.458847  531.552570  578.735106


In [20]:
# Interactive Plotly chart of the fitted model together with its forecast.
fig = plot_plotly(m, forecast)
fig.update_layout(yaxis_title=sensor_name, xaxis_title="Day", title=sensor_name + ' forecast for May 2022')
fig.show()

In [21]:
# Break the forecast into its components (trend and seasonality) to inspect them.
fig = plot_components_plotly(m, forecast)
fig.update_layout(title=sensor_name + " seasonality")
fig.show()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [22]:
# helper that lines up predicted and actual values by date
def make_comparison_dataframe(historical, forecast):
    """Join the forecast bounds with the observed values on the 'ds' column.

    Parameters
    ----------
    historical : DataFrame with a 'ds' column plus the observed column(s).
    forecast : DataFrame with 'ds', 'yhat', 'yhat_lower' and 'yhat_upper'.

    Returns
    -------
    DataFrame indexed by 'ds' holding 'yhat', 'yhat_lower', 'yhat_upper'
    followed by the columns of `historical`. Every forecast date is kept
    (left join); dates missing from `historical` get NaN.
    """
    predicted = forecast.set_index('ds')[['yhat', 'yhat_lower', 'yhat_upper']]
    observed = historical.set_index('ds')
    return predicted.join(observed)

In [23]:
# modify dff so that mse can be calculated for each value of the dataframe
# dff is rebuilt from group_by_df so that it really contains every day.
# Assigning the full columns into the existing dff (whose index only spans the
# training rows) aligns on that index and silently drops the remaining days,
# which left dff unchanged.
dff = group_by_df[['day', sensor_name]].rename(columns={'day': 'ds', sensor_name: 'y'})
dff

Unnamed: 0,ds,y
0,2022-05-01,530.268056
1,2022-05-02,530.8966
2,2022-05-03,552.383369
3,2022-05-04,559.230428
4,2022-05-05,541.918694
5,2022-05-06,530.807531
6,2022-05-07,525.917361
7,2022-05-08,519.096528
8,2022-05-09,573.782639
9,2022-05-10,609.684503


In [24]:
# Predicted values and bounds next to the actual daily values, indexed by date.
# `df` (the ds/y frame built from the daily means) supplies the actual values.
cmp_df = make_comparison_dataframe(df, forecast)
cmp_df

Unnamed: 0_level_0,yhat,yhat_lower,yhat_upper,y
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-05-01,532.802422,508.536623,555.972335,530.268056
2022-05-02,546.863545,522.584411,571.040287,530.8966
2022-05-03,554.858082,530.422582,579.339508,552.383369
2022-05-04,560.282175,534.28567,583.469214,559.230428
2022-05-05,540.464823,514.954622,566.224937,541.918694
2022-05-06,522.907686,499.687326,545.040446,530.807531
2022-05-07,539.770391,514.511054,563.253391,525.917361
2022-05-08,552.708104,530.332437,576.621231,519.096528
2022-05-09,566.769228,544.016254,589.479605,573.782639
2022-05-10,574.763752,549.97758,599.36255,609.684503


In [25]:
# Flag every day whose actual value lies outside the forecast interval
# [yhat_lower, yhat_upper]: 1 = outlier, 0 = inside the interval.
# This is computed for the whole column at once. The row-by-row version wrote
# through chained indexing (cmp_df[col][i] = ...) and looked rows up by integer
# position on a date-indexed Series, both of which pandas is phasing out.
# Comparisons against NaN are False, so rows without an actual value get 0.
cmp_df['outlier_detected'] = (
    (cmp_df['y'] > cmp_df['yhat_upper']) | (cmp_df['y'] < cmp_df['yhat_lower'])
).astype('int64')

cmp_df

Unnamed: 0_level_0,yhat,yhat_lower,yhat_upper,y,outlier_detected
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-05-01,532.802422,508.536623,555.972335,530.268056,0
2022-05-02,546.863545,522.584411,571.040287,530.8966,0
2022-05-03,554.858082,530.422582,579.339508,552.383369,0
2022-05-04,560.282175,534.28567,583.469214,559.230428,0
2022-05-05,540.464823,514.954622,566.224937,541.918694,0
2022-05-06,522.907686,499.687326,545.040446,530.807531,0
2022-05-07,539.770391,514.511054,563.253391,525.917361,0
2022-05-08,552.708104,530.332437,576.621231,519.096528,1
2022-05-09,566.769228,544.016254,589.479605,573.782639,0
2022-05-10,574.763752,549.97758,599.36255,609.684503,1


In [26]:
# actual value
# Plot of the observed daily values only (no outlier highlighting here).
fig_data = go.Figure()
fig_data.add_trace(go.Scatter(
    x=group_by_df['day'],
    y=cmp_df['y'],
    name='y(actual value)',
    mode='lines+markers',
    line=dict(
        color='rgb(75,0,130)'),
    # one colour for every marker; the former np.where(...) chose between two
    # identical colours, so it had no effect
    marker=dict(color='rgb(75,0,130)')))

# The title follows the configured sensor (it used to say 'pm10' regardless of
# the sensor actually plotted).
fig_data.update_layout(title=sensor_name.upper() + ' values for May 2022', yaxis_title=sensor_name, xaxis_title='Day',
                  showlegend=True)
fig_data.show()

In [27]:
# plot forecast with upper and lower bound
# Empty figure; the following cells add one trace each to it (predicted value,
# actual value, lower bound, upper bound).
fig = go.Figure()

In [28]:
# predicted value
fig.add_trace(go.Scatter(
    x=group_by_df['day'],
    y=cmp_df['yhat'],
    name='yhat(predicted value)',
    mode='lines+markers',
    line=dict(
        color='rgb(95,158,160)'),
    marker=dict(
        color='rgb(95,158,160)')
))

# The title is derived from sensor_name instead of a hard-coded 'CO2', so it
# stays correct when another sensor is selected ('co2' still gives 'CO2 ...').
fig.update_layout(title=sensor_name.upper() + ' values for May 2022', yaxis_title=sensor_name, xaxis_title='Day',
                  showlegend=True)
fig.show()

In [29]:
# actual value
# Markers of days flagged in 'outlier_detected' are drawn red, all others in
# the line colour.
fig.add_trace(go.Scatter(
    x=group_by_df['day'],
    y=cmp_df['y'],
    name='y(actual value)',
    mode='lines+markers',
    line=dict(
        color='rgb(75,0,130)'),
    marker=dict(color=np.where(cmp_df['outlier_detected'] == 1, 'red', 'rgb(75,0,130)'))))

# The title is derived from sensor_name instead of a hard-coded 'CO2', so it
# stays correct when another sensor is selected ('co2' still gives 'CO2 ...').
fig.update_layout(title=sensor_name.upper() + ' values for May 2022 and prediction values', yaxis_title=sensor_name, xaxis_title='Day',
                  showlegend=True)
fig.show()

In [30]:
# Add the lower edge of the forecast interval to the comparison figure.
fig.add_trace(
    go.Scatter(
        x=group_by_df['day'],
        y=cmp_df['yhat_lower'],
        name='yhat_lower',
        mode='lines+markers',
        line={'color': 'rgb(205,92,92)'},
        marker={'color': 'rgb(205,92,92)'},
    )
)

In [31]:
# Add the upper edge of the forecast interval, then show the finished figure.
fig.add_trace(
    go.Scatter(
        x=group_by_df['day'],
        y=cmp_df['yhat_upper'],
        name='yhat_upper',
        mode='lines+markers',
        line={'color': 'rgb(65,105,225)'},
        marker={'color': 'rgb(65,105,225)'},
    )
)

fig.update_layout(
    showlegend=True,
    xaxis_title='Day',
    yaxis_title=sensor_name,
    title='Comparison between predicted values and real ones, including upper and lower values',
)
fig.show()

In [32]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Both arguments are converted to NumPy arrays. The result is the mean of
    |y_true - y_pred| / |y_true| multiplied by 100. No special handling is
    done for zeros in `y_true`.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = np.abs((actual - predicted) / actual)
    return np.mean(relative_error) * 100

In [33]:
# Rows with a missing value in any column cannot be scored; drop them.
cmp_df = cmp_df.dropna()

# Absolute difference between actual and predicted value for every day, as a
# plain list of Python floats. It is computed column-wise rather than through
# cmp_df[col][i]: integer-position lookups with [] on a date-indexed Series
# are deprecated in pandas.
forecast_errors = (cmp_df['y'] - cmp_df['yhat']).abs().tolist()
print('Forecast Errors: ', forecast_errors)
print('MAX Forecast Error: %s' % max(forecast_errors))
print('MIN Forecast Error: %s' % min(forecast_errors))

Forecast Errors:  [2.534365945242257, 15.966945132350816, 2.474713634823388, 1.0517470626314207, 1.4538706202883986, 7.899845060771327, 13.853029446639653, 33.61157668463113, 7.013411286190831, 34.92075065479685, 23.96161866872569, 23.364010969653464, 0.28637735534061903, 28.86240364060859, 25.537291377436873, 3.8058601790481816, 27.752120759244576, 17.060499131536744, 16.432049300307654, 7.610167980171582, 15.00586796908442, 10.613078887627694, 12.764952869701006, 4.6881250649106505, 39.96462753621506, 5.480661671077769, 13.864293913838537, 26.687807530418013, 15.337299643392043, 23.311510519656508, 25.387191220456998]
MAX Forecast Error: 39.96462753621506
MIN Forecast Error: 0.28637735534061903


In [34]:
# Error metrics of the predicted values (yhat) against the actual values (y),
# computed over every row currently in cmp_df.
# NOTE(review): cmp_df is built from the full-month frame, so these scores
# appear to include the days the model was fitted on, not only the held-out
# days. Confirm that this is intended.
rmse = np.sqrt(mean_squared_error(cmp_df['y'], cmp_df['yhat']))
print("MSE is ", mean_squared_error(cmp_df['y'], cmp_df['yhat']))
print("rmse is ", rmse)
print("r2 score ", r2_score(cmp_df['y'], cmp_df['yhat']))  # 1.0 would be a perfect fit

MSE is  368.0626783547101
rmse is  19.184959691245382
r2 score  0.4693846487981481


In [35]:
def correlation_line(df, x, y):
    """Show a scatter plot of column `y` against column `x` with a fitted line.

    The line is the closed-form least-squares fit y = slope * x + intercept,
    computed from the sums and means of the two columns. The denominator of
    that formula is printed before the figure is shown.

    Parameters
    ----------
    df : DataFrame holding both columns.
    x : name of the column on the horizontal axis.
    y : name of the column on the vertical axis.
    """
    x_values = df[x]
    y_values = df[y]

    points = go.Scattergl(
        x=x_values,
        y=y_values,
        mode='markers',
        name=x + ' and ' + y + ' correlation',
    )
    axes = go.Layout(xaxis={'title': x}, yaxis={'title': y})

    # ingredients of the least-squares formulas
    sum_x = x_values.sum()
    mean_x = x_values.mean()
    mean_y = y_values.mean()
    sum_xx = (x_values ** 2).sum()
    sum_xy = (y_values * x_values).sum()

    denominator = sum_xx - mean_x * sum_x
    print('denominator', denominator)
    slope = (sum_xy - mean_y * sum_x) / denominator
    intercept = ((mean_y * sum_xx) - mean_x * sum_xy) / denominator

    fitted = go.Scattergl(
        x=x_values,
        y=slope * x_values + intercept,
        name='Line of best fit',
        line={'color': 'red'},
    )

    figure = go.Figure(data=[points, fitted], layout=axes)
    figure.show()


# Predicted value (horizontal axis) against actual value (vertical axis).
correlation_line(cmp_df, 'yhat', 'y')

denominator 8594.162146987393
