In [None]:
'''importing module'''
import numpy as np 
import pandas as pd
from fbprophet import Prophet
import matplotlib                  
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import seaborn as sns
sns.set()
#%matplotlib inline


# Introduction

**The purpose of this analysis is to verify the accuracy of the Facebook Prophet algorithm on our data.**  
**Because our project has not yet collected enough data, we used similar data from an open source. This should be enough for a preliminary analysis.**  
https://www.kaggle.com/bappekim/air-pollution-in-seoul

In [None]:
'''Download dataset'''
# Read the CSV from a path relative to the notebook.
# NOTE(review): the "../input/..." location looks like a Kaggle-style layout — confirm the path when running elsewhere.
air_pollution_data = pd.read_csv('../input/air-pollution-101/Air_pollution_101.csv')
# Bare last expression: the notebook renders the frame as the cell output.
air_pollution_data

## Check and analyze our data

In [None]:
'''Check main information about dataset'''
# info() prints the columns with their non-null counts and dtypes; this is where
# the dtype of "Measurement date" can be checked.
air_pollution_data.info()

In [None]:
# Summary statistics of the numeric columns (compare max against mean to spot outliers).
air_pollution_data.describe()

**The whole dataset contains 25862 measurements.  
"Measurement date" is stored as object (not as a date).  
Some values are 100 times larger than the mean (outliers in the data).**

In [None]:
# Pollutant columns that the rest of the notebook gap-fills, clips and forecasts.
elements = ['SO2', 'NO2', 'O3', 'CO', 'PM10', 'PM2.5']

In [None]:
'''Cast Measurement date type to date'''
# Overwrite the column with parsed timestamps so that min()/max() and the merge
# onto the hourly grid further down operate on datetimes rather than strings.
air_pollution_data['Measurement date'] = pd.to_datetime(air_pollution_data['Measurement date'])
air_pollution_data

## Divide the data into training and test samples. After that we can start evaluating the training data

In [None]:
# Positional split: the first 90% of the rows become the training sample,
# the remaining 10% the test sample.
# NOTE(review): this assumes the rows are already in chronological order — confirm.
sep_point = round(len(air_pollution_data) * 0.9)
train_data = air_pollution_data.iloc[: sep_point, :]
test_data = air_pollution_data.iloc[sep_point :, :]
train_data

In [None]:
'''Let's check if there are missing measurements in our data'''
# Earliest and latest timestamps of the training sample; they bound the hourly grid built next.
first_date = train_data['Measurement date'].min()
last_date = train_data['Measurement date'].max()
first_date, last_date

In [None]:
# Every hourly timestamp between the first and last training measurement;
# its length is the number of rows a gap-free series would have.
full_period = pd.date_range(start=first_date, end=last_date, freq='H')
full_period, len(full_period)

**If there were no gaps in our data, we would have 23676 measurements. But we have only 23277, so 399 measurements are missing. We need to fill these values.**

In [None]:
# Join the training rows onto the complete hourly grid: hours without a
# measurement become rows whose remaining columns are NaN.
full_period_table = pd.DataFrame(full_period, columns=['Measurement date'])
train_data = full_period_table.merge(train_data, left_on='Measurement date', right_on='Measurement date', how='outer')
train_data

**To fill missing values we hypothesize that the missing value is equal to the same value 24 hours ago**

In [None]:
def fill_nan_val(df, elements, lag=24):
    '''Fill missing values in place with the value observed ``lag`` rows earlier.

    Rows are addressed by position, so the frame is expected to hold one row
    per time step in chronological order. In this notebook that is one row per
    hour, which makes the default ``lag=24`` mean "same hour on the previous day".

    Gaps are processed from top to bottom, so a gap longer than ``lag`` rows is
    filled from values that were themselves filled earlier in the same pass.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to repair; the listed columns are modified in place.
    elements : iterable of str
        Names of the columns to fill.
    lag : int, default 24
        How many rows back to look for the replacement value.

    Returns
    -------
    None

    Notes
    -----
    A gap within the first ``lag`` rows falls back to the immediately preceding
    row. A gap in the very first row has no earlier value and is left missing.
    '''
    for element in elements:
        # Work on a plain array: positional access is unambiguous and avoids
        # one DataFrame write per missing cell.
        values = df[element].to_numpy(copy=True)
        for pos in np.flatnonzero(pd.isnull(values)):
            if pos >= lag:
                values[pos] = values[pos - lag]
            elif pos >= 1:
                # Not enough history for a full lag yet: reuse the previous row.
                values[pos] = values[pos - 1]
        df[element] = values

In [None]:
# Fill the gaps of the pollutant columns in place, then re-check the non-null counts.
fill_nan_val(train_data, elements)
train_data.info()

## Let's clear the data of splashes (outlier spikes)

In [None]:
# Per-pollutant 99th percentile of the training data: the level above which
# values are treated as spikes.
quantile_99 = train_data[elements].quantile(0.99)
quantile_99

In [None]:
def del_splashes(df, elements, quantiles=None, q=0.99):
    '''Cap spikes in place: values above the upper threshold are set to that threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; the listed columns are modified in place.
    elements : iterable of str
        Names of the columns to cap.
    quantiles : mapping or pandas.Series, optional
        Precomputed thresholds keyed by column name. When omitted, the
        threshold of each column is the ``q``-quantile of that column in
        ``df`` itself, so the function no longer depends on a notebook-level
        variable computed from some other frame.
    q : float, default 0.99
        Quantile used when ``quantiles`` is not supplied.

    Returns
    -------
    None
    '''
    for element in elements:
        if quantiles is None:
            upper = df[element].quantile(q)
        else:
            upper = quantiles[element]
        df.loc[df[element] > upper, element] = upper

In [None]:
# Cap the spikes of the pollutant columns in place, then re-check the summary statistics.
del_splashes(train_data, elements)
train_data.describe()

## Let's take a close look on prediction of SO2 with Facebook Prophet algorithm. 

In [None]:
# Keep only timestamp + SO2. .copy() yields independent frames, so renaming their
# columns later does not touch train_data / test_data.
# Note: test_data is the raw held-out slice — no gap filling or clipping was applied to it.
so2_data_train = train_data[['Measurement date', 'SO2']].copy()
so2_data_test = test_data[['Measurement date', 'SO2']].copy()

In [None]:
'''Change columns names according to fbprophed requirements'''
# Prophet expects the timestamp column to be named "ds" and the target column "y".
so2_data_train.columns = ['ds','y']
so2_data_test.columns = ['ds','y']

In [None]:
'''Study the model'''
# Fit a Prophet model with default settings on the cleaned SO2 training series.
model_so2=Prophet()
model_so2.fit(so2_data_train)


In [None]:
# Forecast exactly at the timestamps of the test sample (only the "ds" column is passed on).
future_so2 = so2_data_test[['ds']]
future_so2.head()

In [None]:
'''model prediction'''
# The returned frame's "ds" and "yhat" columns are what the comparison plot
# and the error metrics below read.
pred_so2 = model_so2.predict(future_so2)
pred_so2.head()

## Let's compare the predicted data with the actual data on the graph.

In [None]:
def comparison(x1, y1, x2, y2, y_lim):
    '''Draw two line series on one 16x8 figure for visual comparison.

    The first series (x1, y1) uses the default colour, the second (x2, y2)
    is drawn in red. The y-axis is limited to the range [0, y_lim].
    '''
    plt.figure(figsize=(16, 8))
    # (x, y, extra style) for each curve, drawn in this order.
    curves = (
        (x1, y1, {}),
        (x2, y2, {'color': 'red'}),
    )
    for xs, ys, extra_style in curves:
        sns.lineplot(x=xs, y=ys, linewidth=1, **extra_style)
    plt.ylim(0, y_lim)

In [None]:
# Actual test values (default colour) against Prophet's forecast (red).
# NOTE(review): y_lim=0.01 was presumably chosen to fit the SO2 value range — confirm.
comparison(so2_data_test['ds'], so2_data_test['y'], pred_so2['ds'], pred_so2['yhat'], y_lim=0.01)

## Also display the analytics of Facebook

In [None]:
# Prophet's own component plots for the forecast computed above.
model_so2.plot_components(pred_so2, figsize=(13, 15))

## Calculate the errors

In [None]:
# Compare the actual test values with the forecast; MAPE is multiplied by 100 to express it in percent.
# NOTE(review): the comparison is positional — it assumes predict() returned the rows in the
# same order as so2_data_test; confirm.
mse_so2 = mean_squared_error(so2_data_test['y'], pred_so2['yhat'])
mae_so2 = mean_absolute_error(so2_data_test['y'], pred_so2['yhat'])
mape_so2 = mean_absolute_percentage_error(so2_data_test['y'], pred_so2['yhat']) * 100
mse_so2, mae_so2, mape_so2

### The error (MAPE) is about 30%

# Let's build an automatic solution for predicting other parameters for different time intervals

In [None]:
def full_prediction(data, element, n_last):
    '''Train Prophet on all but the last ``n_last`` rows of one pollutant and score the forecast.

    The training part is re-indexed onto its own complete hourly grid, its
    gaps are filled with ``fill_nan_val`` and values above its own 99th
    percentile are capped. The last ``n_last`` rows are used untouched as the
    test sample.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime 'Measurement date' column and the ``element``
        column. Rows are split by position, so they are expected to be in
        chronological order.
    element : str
        Name of the column to forecast.
    n_last : int
        Number of trailing rows held out as the test sample.

    Returns
    -------
    tuple of float
        ``(mse, mae, mape)`` of the forecast on the test sample, with MAPE
        expressed in percent.

    Raises
    ------
    ValueError
        If ``n_last`` does not leave both a training and a test sample.
    '''
    date_col = 'Measurement date'
    if not 0 < n_last < len(data):
        raise ValueError('n_last must be between 1 and len(data) - 1')

    series = data[[date_col, element]]
    train_data = series.iloc[:-n_last, :]
    # Copy so that renaming the columns below acts on an independent frame.
    test_data = series.iloc[-n_last:, :].copy()

    # Build the hourly grid from THIS training window. The notebook-level
    # full_period_table describes a different (90%) split, so merging onto it
    # left gaps outside that range unfilled.
    full_period = pd.date_range(start=train_data[date_col].min(),
                                end=train_data[date_col].max(), freq='H')
    full_period_table = pd.DataFrame(full_period, columns=[date_col])
    train_data = full_period_table.merge(train_data, on=date_col, how='outer')
    fill_nan_val(train_data, [element])

    # Cap spikes at the 99th percentile of this training window (not at a
    # threshold computed elsewhere in the notebook).
    quantile_99 = train_data[element].quantile(0.99)
    train_data.loc[train_data[element] > quantile_99, element] = quantile_99

    # Prophet expects the columns to be named "ds" (timestamp) and "y" (target).
    train_data.columns = ['ds', 'y']
    test_data.columns = ['ds', 'y']

    model = Prophet()
    model.fit(train_data)
    pred = model.predict(test_data[['ds']])

    mse = mean_squared_error(test_data['y'], pred['yhat'])
    mae = mean_absolute_error(test_data['y'], pred['yhat'])
    mape = mean_absolute_percentage_error(test_data['y'], pred['yhat']) * 100

    return mse, mae, mape

## Then combine all MAPE (mean absolute percentage error) values in one table

In [None]:
def mape_comparison(data, elements, n_predictions):
    '''Build a table of MAPE values: one row per element, one column per horizon.

    For every horizon in ``n_predictions`` and every column name in
    ``elements``, ``full_prediction`` is run and its third return value
    (the MAPE) is stored in the corresponding cell of the returned DataFrame.
    '''
    mape_table = pd.DataFrame(index=elements, columns=n_predictions)
    # Horizons vary slowest, elements fastest.
    runs = ((horizon, name) for horizon in n_predictions for name in elements)
    for horizon, name in runs:
        scores = full_prediction(data, name, horizon)
        mape_table.loc[name, horizon] = scores[2]

    return mape_table
            

In [None]:
# Evaluate every pollutant at horizons of 24, 72, 168, 720 and 1440 trailing rows.
# One Prophet model is fitted per (pollutant, horizon) pair — 30 fits in total.
mape_errors = mape_comparison(air_pollution_data, elements, [24, 72, 168, 720, 1440])
mape_errors

In [None]:
# The table is created with object dtype; cast it to float for plotting.
mape_errors = mape_errors.astype('float')
plt.figure(figsize=(16, 8))
# Use cell-local names here: reusing `elements` as the loop source would replace
# the notebook-wide pollutant list with a pandas Index for every later cell or re-run.
horizons = mape_errors.columns
for pollutant in mape_errors.index:
    # label= ties each legend entry to its own line instead of relying on the
    # drawing order of the artists.
    sns.lineplot(x=horizons, y=mape_errors.loc[pollutant], label=str(pollutant))
plt.legend()
plt.xlabel("Hours")
plt.ylabel("MAPE, %")
plt.title('Dependence of MAPE on prediction length')

# Conclusion
**The Facebook Prophet algorithm can be used for preliminary analysis and prediction of the data.  
Some values can be predicted quite well, for example SO2 with a MAPE of 20-35%.  
Others have an error higher than 100%, for example O3 with a MAPE of 60-130%.  
On average, the best predictions are obtained for a horizon of 168 hours (1 week).**