# Importing libraries

In [None]:
# Dataset manipulation modules
import numpy as np
import pandas as pd

# Iteration and naming tools
import re
from itertools import compress, product

# Plot tools
import matplotlib.pyplot as plt
import seaborn as sns

# P-values, Z-scores calculation tools
import scipy

# Linear Regression tools
import statsmodels.api as sm

# Error calculation
import tensorflow as tf

# Ignoring warnings
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split

# Data Prep

In [None]:
# Load the raw CSV, discard the unused "To Date" column and expose
# "From Date" under the shorter name "Date".
raw_csv_path = "../input/calculated-aqi-caaqm-central-university-hyd/Raw_DATA.csv"
df = (
    pd.read_csv(raw_csv_path)
    .drop(columns=["To Date"])
    .rename(columns={"From Date": "Date"})
)

# Parse the date strings (day comes first) into datetime objects
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)

# Every column after the first is converted to numbers; entries that
# cannot be parsed become NaN (errors='coerce').
for column_name in df.columns[1:]:
    df[column_name] = pd.to_numeric(df[column_name], errors='coerce')

df.head(5)

In [None]:
# Sub-Index calculation functions (as per Indian Air Quality Standards)

# PM2.5
def SI_PM_25(x):
    """Return the PM2.5 sub-index for the concentration ``x``.

    The sub-index is interpolated linearly inside the breakpoint band that
    contains ``x`` (bands end at 30, 60, 90, 120 and 250; values above 250
    continue with the slope of the last band).

    Missing values (as detected by ``pd.isna``) are returned unchanged so
    that they stay missing in later steps.
    """
    if pd.isna(x):
        return x
    if x <= 30:
        return x * 50 / 30
    if 30 < x <= 60:
        return 50 + ((x - 30) * 50 / 30)
    if 60 < x <= 90:
        return 100 + ((x - 60) * 100 / 30)
    if 90 < x <= 120:
        return 200 + ((x - 90) * 100 / 30)
    if 120 < x <= 250:
        return 300 + ((x - 120) * 100 / 130)
    if x > 250:
        return 400 + ((x - 250) * 100 / 130)
    # A value that matched no band is passed through as-is.
    return x

# PM10
def SI_PM_10(x):
    """Return the PM10 sub-index for the concentration ``x``.

    Up to 100 the sub-index equals the concentration itself; above that it
    is interpolated linearly inside the band containing ``x`` (bands end at
    250, 350 and 430; values above 430 continue with the last band's slope).

    Missing values (as detected by ``pd.isna``) are returned unchanged.
    """
    if pd.isna(x):
        return x
    if x <= 50:
        return x
    if 50 < x <= 100:
        return x
    if 100 < x <= 250:
        return 100 + ((x - 100) * 100 / 150)
    if 250 < x <= 350:
        return 200 + (x - 250)
    if 350 < x <= 430:
        return 300 + ((x - 350) * 100 / 80)
    if x > 430:
        return 400 + ((x - 430) * 100 / 80)
    # A value that matched no band is passed through as-is.
    return x

# NO2
def SI_NO2(x):
    """Return the NO2 sub-index for the concentration ``x``.

    The sub-index is interpolated linearly inside the breakpoint band that
    contains ``x`` (bands end at 40, 80, 180, 280 and 400; values above 400
    continue with the slope of the last band).

    Missing values (as detected by ``pd.isna``) are returned unchanged.
    """
    if pd.isna(x):
        return x
    if x <= 40:
        return x * 50 / 40
    if 40 < x <= 80:
        return 50 + ((x - 40) * 50 / 40)
    if 80 < x <= 180:
        return 100 + ((x - 80) * 100 / 100)
    if 180 < x <= 280:
        return 200 + ((x - 180) * 100 / 100)
    if 280 < x <= 400:
        return 300 + ((x - 280) * 100 / 120)
    if x > 400:
        return 400 + ((x - 400) * 100 / 120)
    # A value that matched no band is passed through as-is.
    return x

# NH3
def SI_NH3(x):
    """Return the NH3 sub-index for the concentration ``x``.

    The sub-index is interpolated linearly inside the breakpoint band that
    contains ``x`` (bands end at 200, 400, 800, 1200 and 1800; values above
    1800 continue with the slope of the last band).

    Missing values (as detected by ``pd.isna``) are returned unchanged.
    """
    if pd.isna(x):
        return x
    if x <= 200:
        return x * 50 / 200
    if 200 < x <= 400:
        return 50 + ((x - 200) * 50 / 200)
    if 400 < x <= 800:
        return 100 + ((x - 400) * 100 / 400)
    if 800 < x <= 1200:
        return 200 + ((x - 800) * 100 / 400)
    if 1200 < x <= 1800:
        return 300 + ((x - 1200) * 100 / 600)
    if x > 1800:
        return 400 + ((x - 1800) * 100 / 600)
    # A value that matched no band is passed through as-is.
    return x

# SO2
def SI_SO2(x):
    """Return the SO2 sub-index for the concentration ``x``.

    The sub-index is interpolated linearly inside the breakpoint band that
    contains ``x`` (bands end at 40, 80, 380, 800 and 1600; values above
    1600 continue with the slope of the last band).

    Missing values (as detected by ``pd.isna``) are returned unchanged.
    """
    if pd.isna(x):
        return x
    if x <= 40:
        return x * 50 / 40
    if 40 < x <= 80:
        return 50 + ((x - 40) * 50 / 40)
    if 80 < x <= 380:
        return 100 + ((x - 80) * 100 / 300)
    if 380 < x <= 800:
        return 200 + ((x - 380) * 100 / 420)
    if 800 < x <= 1600:
        return 300 + ((x - 800) * 100 / 800)
    if x > 1600:
        return 400 + ((x - 1600) * 100 / 800)
    # A value that matched no band is passed through as-is.
    return x

# CO
def SI_CO(x):
    """Return the CO sub-index for the concentration ``x``.

    The sub-index is interpolated linearly inside the breakpoint band that
    contains ``x`` (bands end at 1, 2, 10, 17 and 34; values above 34
    continue with the slope of the last band).

    Missing values (as detected by ``pd.isna``) are returned unchanged.
    """
    if pd.isna(x):
        return x
    if x <= 1:
        return x * 50 / 1
    if 1 < x <= 2:
        return 50 + ((x - 1) * 50 / 1)
    if 2 < x <= 10:
        return 100 + ((x - 2) * 100 / 8)
    if 10 < x <= 17:
        return 200 + ((x - 10) * 100 / 7)
    if 17 < x <= 34:
        return 300 + ((x - 17) * 100 / 17)
    if x > 34:
        return 400 + ((x - 34) * 100 / 17)
    # A value that matched no band is passed through as-is.
    return x

# OZONE
def SI_OZONE(x):
    """Return the ozone sub-index for the concentration ``x``.

    The sub-index is interpolated linearly inside the breakpoint band that
    contains ``x`` (bands end at 50, 100, 168, 208 and 748; values above 748
    continue with the slope of the last band).

    Missing values (as detected by ``pd.isna``) are returned unchanged so
    that they stay missing in later steps.
    """
    if pd.isna(x):
        return x
    if x <= 50:
        return x * 50 / 50
    if x <= 100:
        return 50 + ((x - 50) * 50 / 50)
    if x <= 168:
        return 100 + ((x - 100) * 100 / 68)
    if x <= 208:
        return 200 + ((x - 168) * 100 / 40)
    if x <= 748:
        return 300 + ((x - 208) * 100 / 539)
    # The open-ended top band is measured from its own lower breakpoint
    # (748), like every other band, so the index stays continuous with the
    # band below instead of jumping by ~65 points just above 748.
    return 400 + ((x - 748) * 100 / 539)

In [None]:
# Start the sub-index table from the date column
subIndex = pd.DataFrame(df['Date'])

# Pollutant column in df -> sub-index function defined in the code block above.
# The insertion order fixes the column order of the resulting table.
sub_index_functions = {
    "PM2.5": SI_PM_25,
    "PM10": SI_PM_10,
    "NO2": SI_NO2,
    "NH3": SI_NH3,
    "SO2": SI_SO2,
    "CO": SI_CO,
    "Ozone": SI_OZONE,
}

# One "<pollutant> SI" column per pollutant, computed value by value
for pollutant, to_sub_index in sub_index_functions.items():
    subIndex[pollutant + ' SI'] = df[pollutant].apply(to_sub_index)

subIndex.head(5)

In [None]:
# AQI is calculated as per Indian AQI calculation standards

# Initiating Dataframe with Dates
aqi = pd.DataFrame(df['Date'])

# Creating an empty AQI column to fill in
Nan = np.nan
aqi['AQI'] = Nan

# Sub-index columns that take part in the AQI
si_columns = ['PM2.5 SI', 'PM10 SI', 'NO2 SI', 'NH3 SI', 'SO2 SI', 'CO SI', 'Ozone SI']

# True where a sub-index value is available for that row
available = subIndex[si_columns].notna()

# A row gets an AQI only when PM2.5 or PM10 is available AND at least
# three sub-indices in total are available; other rows stay NaN.
has_particulate = available['PM2.5 SI'] | available['PM10 SI']
has_enough_pollutants = available.sum(axis=1) >= 3
eligible = has_particulate & has_enough_pollutants

# AQI is the largest available sub-index of the row (max skips NaN).
# The values are written with a single .loc assignment rather than chained
# indexing (aqi['AQI'][ind] = ...): chained assignment may write to a
# temporary copy and is ignored when pandas Copy-on-Write is active.
aqi.loc[eligible, 'AQI'] = subIndex.loc[eligible, si_columns].max(axis=1)

aqi.head(5)

In [None]:
def _as_daily(frame):
    """Index ``frame`` by 'Date' at daily frequency, forward-filling added days."""
    return frame.set_index('Date').asfreq('D', method="ffill")


# Raw readings, sub-indices and AQI all get the same daily date index
# so that they can be resampled later.
df = _as_daily(df)
subIndex = _as_daily(subIndex)
aqi = _as_daily(aqi)

df_list = [df, subIndex, aqi]

# Dropping year 2017 Data as it is inconsistent with other years.
# The rows are removed in place, so df, subIndex and aqi themselves change.
for frame in df_list:
    frame.drop(frame.loc["2017"].index, inplace=True)

In [None]:
# Outlier removal before regression: discard rows without an AQI value,
# then keep only the rows whose z-score magnitude is below 3 in every column.
aqi = aqi.dropna()
abs_z_scores = np.abs(scipy.stats.zscore(aqi))
filtered_entries = (abs_z_scores < 3).all(axis=1)
aqi = aqi[filtered_entries]

In [None]:
# Chronological split (no shuffling): the last 10% of the rows form the
# test set. Both parts are then put back on a daily frequency.
train, test = [
    part.asfreq('D')
    for part in train_test_split(aqi, test_size=0.1, shuffle=False)
]

# Exploring AQI Data

## Seasonality

In [None]:
# daily, weekly, semi-monthly, monthly, quarterly, yearly.
plt.figure(figsize=[15, 7])
plt.suptitle("Finding pattern using visual Inspection for Seasonality")

# daily: the full series as-is
ax = plt.subplot(321)
aqi.plot(ax=ax, legend=False)
ax.set_title("daily Plot")
ax.set_ylabel("AQI VALUE")

# Remaining panels: (subplot position, title, rows to show, resampling rule).
# Each panel shows a leading slice of the series split into bins of the
# given rule, one line per bin.
resampled_panels = [
    (322, "Weekly Plot", aqi[:30], 'W'),
    (323, "Semi-monthly Plot", aqi[:90], 'SM'),
    (324, "Monthly Plot", aqi[:90], 'M'),
    (325, "Quaterly Plot", aqi[:356], 'Q'),
    (326, "Yearly Plot", aqi, 'Y'),
]

for position, title, window, rule in resampled_panels:
    ax = plt.subplot(position)
    # Always pass ax explicitly: without it pandas does not draw into this
    # subplot, and the grid cell would be left empty.
    for _, bin_rows in window.resample(rule):
        if bin_rows.empty:
            # Bins without any observation have nothing to draw.
            continue
        bin_rows.plot(ax=ax, legend=False)
    ax.set_title(title)
    ax.set_ylabel("AQI VALUE")

plt.tight_layout()
plt.show()

**We see no clear evidence of seasonality in the daily, weekly, semi-monthly, monthly, or quarterly plots, but there is slight evidence of it in the yearly plot. However, because the dataset covers only 3 years, this yearly pattern cannot be confirmed by the stationarity test; the data may have some seasonality, but there is not enough evidence in this dataset alone to detect it.**

## Stationarity

In [None]:
# Decompose the AQI series and plot the result.
# DecomposeResult.plot() builds its own figure, so that figure is resized
# here instead of opening a separate one with plt.figure(), which would
# only show up as an extra empty figure.
# NOTE(review): period=356 may have been meant as 365 (days per year) — confirm.
decomposition_fig = sm.tsa.seasonal_decompose(aqi, period=356).plot()
decomposition_fig.set_size_inches(15, 7)

# Element [1] of the adfuller result is the p-value of the test.
print("Dickey–Fuller test: p=%f" % sm.tsa.stattools.adfuller(aqi)[1])
plt.show()

**The p-value from the Dickey–Fuller test is below the significance level (0.05), so we reject the null hypothesis (that the series is non-stationary) and conclude that the time series is stationary.**

# ARIMA

## Parameters for modeling

In [None]:
# Autocorrelation (top) and partial autocorrelation (bottom) of the AQI
# series up to lag 48, used for a first guess of the ARIMA parameters.
fig, (acf_axis, pacf_axis) = plt.subplots(2, 1, figsize=(15, 7))
sm.graphics.tsa.plot_acf(aqi, lags=48, ax=acf_axis)
sm.graphics.tsa.plot_pacf(aqi, lags=48, ax=pacf_axis)
plt.tight_layout()
plt.show()

## BEST MODEL

In [None]:
# Candidate orders: p and q each run over 0..4, d is fixed at 0
p = range(0, 5)
q = range(0, 5)
d = 0

# Every (p, q) combination, p varying slowest
parameters = product(p, q)
parameters_list = list(parameters)
len(parameters_list)

# Model selection: fit each candidate on the training data and remember
# the one with the smallest AIC.
results = []
best_aic = float("inf")
warnings.filterwarnings('ignore')
for param in parameters_list:
    ar_order, ma_order = param
    try:
        model = sm.tsa.statespace.SARIMAX(
            train, order=(ar_order, d, ma_order)
        ).fit(disp=-1)
    except ValueError:
        # Combinations the model rejects are reported and skipped.
        print('wrong parameters:', param)
    else:
        aic = model.aic
        if aic < best_aic:
            best_model, best_aic, best_param = model, aic, param
        results.append([param, model.aic])

In [None]:
# Rank the fitted candidates by AIC (smallest first) and show the top
# five, followed by the summary of the best model.
result_table = pd.DataFrame(results).set_axis(['parameters (p,q)', 'aic'], axis=1)
top_candidates = result_table.sort_values(by='aic', ascending=True).head()
print(top_candidates)
print(best_model.summary())

## Prediction and Diagnostics of Residuals

In [None]:
# First 100 training observations against the best model's in-sample
# predictions for the same positions.
plt.figure(figsize=(15, 7))
for series in (train, best_model.predict()):
    plt.plot(series[:100])
plt.legend(["True value", "Predicted value from best model"])

In [None]:
# Residual diagnostics of the selected model
diagnostics_fig = best_model.plot_diagnostics(figsize=(18, 8))
plt.show()

## Forecast

In [None]:
fig, ax = plt.subplots(figsize=(15, 5))

# Only the observations from position 1000 onwards are drawn, to get a
# better look at the forecasts.
recent_aqi = aqi[1000:]
recent_aqi.plot(ax=ax)

# Forecast with the best model up to 2021-05-26 and draw the mean as a
# dashed black line with a shaded band between the interval bounds.
fcast = best_model.get_forecast("2021-05-26").summary_frame()
forecast_mean = fcast['mean']
lower_bound = fcast['mean_ci_lower']
upper_bound = fcast['mean_ci_upper']
forecast_mean.plot(ax=ax, style='k--')
ax.fill_between(fcast.index, lower_bound, upper_bound, color='k', alpha=0.1)

## Error

In [None]:
# Pair every forecast value with the test observation of the same date.
# test was put on a daily frequency, so days without an observation hold
# NaN, and the forecast may cover dates the test set lacks; comparing the
# raw arrays position by position would misalign values or produce NaN
# metrics. Only dates where both values exist are evaluated.
evaluation = pd.concat(
    [test['AQI'], fcast['mean']], axis=1, keys=['actual', 'forecast']
).dropna()
y_true = evaluation['actual'].values
y_pred = evaluation['forecast'].values

mae = tf.keras.metrics.mean_absolute_error(y_pred=y_pred, y_true=y_true).numpy()
rmse = tf.sqrt(tf.losses.mean_squared_error(y_pred=y_pred, y_true=y_true)).numpy()
print("MAE: ",mae)
print("RMSE: ",rmse)