In [None]:
import os
from datetime import datetime

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from IPython.display import display

In [None]:
# Plot defaults: wide figures, no background grid unless a cell asks for one.
mpl.rcParams['figure.figsize'] = (15, 7)
mpl.rcParams['axes.grid'] = False

# Let pandas show more rows/columns before it truncates rendered frames.
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 256)

import warnings

# SettingWithCopyWarning is exposed from pandas.errors in pandas >= 1.5 and is
# no longer importable from pandas.core.common there. Try the current location
# first, fall back to the legacy one, and skip the filter entirely if the
# installed pandas does not provide the class at all.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [None]:
from dateutil.parser import parse


def date_parser(date):
    """Turn a single raw date string into a ``datetime`` via dateutil's parser."""
    return parse(date)

# **Data Loading**

In [None]:
# Load only the two columns the analysis uses and give the measurement a
# shorter, identifier-friendly name.
df = (
    pd.read_csv('../input/sunspots/Sunspots.csv',
                usecols=['Date', 'Monthly Mean Total Sunspot Number'])
    .rename(columns={'Monthly Mean Total Sunspot Number': 'Monthly_Average_Sunspot'})
)

# read_csv's `date_parser=` argument is deprecated since pandas 2.0, so the
# dates are parsed after loading with the same per-value parser instead.
# Missing dates (if any) are passed through as NaT rather than raising.
df['Date'] = pd.to_datetime(df['Date'].map(date_parser, na_action='ignore'))
display(df.head())

In [None]:
# Print the frame's column names, dtypes and non-null counts in full (verbose) form.
df.info(verbose=True)

# **Data Analysis**

In [None]:
# Work on a copy so the helper columns added for exploration never land on `df`.
eda = df.copy()

In [None]:
# Calendar month (1-12) of each observation.
eda['Month'] = eda.Date.dt.month

# Position of the year inside its decade: the last digit of the year, with a
# trailing 0 mapped to 10 so the values run 1..10. The result is assigned back
# to the column instead of calling `.replace(..., inplace=True)` on a column
# selection, which is a chained in-place update that warns on pandas 2.x and
# does not modify the frame under copy-on-write.
eda['Year_Id'] = (eda.Date.dt.year % 10).replace(0, 10)
eda.head()

In [None]:
fig, axes = plt.subplots(3, 1, figsize=(20,15), dpi=80)
titles = ['Year-wise Trend', 'Month-wise Seasonality', 'Decade-wise Seasonality']
# The first panel is grouped by calendar year. Grouping by the full `Date`
# would give one box per timestamp, i.e. a single observation per box.
cols = ['Year', 'Month', 'Year_Id']
boxplot_data = eda.assign(Year=eda.Date.dt.year)

for col, axis, title in zip(cols, axes, titles):
    sns.boxplot(x=col, y='Monthly_Average_Sunspot', data=boxplot_data, ax=axis)
    axis.set_title(title, fontsize=14)

# There are many distinct years, so rotate and shrink their tick labels.
axes[0].tick_params(axis='x', labelrotation=90, labelsize=7)

fig.tight_layout()
plt.show()

Explanation of the above plot:
* The distribution of the data is almost the same in each month, with a few outliers.
* The distribution of the data is not the same across the years of a decade.

## **Data Visualization**

In [None]:
# Make `Date` the index so plots use it as the time axis and later cells can
# slice by date label. NOTE: this rebinds `df`; re-running the cell without
# reloading the data fails because `Date` is no longer a column.
df = df.set_index('Date')
df.plot(grid=True)

In [None]:
# Label-based slice on the Date index: both year strings are inclusive, so this
# keeps every row from the start of 2000 through the end of 2010.
df_2k = df.loc['2000':'2010'] 
df_2k.plot(figsize=(16,7), grid=True)

In [None]:
import plotly.express as px

# One "look back N years" button per 10-year step (10y .. 50y), followed by a
# button that resets the view to the full series.
range_buttons = [
    dict(count=years, label=f"{years}y", step="year", stepmode="backward")
    for years in range(10, 60, 10)
]
range_buttons.append(dict(step="all"))

fig = px.line(eda, x='Date', y='Monthly_Average_Sunspot', title='Mean_Sunspot_Slider')
fig.update_xaxes(
    rangeslider_visible=False,
    rangeselector=dict(buttons=range_buttons),
)
fig.show()

### Period == 11 years ???

In [None]:
# Two back-to-back 11-year windows starting at YEAR_ORG; each window includes
# its start year and excludes its end year.
YEAR_ORG = 1869
YEAR_1 = YEAR_ORG
YEAR_2 = YEAR_ORG + 11
YEAR_3 = YEAR_ORG + 22

observation_year = eda.Date.dt.year
df_1 = eda[(observation_year >= YEAR_1) & (observation_year < YEAR_2)]
df_2 = eda[(observation_year >= YEAR_2) & (observation_year < YEAR_3)]

# Shared month counter (1..N) so both windows are drawn on top of each other.
x = np.arange(1, len(df_1['Date'])+1)

for window, first_year, last_year in ((df_1, YEAR_1, YEAR_2), (df_2, YEAR_2, YEAR_3)):
    plt.plot(x, window['Monthly_Average_Sunspot'], label=f'{first_year}-{last_year}')

plt.legend()
plt.xlabel('Month')
plt.ylabel('Monthly_Average_Sunspot')
plt.title('Comparison of 2 consecutive 11-year period of data')
plt.show()

In [None]:
# Scatter of y(t) against y(t + lag) for several lags, one panel per lag.
# Each panel receives its axes explicitly instead of relying on whichever
# subplot happens to be "current".
LAGS = (1, 3, 6, 24)

fig = plt.figure(figsize=(18,6))
fig.subplots_adjust(hspace=0.4, wspace=0.2)

for panel, lag in enumerate(LAGS, start=1):
    ax = fig.add_subplot(2, 2, panel)
    pd.plotting.lag_plot(df['Monthly_Average_Sunspot'], lag=lag, ax=ax)
    ax.set_title(f'Lag_{lag}')

plt.show()

In [None]:
# Histogram with a KDE overlay of the monthly values. `distplot` is deprecated
# (seaborn >= 0.11 recommends `histplot`), so use `histplot` when the installed
# seaborn has it and only fall back to the legacy call on older versions.
sunspot_values = df['Monthly_Average_Sunspot']
if hasattr(sns, 'histplot'):
    sns.histplot(sunspot_values, kde=True, stat='density')
else:
    sns.distplot(sunspot_values, hist=True)
plt.show()

## **Stationarity Testing**

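* **Null Hypothesis** - Series is **not stationary** (this is the ADF convention; the KPSS test further below reverses the two hypotheses and takes stationarity as its null)
* **Alternate Hypothesis** - Series is **stationary**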

### **Augmented Dickey-Fuller test**

In [None]:
from statsmodels.tsa.stattools import adfuller

samples = df['Monthly_Average_Sunspot']
df_test = adfuller(samples, autolag='AIC')

# The first four entries of the result are labelled here; the fifth is the
# dict of critical values, appended under its own keys (e.g. '5%').
df_output = pd.Series(df_test[0:4], index=['Statistics', 'p_value', '#Lags','#Observations'])
for key, value in df_test[4].items():
    df_output[key] = value
display(df_output)

# Verdict 1: compare the test statistic with the 5% critical value.
if df_output['Statistics'] < df_output['5%']:  
    print('\n--> Series is stationary')
else:
    print('\n--> Series is not Stationary')

# Verdict 2: compare the p-value with 0.05. It is read by label because integer
# keys on a string-labelled Series are a deprecated positional fallback.
if df_output['p_value'] > 0.05 :
    print('\n--> Series is not Stationary')
else:
    print('\n--> Series is Stationary')

### **KPSS test**

In [None]:
from statsmodels.tsa.stattools import kpss

# KPSS with a constant-only regression ('c') and the legacy lag rule.
kpss_outcome = kpss(df['Monthly_Average_Sunspot'], regression='c', nlags='legacy')
stats, p, lags, critical_values = kpss_outcome

print(f'Test Statistics: {stats}')
print(f'p-value: {p}')
print(f'Critial Values: {critical_values}')

# A p-value below 0.05 is reported as non-stationary, anything else as stationary.
print('Series is not Stationary' if p < 0.05 else 'Series is Stationary')

#### Note: 

For Non-Stationary data: First make it stationary

* Differencing, 
* Taking log and Differencing, 
* Decomposition into components,
* Detrending

## ACF / PACF

* Running the example creates a 2D plot showing the lag value along the x-axis and the correlation on the y-axis between -1 and 1.

* Confidence intervals are drawn as a cone. By default, this is set to a 95% confidence interval, suggesting that correlation values outside of this cone are very likely a real correlation and not a statistical fluke.

* acf: By looking at the plot we can improve our understanding from the plots above and say that the present value depends on the previous 25-30 values.

* pacf plot further says that present value depends only on previous 5/6 values. All these plots help us narrow down thinking and make our model efficient.

In [None]:
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Give the correlograms their own figure. Reusing `axes` from an earlier cell
# would draw onto a figure that has already been rendered, so nothing new
# would appear under this cell.
fig_corr, (ax_acf, ax_pacf) = plt.subplots(2, 1, figsize=(15, 10))

plot_acf(df['Monthly_Average_Sunspot'].tolist(), lags=20, ax=ax_acf)
plot_pacf(df['Monthly_Average_Sunspot'].tolist(), lags=20, ax=ax_pacf)

fig_corr.tight_layout()
plt.show()

# **Modelling**

## **Exponential (Smoothing) Moving Average**

In [None]:
# Raw values of the first 200 observations, then their exponentially weighted mean.
first_200 = df['Monthly_Average_Sunspot'].iloc[:200]
first_200.plot()

# With span=3 pandas derives the smoothing factor itself: alpha = 2 / (span + 1).
#     https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ewm.html
span_smoothed = first_200.ewm(span=3, adjust=False, min_periods=3).mean()
span_smoothed.plot(label='Exponential Weighted Average')

plt.title('Exponential Weighted M.A.')
plt.legend()
plt.show()

In [None]:
# Same comparison as a span-based EWM, but with the smoothing factor given
# directly as alpha=0.69.
head_values = df['Monthly_Average_Sunspot'].iloc[:200]
head_values.plot()

alpha_smoothed = head_values.ewm(alpha=0.69, adjust=False, min_periods=3).mean()
alpha_smoothed.plot(label='Exponential Weighted Average')

plt.title('Exponential Weighted M.A.')
plt.legend()
plt.show()

In [None]:
def wma(weights): 
    """Build a weighted-average function for use with ``rolling(...).apply``.

    Parameters
    ----------
    weights : array-like
        One weight per element of the rolling window, oldest element first.

    Returns
    -------
    callable
        ``calc(x)`` returning ``sum(weights * x) / sum(weights)`` for a window
        ``x`` of the same length as ``weights``.

    Notes
    -----
    The average is normalised by the sum of the weights, not by the window
    length, so the result stays on the scale of the data for any weights. For
    weights that sum to the window length (e.g. ``[0.5, 1, 1.5]``) the two
    normalisations agree.
    """
    def calc(x):
        # np.average divides by the weight total and raises ZeroDivisionError
        # if the weights sum to zero.
        return np.average(x, weights=weights)
    return calc

# Bring `Date` back as a regular column so the comparison frame can carry it.
df = df.reset_index()

# First 200 observations, plus the four smoothed variants computed from them.
sunspots_head = df['Monthly_Average_Sunspot'].iloc[:200]
data = df.iloc[:200].copy()
data['Rolling mean'] = sunspots_head.rolling(3).mean()
data['W_M_A'] = sunspots_head.rolling(window=3).apply(wma(np.array([0.5,1,1.5])))
data['E_W_A'] = sunspots_head.ewm(span=3, adjust=False, min_periods=0).mean()
data['E_S_M_A'] = sunspots_head.ewm(alpha=0.7, adjust=False, min_periods=3).mean()

# Index by date so every series is plotted against time.
data = data.set_index('Date')
data.plot()
plt.show()

In [None]:
def calculate_RMSE(df, col_indices=()):
    """Root-mean-square error of selected columns against the first column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column 0 holds the reference values.
    col_indices : iterable of int, optional
        Positional indices of the columns to score against column 0.
        Defaults to no columns.

    Returns
    -------
    dict
        Maps each selected column's name to ``sqrt(mean((reference - column) ** 2))``.
        Rows where either value is missing are skipped by the pandas mean.
    """
    col_names = list(df.columns)
    reference = df.iloc[:, 0]
    errors = dict()
    for idx in col_indices:
        # Mean (not sum) of the squared errors, so the result is a true RMSE
        # and does not grow with the number of rows.
        squared_error = (reference - df.iloc[:, idx]) ** 2
        errors[col_names[idx]] = np.sqrt(squared_error.mean())
    return errors

# Score the four smoothed columns (positions 1-4) against column 0 of `data`.
calculate_RMSE(data, [1,2,3,4])

## **Decomposition**

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# The pattern repeats roughly every 11 years and the data is monthly, so the
# period passed to the decomposition is expressed in months.
seasonal_frequency = 11 * 12

result = seasonal_decompose(
    df['Monthly_Average_Sunspot'],
    model="additive",
    period=seasonal_frequency,
)
result.plot()
plt.show()

In [None]:
# total_sum = result.trend + result.seasonal + result.resid
# total_sum[:100]

# Plot the observed series with the estimated trend subtracted.
detrended = result.observed - result.trend
detrended.to_frame().plot(label='Trend Decomposition')
plt.show()