Hi Kagglers!

In this month's competition we are dealing with a time-series dataset, so the obvious choice is an algorithm that can handle such data. In this notebook I'd like to create a model using LSTM, often referred to as a fancy RNN (Recurrent Neural Network). Unlike ARIMA, RNNs are capable of learning nonlinearities, and specialized nodes like LSTM nodes are even better at this. First, in my EDA I'd like to show how to use statistical methods to get insights from the dataset that help us choose the right algorithm, and what can be done (e.g. feature extraction, handling outliers) to improve a model.

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns


import scipy as sp

import tensorflow as tf
import tensorflow.keras.backend as K
print(f"Tensorflow version {tf.__version__}")

In [None]:
# Load the competition files (train, test and the sample submission) from the Kaggle input folder.
train_df, test_df, sample_df = (
    pd.read_csv(f"/kaggle/input/tabular-playground-series-jul-2021/{file_stem}.csv")
    for file_stem in ("train", "test", "sample_submission")
)

## Datasets Overview 

In [None]:
train_df.head()

In [None]:
train_df.info()

In [None]:
test_df.info()

In [None]:
train_df.describe().T

In [None]:
# Numeric columns used throughout the EDA: weather readings first,
# then the five sensors, then the three target columns (kept last on purpose).
cols = [
    'deg_C', 'relative_humidity', 'absolute_humidity',
    *(f'sensor_{sensor_no}' for sensor_no in range(1, 6)),
    'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides',
]

In [None]:
sns.pairplot(data=train_df[cols])

### Correlation 

In [None]:
# The last three entries of `cols` are the targets; everything before them is a feature.
feat_cols_list = cols[:-3]
target_cols_list = cols[-3:]

# One heatmap per target: correlation matrix of all features plus that single target.
plt.figure(figsize=(16, 15), dpi=300)
for i, target in enumerate(target_cols_list):
    temp_df = train_df[feat_cols_list].copy()
    temp_df[target] = train_df[target]
    cm = temp_df.corr()
    # Mask the upper triangle (diagonal included) so each pair is shown once.
    # Use the builtin `bool`: the `np.bool` alias was removed in NumPy 1.24.
    mask = np.zeros_like(cm, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    plt.subplot(2, 2, i + 1)
    sns.heatmap(cm, mask=mask, square=True, cmap='coolwarm', linewidths=0.1, annot=True, cbar=False)
    plt.title(f"Features correlation to {target}")

# Lay the figure out once, after every subplot exists.
plt.tight_layout()


In [None]:
!pip install joypy
import joypy

### Distribution of target labels

In [None]:
# Draw one ridge plot per target column, each in its own colour.
color_list = ['b', 'g', 'r']
for target, ridge_color in zip(target_cols_list, color_list):
    joypy.joyplot(
        train_df,
        column=target,
        figsize=(6, 4),
        legend=True,
        color=ridge_color,
        fade=0.3,
    )
    plt.title(f"Distribution of {target}", fontsize=22)

In [None]:
# Overlay the kernel density estimate of every target on a single axes.
plt.figure(figsize=(12, 4))
for target in target_cols_list:
    # `fill=True` replaces the deprecated `shade=True` keyword of seaborn's kdeplot.
    sns.kdeplot(x=train_df[target], fill=True)
# Build the legend once, after all curves are drawn, instead of on every iteration.
plt.legend(target_cols_list)
# Optional zoom on the bulk of the distributions: plt.xlim(-10, 250)

In [None]:
# Scatterplot of sensor_2 vs. target_nitrogen_oxides with a marginal boxplot per variable.
fig = plt.figure(figsize=(16, 10), dpi=80)
grid = plt.GridSpec(4, 4, hspace=0.5, wspace=0.2)

# Main panel in the top-left 3x3 cells, marginal panels to the right and below it.
ax_main = fig.add_subplot(grid[:-1, :-1])
ax_right = fig.add_subplot(grid[:-1, -1], xticklabels=[], yticklabels=[])
ax_bottom = fig.add_subplot(grid[-1, 0:-1], xticklabels=[], yticklabels=[])

# Scatterplot on the main axes. No `cmap` here: without a `c` argument matplotlib
# ignores it and only emits a warning.
ax_main.scatter(x='sensor_2', y='target_nitrogen_oxides', data=train_df, alpha=.4)

# Right margin: vertical boxplot of the variable on the scatter's y-axis.
ax_right.boxplot(x=train_df['target_nitrogen_oxides'])
ax_right.set_xlabel("target_nitrogen_oxides")

# Bottom margin: horizontal boxplot of the variable on the scatter's x-axis.
# Labels are set on the axes objects explicitly; `plt.xlabel` would label whichever
# axes was created last, not necessarily the one just drawn on.
ax_bottom.boxplot(x=train_df['sensor_2'], vert=False)
ax_bottom.set_xlabel("sensor_2")

# Decorations
ax_main.set(title='Scatterplot with Boxplot \n sensor_2 vs. target_nitrogen_oxides',
            xlabel='sensor_2', ylabel='target_nitrogen_oxides');

In [None]:
# Boxplots of every feature, grouped onto three panels by value scale.
fig = plt.figure(figsize=(16, 12), dpi=200)
grid = plt.GridSpec(4, 4, hspace=0.5, wspace=0.2)

# Panels: sensors (main), absolute humidity (full-height right column),
# temperature and relative humidity (bottom row).
ax_main = fig.add_subplot(grid[:-1, :-1])
ax_right = fig.add_subplot(grid[:, -1])
ax_bottom = fig.add_subplot(grid[-1, 0:-1])

sensors = [f'sensor_{sensor_no}' for sensor_no in range(1, 6)]

# (data, axes) pairs, drawn in the same order as the panels were created.
boxplot_panels = [
    (train_df[sensors], ax_main),
    (train_df['absolute_humidity'], ax_right),
    (train_df[['deg_C', 'relative_humidity']], ax_bottom),
]
for panel_data, panel_ax in boxplot_panels:
    sns.boxplot(data=panel_data, ax=panel_ax, palette='coolwarm')

# Decorations
ax_right.set_xlabel("absolute_humidity")
ax_main.set(title='Sensors features boxplot');

We also have a datetime column, so we can check the behavior of our features over time. Let's check our target columns first. In order to do this we need to convert the datetime column from string into a datetime object.


In [None]:
# Parse the `date_time` strings of both datasets into pandas datetime values.
for frame in (train_df, test_df):
    frame['date_time'] = pd.to_datetime(frame['date_time'])

In [None]:
# One stacked panel per target, plotted against time.
color_list = ['blue', 'red', 'green']
plt.figure(figsize=(16, 10), dpi=150)
for panel_no, (target, line_color) in enumerate(zip(target_cols_list, color_list), start=1):
    plt.subplot(3, 1, panel_no)
    sns.lineplot(x=train_df['date_time'], y=train_df[target], color=line_color)

In [None]:

"""
This plot was copied from TPS July 2021 EDA created by Sharlto Cope.
"""
from matplotlib.lines import Line2D

background_color = "#f6f5f5"
train_color = '#fcd12a'
test_color = '#287094'

# Request the high resolution for this figure only. Writing it to
# plt.rcParams['figure.dpi'] would silently apply 600 dpi to every figure created
# afterwards, which makes the later large plots very memory-hungry.
fig = plt.figure(figsize=(10, 8), dpi=600, facecolor=background_color)
gs = fig.add_gridspec(8, 1)
gs.update(wspace=0, hspace=1.5)

# One row per feature; keep the axes in a list rather than injecting names via locals().
axes = []
for row in range(8):
    ax = fig.add_subplot(gs[row, 0])
    ax.set_facecolor(background_color)
    for s in ["top", "right"]:
        ax.spines[s].set_visible(False)
    axes.append(ax)
ax0 = axes[0]

# (first day, day after the last day, shading colour) for each shaded period.
seasons = [
    ("2010-03-10", "2010-06-02", '#ff69b4'),  # spring
    ("2010-06-01", "2010-09-02", '#fcd12a'),  # summer
    ("2010-09-01", "2010-12-02", '#ff9200'),  # autumn
    ("2010-12-01", "2011-03-02", '#287094'),  # winter
    ("2011-03-01", "2011-04-05", '#ff69b4'),  # second spring
]

# Loop-invariant: convert the timestamps once instead of once per feature.
train_time = pd.to_datetime(train_df['date_time'])
test_time = pd.to_datetime(test_df['date_time'])

for ax, col in zip(axes, feat_cols_list):
    sns.lineplot(ax=ax, y=train_df[col], x=train_time, color=train_color)
    sns.lineplot(ax=ax, y=test_df[col], x=test_time, color=test_color)
    ax.set_ylabel('')
    ax.set_xlabel(col, fontsize=5, fontweight='bold')
    ax.tick_params(labelsize=5, width=0.5, length=1.5)
    ax.grid(which='major', axis='x', zorder=0, color='#EEEEEE', linewidth=0.7)
    ax.grid(which='major', axis='y', zorder=0, color='#EEEEEE', linewidth=0.7)
    # Shade each season from zero up to the maximum of the training series.
    col_max = np.max(train_df[col])
    for first_day, end_day, shade_color in seasons:
        days = np.arange(np.datetime64(first_day), np.datetime64(end_day))
        ax.fill_between(days, col_max, color=shade_color, alpha=0.2, zorder=2, linewidth=0)

ax0.text(14660, 80, 'Time Series', fontsize=8, fontweight='bold')
ax0.text(14660, 65, 'Showing time series data starting from train dataset followed by test dataset', fontsize=5)

# Explicit proxy handles tie each label to its line colour; a labels-only legend
# assigns labels by artist order, which put 'test' on the train line.
legend_handles = [Line2D([0], [0], color=train_color), Line2D([0], [0], color=test_color)]
fig.legend(legend_handles, ['train', 'test'], ncol=2, facecolor=background_color,
           edgecolor=background_color, fontsize=4, bbox_to_anchor=(0.2, 0.895))

plt.show()

### Skewness and Kurtosis

![](https://www.researchgate.net/profile/Attila-Bonyar/publication/298415862/figure/fig1/AS:340236723867648@1458130164255/Illustration-of-the-skewness-and-kurtosis-values-and-how-they-correlate-with-the-shape-of.png)

In [None]:
from scipy.stats import kurtosis, skew

def skew_and_kurtosis_table(df):
    sk_dict = {"Skewness": skew(df),
               "Kurtosis": kurtosis(df)}
    
    sk_df = pd.DataFrame(sk_dict, index=df.columns).style.background_gradient(subset=["Skewness", "Kurtosis"])
    return sk_df

# Create a table for training dataset
skew_and_kurtosis_table(train_df.iloc[:,1:])

In [None]:
# Create a table for test dataset
skew_and_kurtosis_table(test_df.iloc[:,1:])

### Variance Inflation Factor

A variance inflation factor (VIF) detects multicollinearity in regression analysis. Multicollinearity is when there's correlation between predictors (i.e. independent variables) in a model. The VIF estimates how much the variance of a regression coefficient is inflated due to multicollinearity in the model. VIFs are calculated by taking a predictor and regressing it against every other predictor in the model.

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler

vif = pd.DataFrame()

# Normalize data first: the scaler is fitted on train only and then applied to test.
sc = StandardScaler()
scaled_train_df = sc.fit_transform(train_df[feat_cols_list])
scaled_test_df = sc.transform(test_df[feat_cols_list])


def _vif_per_feature(scaled_features):
    """Return the VIF of every column of `scaled_features` (a 2-D array).

    statsmodels' `variance_inflation_factor` does not add an intercept to the
    auxiliary regressions. Without one, any column whose mean is not exactly zero
    (as in the test set, which is scaled with the train statistics) gets an
    uncentered R^2 and therefore an inflated VIF. A constant column is prepended so
    both datasets are measured the same way.
    """
    exog = np.column_stack([np.ones(len(scaled_features)), scaled_features])
    # Column 0 is the intercept, so the features occupy indices 1..n.
    return [variance_inflation_factor(exog, col_idx) for col_idx in range(1, exog.shape[1])]


vif['variables'] = feat_cols_list
vif['vif_train'] = _vif_per_feature(scaled_train_df)
vif['vif_test'] = _vif_per_feature(scaled_test_df)
vif.style.background_gradient(cmap='magma')

In general, a VIF above 10 indicates high correlation and is cause for concern. Some authors suggest a more conservative level of 2.5 or above. Sometimes a high VIF is no cause for concern at all. In this case the VIFs of most features are lower than 10. There is one feature, 'sensor_2', in the training set which has a VIF of 12, and two features ('absolute_humidity', 'sensor_4') in the test dataset which have VIFs of 45 and 46 respectively. The VIFs of 'deg_C' and 'sensor_2' in the test dataset are also high. One way of dealing with multicollinearity is to use PCA to reduce the number of features, or simply to delete one of the correlated features.

In [None]:
!pip install regressors

### P-value and t-value

In [None]:
from sklearn.linear_model import LinearRegression
from regressors import stats

def stats_summary(df):
    """Fit one linear regression per target on `df` and print its summary table.

    Args:
        df: DataFrame containing every column named in the notebook-level
            `feat_cols_list` and `target_cols_list`.

    Returns:
        dict mapping each target column name to its fitted LinearRegression.
    """
    ols_dict = {}
    # Use the frame that was passed in, not the global train_df, so the fitted
    # models and the printed statistics always describe the same data.
    X = df[feat_cols_list]
    for target in target_cols_list:
        y = df[target]
        ols = LinearRegression()
        ols.fit(X, y)
        ols_dict[target] = ols
        # to print summary table:
        print(f"\n========== SUMMARY STATISTICS of TRAIN DATASET to {target.upper()} ==============")
        stats.summary(ols, X, y, feat_cols_list)
    return ols_dict
        
ols_models_dict = stats_summary(train_df)

In [None]:
# P-values of the feature coefficients for the third target (target_cols_list[2]).
print(f"======= {target_cols_list[2]}=========")
ols = LinearRegression()
ols.fit(train_df[feat_cols_list], train_df[target_cols_list[2]])
# regressors' coef_pval returns the intercept's p-value first, followed by one
# value per feature (the same order in which stats.summary lists `_intercept`).
# Drop the FIRST entry so each remaining value lines up with its feature name;
# slicing with [:-1] would shift every p-value by one feature and lose the last.
pd.DataFrame(stats.coef_pval(ols, train_df[feat_cols_list], train_df[target_cols_list[2]])[1:],
             index=feat_cols_list, columns=['P-value']).style.background_gradient(cmap='magma_r')

🔑**Note:** The p-value for each independent variable tests the null hypothesis that the variable has no correlation with the dependent variable. A small p-value is an indication that the null hypothesis is false. It is good practice to decide in advance of the test how small a p-value is required to reject the null hypothesis. P-value and t-value are inextricably linked. The t-value measures the size of the difference relative to the variation in your sample data. The greater the magnitude of T, the greater the evidence against the null hypothesis. This means there is greater evidence that there is a significant difference. The closer T is to 0, the more likely there isn't a significant difference.


The regression output for target_nitrogen_oxides shows that most variables are statistically significant because their p-values equal 0.000. On the other hand, 'sensor_1' is not statistically significant because its p-value (0.99) is greater than the usual significance level of 0.05, so we might consider dropping this column.

### Residuals

In [None]:
def plot_residuals(models_dict, df, feats, targets):
    """Scatter the residuals of each target's model against the observed target values.

    Args:
        models_dict: mapping from target column name to a fitted model exposing `predict`.
        df: DataFrame holding the feature and target columns.
        feats: list of feature column names passed to `predict`.
        targets: list of target column names; one subplot is drawn per entry.
    """
    plt.figure(figsize=(18, 5))
    features = df[feats]
    # Iterate over the `targets` argument (not the global target list) so the
    # function works for whatever subset of targets the caller passes.
    for i, target in enumerate(targets):
        # Calculate predictions
        y_pred = models_dict[target].predict(features)
        residuals = pd.Series(df[target] - y_pred, name="residuals_1")
        # Plot scatterplot with a reference line at zero residual
        plt.subplot(1, len(targets), i + 1)
        sns.scatterplot(x=df[target], y=residuals)
        plt.axhline(y=0, color='red', linestyle='--')

In [None]:
plot_residuals(models_dict= ols_models_dict, 
               df= train_df,
               feats= feat_cols_list,
               targets= target_cols_list)

Regression residuals are actually estimates of the true error (y_true - y_prediction), just like the regression coefficients are estimates of the true population coefficients. Using residual plots, you can assess whether the observed error (residuals) is consistent with stochastic error (a fancy word for random):
 1. The residuals should fall in a symmetrical pattern and have a constant spread throughout the range. 
 2. The non-random pattern in the residuals indicates that the deterministic portion (predictor variables) of the model is not capturing some explanatory information that is “leaking” into the residuals. Possibilities include:
     - A missing variable
     - A missing higher-order term of a variable in the model to explain the curvature
     - A missing interaction between terms already in the model
     
Identifying and fixing the problem so that the predictors now explain the information that they missed before should produce a good-looking set of residuals!

In addition to the above, here are two more specific ways that predictive information can sneak into the residuals:
- The residuals should not be correlated with another variable. 
- Adjacent residuals should not be correlated with each other (autocorrelation). If you can use one residual to predict the next residual, there is some predictive information present that is not captured by the predictors. Typically, this situation involves time-ordered observations. For example, if a residual is more likely to be followed by another residual that has the same sign, adjacent residuals are positively correlated. You can include a variable that captures the relevant time-related information, or use a time series analysis. In regression, you can perform the Durbin-Watson test to test for autocorrelation.

In [None]:
def plot_residuals_dist(models_dict, df, feats, targets):
    """Plot a histogram of the residuals of each target's model.

    Args:
        models_dict: mapping from target column name to a fitted model exposing `predict`.
        df: DataFrame holding the feature and target columns.
        feats: list of feature column names passed to `predict`.
        targets: list of target column names; one subplot is drawn per entry.
    """
    plt.figure(figsize=(18, 4))
    features = df[feats]
    # Iterate over the `targets` argument rather than the global target list so
    # the subplot grid always matches what the caller asked for.
    for i, target in enumerate(targets):
        y_pred = models_dict[target].predict(features)
        residuals = pd.Series(df[target] - y_pred, name="residuals")
        plt.subplot(1, len(targets), i + 1)
        sns.histplot(x=residuals, bins=40, color='red')
        plt.title(f"{target}")
        
plot_residuals_dist(models_dict=ols_models_dict,
                    df=train_df,
                    feats=feat_cols_list, 
                    targets=target_cols_list)

In [None]:
def probability_plot(models_dict, df, feats, targets):
    """Draw a normal probability plot of the residuals of each target's model.

    Args:
        models_dict: mapping from target column name to a fitted model exposing `predict`.
        df: DataFrame holding the feature and target columns.
        feats: list of feature column names passed to `predict`.
        targets: list of target column names; one subplot is drawn per entry.
    """
    # Import the submodule explicitly: `import scipy as sp` alone does not guarantee
    # that `sp.stats` is loaded. Aliased so it cannot clash with other `stats` names.
    from scipy import stats as scipy_stats

    plt.figure(figsize=(20, 4))
    features = df[feats]
    for i, target in enumerate(targets):
        y_pred = models_dict[target].predict(features)
        residuals = pd.Series(df[target] - y_pred, name="residuals_1")
        # Size the grid from `targets` instead of assuming exactly three plots.
        ax = plt.subplot(1, len(targets), i + 1)
        scipy_stats.probplot(residuals, plot=ax)
        plt.title(f"{target} probability plot")
        
probability_plot(models_dict=ols_models_dict,
                 df = train_df,
                 feats=feat_cols_list,
                 targets=target_cols_list)

The probability plot is a graphical technique for assessing whether or not a data set follows a given distribution such as the normal. The data are plotted against a theoretical distribution in such a way that the points should form approximately a straight line. Departures from this straight line indicate departures from the specified distribution.
It indicates that your distribution has:
- Right Skew - If the plotted points appear to bend up and to the left of the normal line that indicates a long tail to the right. 
- Left Skew - If the plotted points bend down and to the right of the normal line that indicates a long tail to the left.

### Check stationarity of a Time Series

I feel like there is one more thing I need to add to complete my EDA. Most time-series models (let's call a time series TS) work on the assumption that the TS is stationary. A TS is said to be stationary if its statistical properties, such as mean and variance, remain constant over time. Intuitively, we can say that if a TS has a particular behaviour over time, there is a very high probability that it will follow the same behaviour in the future. Also, the theories related to stationary series are more mature and easier to implement than those for non-stationary series.

Stationarity is defined using very strict criterion. However, for practical purposes we can assume the series to be stationary if it has constant statistical properties over time, ie. the following:

- constant mean
- constant variance
- an autocovariance that does not depend on time.

More formally, we can check stationarity using the following:

1. Plotting Rolling Statistics: We can plot the moving average or moving variance and see if it varies with time, this is more of a visual technique.
2. Dickey-Fuller Test: This is one of the statistical tests for checking stationarity. Here the null hypothesis is that the TS is non-stationary. The test results comprise a Test Statistic and some Critical Values for different confidence levels. If the 'Test Statistic' is less than the 'Critical Value', we can reject the null hypothesis and say that the series is stationary.

For more information and the ways to make series stationary visit: https://www.analyticsvidhya.com/blog/2016/02/time-series-forecasting-codes-python/

In principle we do not need to check for stationarity, nor correct for it, when we are using an LSTM. The LSTM method is preferable to other existing algorithms because an LSTM network is able to learn the non-linear and non-stationary nature of a time series, which reduces forecasting error. However, if the data is stationary, it will help performance and make it easier for the neural network to learn. So, to finish my EDA, I will perform this test.

In [None]:
# Feature engineering: derive calendar parts from `date_time` for both datasets.
# The columns are added in the order year, month, hour, day.
for frame in (train_df, test_df):
    timestamps = frame['date_time'].dt
    for part in ('year', 'month', 'hour', 'day'):
        frame[part] = getattr(timestamps, part)

In [None]:
from statsmodels.tsa.stattools import adfuller

# Run the augmented Dickey-Fuller test on sensor_2 and collect the results in one Series.
print('Results of Dickey-Fuller Test:')
dftest = adfuller(train_df['sensor_2'], autolag='AIC')

# The first four entries of the result are the headline numbers named below.
adf_labels = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']
dfoutput = pd.Series(dftest[:4], index=adf_labels)

# Entry 4 is a mapping of critical values; append one row per entry.
for level, critical_value in dftest[4].items():
    dfoutput[f'Critical Value ({level})'] = critical_value
dfoutput

How can we interpret these results?

The **null hypothesis (H0)** of the test is that the time series can be represented by a unit root, that it **is not stationary** (has some time-dependent structure). The **alternate hypothesis (H1)** (rejecting the null hypothesis) is that the time series **is stationary**.

We interpret this result using the p-value from the test. A p-value below a threshold (such as 5% or 1%) suggests we reject the null hypothesis (the series is stationary); otherwise, a p-value above the threshold suggests we fail to reject the null hypothesis (the series is non-stationary).

The more negative this statistic, the more likely we are to reject the null hypothesis (we have a stationary dataset). We can see that our statistic value of -9 is less than the critical value of -3.431 at 1%. This suggests that we can reject the null hypothesis with a significance level of less than 1% (i.e. a low probability that the result is a statistical fluke). Rejecting the null hypothesis means that the process has no unit root, and in turn that the time series is stationary or does not have time-dependent structure.

### Rolling Statistics for Training and Test Dataset

In [None]:
def rolling_statistics(df, f_names):
    """
    Plot rolling statistics for the given columns.

    For every column in `f_names` one subplot shows the raw series together with
    its rolling mean and rolling standard deviation over a window of 24 rows,
    all plotted against `df['date_time']`.

    Args:
        df: DataFrame with a 'date_time' column and every column in `f_names`.
        f_names: list of column names to plot, one stacked subplot per entry.
    """
    # The target list has fewer entries than the feature list, so it gets a shorter figure.
    size = (25, 15) if f_names == target_cols_list else (25, 25)

    # Create the figure ONCE. Calling plt.figure inside the loop opened a new
    # full-size figure per column, each holding a single thin subplot, which
    # wastes a lot of memory and defeats the stacked layout.
    plt.figure(figsize=size)

    for i, col in enumerate(f_names):
        rolling_window = df[col].rolling(window=24)
        roll_mean = rolling_window.mean()
        roll_std = rolling_window.std()

        plt.subplot(len(f_names), 1, i + 1)
        sns.lineplot(x='date_time', y=col, data=df, label=col)
        sns.lineplot(x='date_time', y=roll_mean, data=df, label='roll_mean')
        sns.lineplot(x='date_time', y=roll_std, data=df, label='roll_std')
        plt.title(f"Rolling statistics for '{col}' feature")
        # Park the legend just outside the right edge so it does not cover the data.
        plt.legend(loc='center right', bbox_to_anchor=(1.10, 0.5))

#### Training dataset rolling statistics

In [None]:
rolling_statistics(df=train_df, 
                   f_names=feat_cols_list)

#### Test dataset rolling statistics

In [None]:
rolling_statistics(df=test_df,
                   f_names=feat_cols_list)

#### Rolling statistics for targets columns

In [None]:
rolling_statistics(df=train_df,
                   f_names=target_cols_list)

## Modeling Neural Network with LSTM

 Because of technical problems with running this notebook (I think it is a RAM issue), I cannot finish everything I intended to do in one notebook. I had to start a new notebook where I will complete the modelling part. This is the link to part II of this notebook: https://www.kaggle.com/godzill22/tps-07-simply-rnn-and-lstm