In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

In [None]:
import seaborn as sns 
import matplotlib.pyplot as plt 
from colorama import Fore
import matplotlib.dates as mdates

from sklearn.metrics import mean_absolute_error, mean_squared_error
import math

import warnings # Suppress warnings
# With no category given, this ignores every warning for the rest of the session.
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split

I will build a model to predict the depth to groundwater of an aquifer located in Petrignano, Italy.
The questions I want to answer are:
* What is the future depth to groundwater of a well belonging to the aquifer in Petrignano over the next quarter?
* Which features influence the water availability in the aquifer?

Petrignano Aquifer Description: The wells field of the alluvial plain between Ospedalicchio di Bastia Umbra and Petrignano is fed by three underground aquifers separated by low permeability septa. The aquifer can be considered a water table groundwater and is also fed by the Chiascio river. The groundwater levels are influenced by the following parameters: rainfall, depth to groundwater, temperatures and drainage volumes, level of the Chiascio river.

In [None]:
# Load the raw Petrignano aquifer measurements and preview the first rows.
df = pd.read_csv(filepath_or_buffer="../input/acea-water-prediction/Aquifer_Petrignano.csv")
df.head(5)

In [None]:
from datetime import datetime, date 

# Parse the day/month/year strings in 'Date' into pandas timestamps.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
df.head(5)

In [None]:
# Report the timestamps of the first and last rows (in file order) and the frame dimensions.
date_values = df['Date'].values
print('The earliest date: \t%s' % date_values[0])
print('The latest date: \t%s' % date_values[-1])
print(df.shape)

In [None]:
# Every column after the first one is treated as a feature (the first is presumably 'Date').
feature_col = df.columns[1:].tolist()
feature_col

# 1 Data Visualization #

In [None]:
# One panel per measured quantity; quantities recorded under two column names share a panel.
# Each entry is (panel name used for the title and y-label, columns drawn in that panel).
panels = [
    ('Rainfall_Bastia_Umbra', ['Rainfall_Bastia_Umbra']),
    ('Depth_to_Groundwater', ['Depth_to_Groundwater_P24', 'Depth_to_Groundwater_P25']),
    ('Temperature', ['Temperature_Bastia_Umbra', 'Temperature_Petrignano']),
    ('Volume_C10_Petrignano', ['Volume_C10_Petrignano']),
    ('Hydrometry_Fiume_Chiascio_Petrignano', ['Hydrometry_Fiume_Chiascio_Petrignano']),
]
series_colors = ['dodgerblue', 'orange']

# The number of rows follows the panel list instead of being derived from len(feature_col) - 2.
f, ax = plt.subplots(nrows=len(panels), ncols=1, figsize=(15, 25))

for panel_ax, (panel_name, columns) in zip(ax, panels):
    for column, color in zip(columns, series_colors):
        # Only label (and therefore add a legend entry for) series that share a panel.
        legend_kwargs = {'label': column} if len(columns) > 1 else {}
        sns.lineplot(
            # Dates of missing readings are masked and the readings themselves set to +inf,
            # presumably so that the line is not drawn across gaps.
            x=df['Date'].where(df[column].notnull()),
            y=df[column].fillna(np.inf),
            ax=panel_ax,
            color=color,
            **legend_kwargs,
        )
    panel_ax.set_title('Feature: {}'.format(panel_name), fontsize=14)
    # The y-label is the panel name itself, which fixes the misspelled
    # 'Hydrometry_Fiume_Chiascio_Petrignanoo' label of the last panel.
    panel_ax.set_ylabel(ylabel=panel_name, fontsize=14)

plt.tight_layout()
plt.show()

In [None]:
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8))

# Pairwise correlation between all raw feature columns.
corrmat = df[feature_col].corr()
# Mask the upper triangle (diagonal included) so each pair is shown once.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corrmat, dtype=bool))
sns.heatmap(corrmat, annot=True, vmin=-1, vmax=1, cmap='coolwarm_r', ax=ax, mask=mask)
ax.set_title('Correlation Matrix of Features', fontsize=16)

plt.show()

Temperature Petrignano and Temperature Bastia Umbra are nearly identical, and Temperature Petrignano has missing values, so I will keep Temperature Bastia Umbra.

'Depth_to_Groundwater_P24' and 'Depth_to_Groundwater_P25' are also nearly identical. For simplicity, I will keep 'Depth_to_Groundwater_P25' as the target.

Drainage volume has the highest correlation with the target-Depth_to_Groundwater.

In [None]:
# Percentage of missing values per feature, largest first.
NaN_values = (df[feature_col].isnull().sum() / len(df) * 100).sort_values(ascending = False)

fig, ax = plt.subplots(figsize = (10, 4))
ax.set_title('Petrignano Aquifer: NaN values (%)', size = 15, fontweight = 'bold', fontfamily = 'serif')
ax.spines['bottom'].set_visible(True)
ax.spines['left'].set_visible(True)
# The palette is materialised as a list: a bare `reversed(...)` iterator can only be consumed once.
sns.barplot(x = NaN_values, y = NaN_values.index, edgecolor = 'black', ax = ax,
            palette = list(reversed(sns.color_palette("viridis", len(NaN_values.index)))))
ax.set_xlim((0, 100))
# Pin the tick positions before relabelling them; otherwise the labels are attached to
# whatever ticks the automatic locator happens to choose.
ax.set_xticks(list(range(0, 101, 20)))
ax.set_xticklabels(range(0, 101, 20), fontfamily = 'serif')
ax.set_yticklabels(NaN_values.index, fontfamily = 'serif')
ax.grid(axis = 'x', linestyle = '--', alpha = 0.9)
plt.show()

In [None]:
# Keep only the rows that have a rainfall reading (rainfall covers a shorter period than the other columns).
has_rainfall = df['Rainfall_Bastia_Umbra'].notna()
df = df.loc[has_rainfall].reset_index(drop=True)
# Keep a single depth column as the target and a single temperature column.
df = df.drop(columns=['Depth_to_Groundwater_P24', 'Temperature_Petrignano'])

In [None]:
# Simplify column names.
# Columns are renamed by name rather than by assigning a positional list, so a different
# column order in the CSV cannot silently attach the wrong label to a column.
df = df.rename(columns={
    'Rainfall_Bastia_Umbra': 'Rainfall',
    'Depth_to_Groundwater_P25': 'Depth_to_Groundwater',
    'Temperature_Bastia_Umbra': 'Temperature',
    'Volume_C10_Petrignano': 'Drainage_Volume',
    'Hydrometry_Fiume_Chiascio_Petrignano': 'River_Hydrometry',
})
targets = ['Depth_to_Groundwater']
# Everything that is not a target, which as written still includes 'Date'.
features = [feature for feature in df.columns if feature not in targets]
df.head()

**Features**:

* Rainfall indicates the quantity of rain falling (mm)
* Temperature indicates the temperature (°C)
* Drainage_Volume indicates the volume of water taken from the drinking water treatment plant (m^3 )
* River_Hydrometry indicates the level of the Chiascio river (m)


**Target:**

Depth to Groundwater indicates the groundwater level (m from the ground floor)

In [None]:
# Same report as before, now for the trimmed frame: first row's date, last row's date, dimensions.
trimmed_dates = df['Date'].values
print('The earliest date: \t%s' % trimmed_dates[0])
print('The latest date: \t%s' % trimmed_dates[-1])
print(df.shape)

# 2 Data Preprocessing

## 2.1 Chronological Order and Equidistant Timestamps
The data should be in **chronological order** and the **timestamps should be equidistant** in time series. 

In [None]:
# Put the rows in chronological order, then measure the gap between consecutive timestamps.
df = df.sort_values(by='Date')
df['Time_Interval'] = df['Date'].diff()

df[['Date', 'Time_Interval']].head()

In [None]:
# Count how often each gap length occurs, then drop the helper column again.
print(df['Time_Interval'].value_counts())
df = df.drop(columns='Time_Interval')

The time interval is one day and the data is already in chronological order. 

## 2.2 Handling Missing Data

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
f, ax = plt.subplots(nrows=2, ncols=1, figsize=(16, 16))

# Keep untouched copies for plotting, then treat zero readings in both columns as missing.
old_hydrometry = df['River_Hydrometry'].copy()
old_drainage = df['Drainage_Volume'].copy()
df['River_Hydrometry'] = df['River_Hydrometry'].replace(0, np.nan)
df['Drainage_Volume'] = df['Drainage_Volume'].replace(0, np.nan)

# Top panel: river hydrometry before (orange) and after (blue) the replacement.
hydro_ax = ax[0]
sns.lineplot(x=df['Date'], y=old_hydrometry, ax=hydro_ax, color='darkorange', label='missing')
sns.lineplot(x=df['Date'], y=df['River_Hydrometry'].fillna(np.inf), ax=hydro_ax, color='dodgerblue', label='modified')
hydro_ax.set_title('Feature: River_Hydrometry', fontsize=14)
hydro_ax.set_ylabel(ylabel='River_Hydrometry', fontsize=14)
hydro_ax.set_xlim([date(2009, 1, 1), date(2020, 6, 30)])

# Bottom panel: drainage volume before and after, spanning the frame's first to last row date.
drain_ax = ax[1]
sns.lineplot(x=df['Date'], y=old_drainage, ax=drain_ax, color='darkorange', label='missing')
sns.lineplot(x=df['Date'], y=df['Drainage_Volume'].fillna(np.inf), ax=drain_ax, color='dodgerblue', label='modified')
drain_ax.set_title('Feature: Drainage_Volume', fontsize=14)
drain_ax.set_ylabel(ylabel='Drainage_Volume', fontsize=14)
drain_ax.set_xlim([df['Date'].values[0], df['Date'].values[-1]])

In [None]:
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(16,5))
# One row per column of df, one cell per record; marked cells are missing values.
# The axes are passed explicitly instead of relying on the current-axes state.
sns.heatmap(df.T.isna(), cmap='Blues', ax=ax)
ax.set_title('Fields with Missing Values', fontsize=16)
# Enlarge the y tick labels. `tick.label` was removed in Matplotlib 3.8, so the
# font size is set through `tick_params` instead of looping over the ticks.
ax.tick_params(axis='y', labelsize=14)
plt.show()

Filling NaNs with the interpolated values.


In [None]:
f, ax = plt.subplots(nrows=3, ncols=1, figsize=(15, 12))

# For each column: (name, date window to zoom into).
interpolation_panels = [
    ('Drainage_Volume', [date(2019, 5, 1), date(2019, 10, 1)]),
    ('River_Hydrometry', [date(2009, 1, 1), date(2020, 6, 30)]),
    ('Depth_to_Groundwater', [date(2012, 1, 1), date(2015, 1, 1)]),
]

for panel, (column, x_limits) in zip(ax, interpolation_panels):
    # Interpolated series in orange underneath, original series (gaps pushed to +inf) in blue on top.
    sns.lineplot(x=df['Date'], y=df[column].interpolate(), ax=panel, color='darkorange', label = 'interpolate')
    sns.lineplot(x=df['Date'], y=df[column].fillna(np.inf), ax=panel, color='dodgerblue', label = 'original')
    panel.set_title(f'Feature: {column}', fontsize=14)
    panel.set_ylabel(ylabel=column, fontsize=14)
    panel.set_xlim(x_limits)

plt.tight_layout()
plt.show()

In [None]:
# Fill the gaps in these three columns using pandas' default interpolation.
for column in ['Drainage_Volume', 'River_Hydrometry', 'Depth_to_Groundwater']:
    df[column] = df[column].interpolate()

## 2.3 Resampling

To obtain more information from the data, I will resample the data.

In [None]:
# 4x4 grid: one column per feature, one row per sampling period (daily, weekly, monthly, annual).
fig, ax = plt.subplots(ncols=4, nrows=4, sharex=True, figsize=(16,12))

x_limits = [date(2009, 1, 1), date(2020, 6, 30)]

# Grid column 0: rainfall, accumulated per period and drawn as bars.
# Each entry: (resample rule, or None for the unresampled data; panel title; bar width).
rainfall_rows = [
    (None, 'Daily Rainfall (Acc.)', 5),
    ('7D', 'Weekly Rainfall (Acc.)', 10),
    ('M', 'Monthly Rainfall (Acc.)', 15),
    ('12M', 'Annual Rainfall (Acc.)', 20),
]
for row, (rule, title, bar_width) in enumerate(rainfall_rows):
    if rule is None:
        resampled_df = df[['Date', 'Rainfall']]
    else:
        resampled_df = df[['Date', 'Rainfall']].resample(rule, on='Date').sum().reset_index(drop=False)
    ax[row, 0].bar(resampled_df['Date'], resampled_df['Rainfall'], width=bar_width, color='dodgerblue')
    ax[row, 0].set_title(title, fontsize=14)
    ax[row, 0].set_xlim(x_limits)

# Grid columns 1-3: line plots.
# Each entry: (grid column, source column, aggregation, titles for the four rows).
# - Temperature is averaged, so its titles say "(Mean)" rather than the former "(Acc.)".
# - River_Hydrometry is a level reading, so it is averaged here as well (it used to be summed),
#   matching how it is aggregated when the frame is downsampled to weekly data.
line_columns = [
    (1, 'Temperature', 'mean',
     ['Daily Temperature', 'Weekly Temperature (Mean)',
      'Monthly Temperature (Mean)', 'Annual Temperature (Mean)']),
    (2, 'Drainage_Volume', 'sum',
     ['Drainage_Volume', 'Weekly Drainage Volume',
      'Monthly Drainage Volume', 'Annual Drainage Volume']),
    (3, 'River_Hydrometry', 'mean',
     ['River_Hydrometry', 'Weekly River_Hydrometry',
      'Monthly River_Hydrometry', 'Annual River_Hydrometry']),
]
line_rules = [None, '7D', 'M', '365D']

for grid_col, column, how, titles in line_columns:
    for row, (rule, title) in enumerate(zip(line_rules, titles)):
        if rule is None:
            resampled_df = df[['Date', column]]
        else:
            resampled_df = df[['Date', column]].resample(rule, on='Date').agg(how).reset_index(drop=False)
        # x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
        sns.lineplot(x=resampled_df['Date'], y=resampled_df[column], color='dodgerblue', ax=ax[row, grid_col])
        ax[row, grid_col].set_title(title, fontsize=14)
        ax[row, grid_col].set_xlim(x_limits)

# Give all temperature panels the same vertical range so the rows are comparable.
for row in range(4):
    ax[row, 1].set_ylim([-5, 35])

plt.show()

Weekly data seems to be sufficient to show the cycle of data. Therefore, I will downsample the data to a weekly basis.

In [None]:
# Downsample to 7-day bins in a single pass: rainfall is summed per bin,
# every other column is averaged. Column order matches the previous layout
# (Date, the four averaged columns, then Rainfall).
weekly_aggregation = {
    'Depth_to_Groundwater': 'mean',
    'Temperature': 'mean',
    'Drainage_Volume': 'mean',
    'River_Hydrometry': 'mean',
    'Rainfall': 'sum',
}
df_downsampled = (
    df.resample('7D', on='Date')
      .agg(weekly_aggregation)
      .reset_index(drop=False)
)

# From here on `df` refers to the weekly frame.
df = df_downsampled

In [None]:
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8))

original_cols = ['Depth_to_Groundwater', 
                 'Temperature',
                 'Drainage_Volume', 'River_Hydrometry','Rainfall']

# Correlation between the target and the four features on the weekly data.
corrmat = df[original_cols].corr()
# Mask the upper triangle (diagonal included) so each pair is shown once.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corrmat, dtype=bool))

sns.heatmap(corrmat, annot=True, vmin=-1, vmax=1, cmap='coolwarm_r', ax=ax, mask=mask)
ax.set_title('Correlation Matrix of Features', fontsize=16)

plt.show()

* Drainage_Volume still has the highest correlation coefficient with the target. River_Hydrometry has a large positive correlation with it. When more water is taken from the drinking water treatment plant, the ground-water level is low.  
* River_Hydrometry has a higher correlation with the target after preprocessing. Note that temperature has a negative correlation with it. When day is hot, ground-water level is low since water demand may increase. Rainfall has a positive correlation with it, which means that ground-water level increases when rains.

# 3 Feature Engineering

## 3.1 Decompose time series

To obtain the seasonality and trend, I will decompose the data. 

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

core_columns = ['Rainfall', 'Temperature', 'Drainage_Volume', 'River_Hydrometry', 'Depth_to_Groundwater']

# Additive decomposition with a 52-sample period on the weekly data.
# The trend and seasonal parts are stored next to the source column as
# '<column>_trend' and '<column>_seasonal'.
for column in core_columns:
    decomp = seasonal_decompose(df[column], model='additive', period=52, extrapolate_trend='freq')
    df[column + '_trend'] = decomp.trend
    df[column + '_seasonal'] = decomp.seasonal

In [None]:
f, ax = plt.subplots(nrows=5, ncols=1, figsize=(15, 12))
f.suptitle('Trend Components of Features', fontsize=16)

# One panel per core column showing its extracted trend over the whole period.
for panel, column in zip(ax, core_columns):
    sns.lineplot(x=df['Date'], y=df[f'{column}_trend'], ax=panel, color='dodgerblue')
    panel.set_ylabel(ylabel=column, fontsize=14)
    panel.set_xlim([date(2009, 1, 1), date(2020, 6, 30)])

plt.tight_layout()
plt.show()

* Ground-water depth decreased significantly from 2011-2012, accompanied by a decrease in drainage volume. This might be related to the decrease in rainfall from 2010 till 2012. 
* From late 2012 to 2015, ground-water depth increased significantly, associated with an increase in rainfall. 
* Ground-water depth decreased continuously from 2015 and onwards.

In [None]:
f, ax = plt.subplots(nrows=5, ncols=1, figsize=(15, 12))
f.suptitle('Seasonal Components of Features', fontsize=16)

# One panel per core column showing its seasonal component, zoomed to 2010-2014.
for panel, column in zip(ax, core_columns):
    sns.lineplot(x=df['Date'], y=df[f'{column}_seasonal'], ax=panel, color='dodgerblue')
    panel.set_ylabel(ylabel=column, fontsize=14)
    panel.set_xlim([date(2010, 1, 1), date(2014, 1, 1)])

plt.tight_layout()
plt.show()

* Depth_to_Groundwater: reaches its maximum around May/June and its minimum around November
* Drainage_volume: reaches its maximum around May/June and its minimum around July.
* River_Hydrometry: reaches its maximum around February/March and its minimum around September
* Temperature: reaches its maximum around August and its minimum around January

## 3.2 Shift Data ##
Now I want to check if there was a lead/lag correlation among data.

In [None]:
weeks_in_month = 4

# Column suffix -> shift in months. Negative values pull later rows up to the
# current row ('shift_b_*'); positive values push earlier rows down.
month_offsets = {
    'shift_b_3m': -3,
    'shift_b_2m': -2,
    'shift_b_1m': -1,
    'shift_1m': 1,
    'shift_2m': 2,
    'shift_3m': 3,
}

for column in core_columns:
    for suffix, months in month_offsets.items():
        df[f'{column}_{suffix}'] = df[column].shift(months * weeks_in_month)

In [None]:
# --- Reference figure: correlations between the unshifted weekly columns ---
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 6))

original_cols = ['Depth_to_Groundwater', 
                 'Temperature',
                 'Drainage_Volume', 'River_Hydrometry','Rainfall']

corrmat = df[original_cols].corr()
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corrmat, dtype=bool))

sns.heatmap(corrmat, annot=True, vmin=-1, vmax=1, cmap='coolwarm_r', ax=ax, mask=mask)
ax.set_title('Correlation Matrix of Original Features', fontsize=14)

# --- 2x2 grid: the target against the six shifted copies of each feature ---
shift_suffixes = ['shift_b_3m', 'shift_b_2m', 'shift_b_1m', 'shift_1m', 'shift_2m', 'shift_3m']
shifted_features = ['Temperature', 'Drainage_Volume', 'River_Hydrometry', 'Rainfall']

f, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))
# `ax.flat` walks the grid row by row, giving the panel order
# Temperature, Drainage_Volume, River_Hydrometry, Rainfall.
for panel, feature in zip(ax.flat, shifted_features):
    shifted_cols = ['Depth_to_Groundwater'] + [f'{feature}_{suffix}' for suffix in shift_suffixes]
    corrmat = df[shifted_cols].corr()
    # Every panel now builds its own mask instead of one panel silently reusing the previous one.
    mask = np.triu(np.ones_like(corrmat, dtype=bool))

    sns.heatmap(corrmat, annot=True, vmin=-1, vmax=1, cmap='coolwarm_r', ax=panel, mask=mask)
    panel.set_title(f'Correlation Matrix of {feature} Features', fontsize=14)

plt.tight_layout()
plt.show()

In [None]:
weeks_in_month = 4

# Same lead/lag offsets as for the raw columns, applied to the seasonal components.
# Column suffix -> shift in months (negative pulls later rows up to the current row).
seasonal_month_offsets = {
    'shift_b_3m': -3,
    'shift_b_2m': -2,
    'shift_b_1m': -1,
    'shift_1m': 1,
    'shift_2m': 2,
    'shift_3m': 3,
}

for column in core_columns:
    seasonal_name = f'{column}_seasonal'
    for suffix, months in seasonal_month_offsets.items():
        df[f'{seasonal_name}_{suffix}'] = df[seasonal_name].shift(months * weeks_in_month)

In [None]:
# --- Reference figure: correlations between the unshifted seasonal components ---
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 6))

original_cols = ['Depth_to_Groundwater_seasonal', 
                 'Temperature_seasonal',
                 'Drainage_Volume_seasonal', 'River_Hydrometry_seasonal','Rainfall_seasonal']

corrmat = df[original_cols].corr()
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corrmat, dtype=bool))

sns.heatmap(corrmat, annot=True, vmin=-1, vmax=1, cmap='coolwarm_r', ax=ax, mask=mask)
# This figure shows the seasonal components, so the title says so (it used to read "Orignal Features").
ax.set_title('Correlation Matrix of Seasonal Features', fontsize=14)

# --- 2x2 grid: the seasonal target against the six shifted seasonal copies of each feature ---
shift_suffixes = ['shift_b_3m', 'shift_b_2m', 'shift_b_1m', 'shift_1m', 'shift_2m', 'shift_3m']
shifted_features = ['Temperature', 'Drainage_Volume', 'River_Hydrometry', 'Rainfall']

f, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))
# `ax.flat` walks the grid row by row, giving the panel order
# Temperature, Drainage_Volume, River_Hydrometry, Rainfall.
for panel, feature in zip(ax.flat, shifted_features):
    shifted_cols = ['Depth_to_Groundwater_seasonal'] + [
        f'{feature}_seasonal_{suffix}' for suffix in shift_suffixes
    ]
    corrmat = df[shifted_cols].corr()
    mask = np.triu(np.ones_like(corrmat, dtype=bool))

    sns.heatmap(corrmat, annot=True, vmin=-1, vmax=1, cmap='coolwarm_r', ax=panel, mask=mask)
    panel.set_title(f'Correlation Matrix of {feature} Features', fontsize=14)

plt.tight_layout()
plt.show()

Overall, shifted features all obtain a better correlation with the target.


* Temperature shifted back 2 months obtains higher correlation with the target.
* Drainage Volume shifted forward 1 month obtains the highest correlation with the target.
* River_Hydrometry shifted forward 3 months obtains the highest correlation with the target.
* Rainfall shifted forward 3 months obtains higher correlation with the target.

# 4 Modeling #

## 4.1 Prophet ##

Prophet is an open-source library for time series forecasting developed by Facebook. It is  based on an additive model where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects. It works best with time series that have strong seasonal effects and several seasons of historical data. Prophet is robust to missing data and shifts in the trend, and typically handles outliers well.

In [None]:
# The library was renamed from `fbprophet` to `prophet`; accept whichever is installed.
try:
    from prophet import Prophet
except ImportError:
    from fbprophet import Prophet

# Prepare input data for the model.
# Prophet expects a dataframe with a date column `ds` and a target column `y`;
# the extra regressors keep their own names.

feature_columns = [
    'Rainfall',
    'Temperature',
    'Drainage_Volume',
    'River_Hydrometry',
]
target_column = ['Depth_to_Groundwater']

# Chronological split: the first 85% of the rows are used for training.
train_size = int(0.85 * len(df))

pro_df = df[['Date'] + target_column + feature_columns].copy()
pro_df.columns = ['ds', 'y'] + feature_columns

train = pro_df.iloc[:train_size, :]
train.head()

In [None]:
# Inputs are every column except the target `y` (i.e. `ds` plus the regressors);
# the target is kept as a one-column frame. Both are cut at the same row.
regressor_frame = pro_df.drop(columns='y')
target_frame = pro_df[['y']]

x_train, y_train = regressor_frame.iloc[:train_size], target_frame.iloc[:train_size]
x_test, y_test = regressor_frame.iloc[train_size:], target_frame.iloc[train_size:]

In [None]:
# Build the model and register every feature column as an extra regressor.
model = Prophet()
for regressor_name in feature_columns:
    model.add_regressor(regressor_name)

model.fit(train)

# Forecast the held-out period from its dates and regressor values.
y_pred = model.predict(x_test)

# Score the point forecast (`yhat`) against the held-out target.
predicted = y_pred['yhat']
score_mae = mean_absolute_error(y_test, predicted)
score_rmse = math.sqrt(mean_squared_error(y_test, predicted))

print('MAE: {}'.format(score_mae))
print('RMSE: {}'.format(score_rmse))

In [None]:
# Draw Prophet's own forecast plot, then overlay the held-out truth and the training history.
f, ax = plt.subplots(nrows=1, figsize=(8,4))

model.plot(y_pred, ax=ax)
overlays = [
    (x_test['ds'], y_test['y'], 'orange', 'Ground truth'),
    (x_train['ds'], y_train['y'], 'black', 'Train'),
]
for dates, values, line_color, line_label in overlays:
    sns.lineplot(x=dates, y=values, ax=ax, color=line_color, label=line_label)

ax.set_title(f'Prophet Prediction \n MAE: {score_mae:.2f}, RMSE: {score_rmse:.2f}', fontsize=14)
ax.set_xlabel(xlabel='Date', fontsize=14)
ax.set_ylabel(ylabel='Depth to Groundwater', fontsize=14)

plt.show()

## 4.2 XGBOOST Regressor ##

Use both original and shifted feature data as input.

In [None]:
from xgboost import XGBRegressor

In [None]:
y = df['Depth_to_Groundwater']

# Two alternative feature sets: the shifted columns chosen above and the unshifted originals.
shifted_feature_names = ['Temperature_shift_b_2m','Drainage_Volume_shift_1m','Rainfall_shift_3m','River_Hydrometry_shift_3m']
original_feature_names = ['Temperature','Drainage_Volume','Rainfall','River_Hydrometry']
x_shift = df[shifted_feature_names].copy()
x_org = df[original_feature_names].copy()

# Chronological 85/15 split (no shuffling), applied identically to both feature sets.
train_size = 0.85
test_size = 1- train_size
split_options = dict(train_size=train_size, shuffle=False)
x_org_train, x_org_test, y_org_train, y_org_test = train_test_split(x_org, y, **split_options)
x_shift_train, x_shift_test, y_shift_train, y_shift_test = train_test_split(x_shift, y, **split_options)

In [None]:
# Hyper-parameters for the model trained on the original features.
# (learning_rate is left at the library default; 0.04 was tried as an alternative.)
params = dict(
    n_estimators=1000,
    max_depth=5,
    subsample=1,
    random_state=0,
)

model = XGBRegressor(**params)

In [None]:
# Original data
# The last 15% of the training rows are held back as the early-stopping validation set;
# `valid_size` is the row index where that validation part starts.
valid_size = int(0.85 * len(x_org_train))
x_org_fit, y_org_fit = x_org_train.iloc[:valid_size], y_org_train.iloc[:valid_size]
x_org_valid, y_org_valid = x_org_train.iloc[valid_size:], y_org_train.iloc[valid_size:]

# Training stops once the validation score has not improved for 5 rounds.
model_org = model.fit(
    x_org_fit,
    y_org_fit,
    eval_set=[(x_org_valid, y_org_valid)],
    early_stopping_rounds=5,
    verbose=False,
)

# Predict on the test set
y_org_pred = model.predict(x_org_test)

# Calculate metrics
score_mae_org = mean_absolute_error(y_org_test, y_org_pred)
score_rmse_org = math.sqrt(mean_squared_error(y_org_test, y_org_pred))

print('MAE value: %.2f' % score_mae_org)
print('RMSE value: %.2f' % score_rmse_org)
print('Prediction: %.3f' % y_org_pred[0])

In [None]:
# Shifted data
# A smaller, shallower ensemble with row subsampling for the shifted feature set.
# (learning_rate is left at the library default; 0.04 was tried as an alternative.)
params = dict(
    n_estimators=500,
    max_depth=3,
    subsample=0.9,
    random_state=0,
)

model = XGBRegressor(**params)

In [None]:
# The last 15% of the training rows are held back as the early-stopping validation set;
# `valid_size` is the row index where that validation part starts.
valid_size = int(0.85 * len(x_shift_train))
x_shift_fit, y_shift_fit = x_shift_train.iloc[:valid_size], y_shift_train.iloc[:valid_size]
x_shift_valid, y_shift_valid = x_shift_train.iloc[valid_size:], y_shift_train.iloc[valid_size:]

# Training stops once the validation score has not improved for 5 rounds.
model_shift = model.fit(
    x_shift_fit,
    y_shift_fit,
    eval_set=[(x_shift_valid, y_shift_valid)],
    early_stopping_rounds=5,
    verbose=False,
)

# Predict on the test set
y_shift_pred = model.predict(x_shift_test)

# Calculate metrics
score_mae_shift = mean_absolute_error(y_shift_test, y_shift_pred)
score_rmse_shift = math.sqrt(mean_squared_error(y_shift_test, y_shift_pred))

print('MAE value: %.2f' % score_mae_shift)
print('RMSE value: %.2f' % score_rmse_shift)
print('Prediction: %.3f' % y_shift_pred[0])

In [None]:
# Compare both XGBoost forecasts with the held-out truth and the training history.
f, ax = plt.subplots(1, figsize=(16,8))

# Row index at which the chronological train/test split was made.
split_row = int(train_size*len(df))
train_dates = df['Date'].iloc[:split_row]
test_dates = df['Date'].iloc[split_row:]

forecast_lines = [
    (test_dates, y_org_pred, 'blue', 'Predicted ORG'),
    (test_dates, y_shift_pred, 'green', 'Predicted Shifted'),
    (test_dates, y_org_test, 'orange', 'Ground truth'),
    (train_dates, y_org_train, 'black', 'Train'),
]
for dates, values, line_color, line_label in forecast_lines:
    sns.lineplot(x=dates, y=values, ax=ax, color=line_color, label=line_label)

ax.set_title(f'XGBRegressor Prediction \n ORG MAE: {score_mae_org:.2f}, RMSE: {score_rmse_org:.2f} \n Shift MAE: {score_mae_shift:.2f}, RMSE: {score_rmse_shift:.2f}', fontsize=14)
ax.set_xlabel(xlabel='Date', fontsize=14)
ax.set_ylabel(ylabel='Depth to Groundwater', fontsize=14)

plt.show()

Shifted data doesn't gain a better model performance. 

In [None]:
# Horizontal offset added to each bar's width when placing its value label.
bias = 0.02

# Feature importances of both fitted models, most important first.
imp_org = pd.DataFrame({'importance': model_org.feature_importances_,
                        'features': x_org_train.columns}).sort_values('importance', ascending=False)
imp_shift = pd.DataFrame({'importance': model_shift.feature_importances_,
                          'features': x_shift_train.columns}).sort_values('importance', ascending=False)

fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(10, 4))
ax[0].set_title('Feature importances', size=15, fontweight='bold', fontfamily='serif')

# Top panel: original features; bottom panel: shifted features.
for panel, importances in zip(ax, (imp_org, imp_shift)):
    # Both panels size the palette by the number of original features.
    sns.barplot(x=importances.importance, y=importances.features, edgecolor='black',
                palette=reversed(sns.color_palette("viridis", len(imp_org.features))), ax=panel)

    for i in ['top', 'right']:
        panel.spines[i].set_visible(None)

    # Write each importance value just beyond the end of its bar.
    rects = panel.patches
    labels = importances.importance
    for rect, label in zip(rects, labels):
        x_value = rect.get_width() + bias
        y_value = rect.get_y() + rect.get_height() / 2
        panel.text(x_value, y_value, round(label, 3), fontsize=9, color='black',
                   ha='center', va='center')
    panel.set_ylabel('Features', fontweight='bold', fontfamily='serif')

ax[1].set_xlabel('Importance', fontweight='bold', fontfamily='serif')

plt.tight_layout()
plt.show()

Drainage Volume is the most important feature for predicting the target.

## 4.3 LSTM ##

LSTM (long short-term memory) is a recurrent neural network that is powerful at learning long-term dependencies in time series data.

First to prepare the data for the model.
* Create the dataset, ensure all data is float.
* Normalize the features.
* Split into training and test sets.
* Convert an array of values into a dataset matrix.
* Reshape into Xt=[x1,x2,x3,x4,x5]t and Y=t+1.Multivariate prediction. Single output.
* Reshape input to be 3D (num_samples, num_timesteps, num_features)

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler, StandardScaler

In [None]:
# Columns fed to the LSTM. Depth_to_Groundwater is deliberately last, because the
# windowing step takes its target from the last column.
lstm_columns = ['Temperature','Drainage_Volume','Rainfall','River_Hydrometry','Depth_to_Groundwater']
data = df.filter(items=lstm_columns)
print(data.columns)

# Plain float32 NumPy matrix (rows = time steps, columns = lstm_columns).
dataset = data.values.astype('float32')

In [None]:
# Length of the rolling input window (number of past rows per sample).
look_back = 30

# Row positions at which the validation and test targets start.
n_rows = len(dataset)
train_size = int(0.7 * n_rows)
val_size = int(0.85 * n_rows)

# Chronological, non-overlapping partition of the rows. Each boundary is moved
# back by `look_back` so the first window of the next split ends right before
# train_size / val_size.
train_end = train_size - look_back
val_end = val_size - look_back
train = dataset[:train_end, :]
val = dataset[train_end:val_end, :]
test = dataset[val_end:, :]

def create_dataset(dataset, look_back=1):
    X, Y = [], []
    
    for i in range(look_back, len(dataset)):
        a = dataset[i-look_back:i, :]
        X.append(a)
        Y.append(dataset[i, -1])
    return np.array(X), np.array(Y)

# Build (window, next-value) samples for each split.
x_train, y_train = create_dataset(train, look_back)
x_val, y_val = create_dataset(val, look_back)
x_test, y_test = create_dataset(test, look_back)
print(x_train.shape,y_train.shape,x_val.shape,y_val.shape,x_test.shape,y_test.shape)

scaler = MinMaxScaler(feature_range=(-1, 0))

# MinMaxScaler works on 2-D input, so every window is flattened to one row of
# look_back * n_features columns, scaled, and then restored to its 3-D shape.
# The sizes are derived from the arrays instead of being hard-coded, so the cell
# keeps working when the data length, look_back or the feature list changes.
n_features = x_train.shape[2]
flat_width = look_back * n_features

# Fit on the training windows only and reuse that mapping for validation and test.
# Re-fitting per split would give each split a different scale and leak
# validation/test statistics. Values outside the training range may therefore
# fall outside [-1, 0].
x_train_scaled = scaler.fit_transform(x_train.reshape(-1, flat_width))
x_val_scaled = scaler.transform(x_val.reshape(-1, flat_width))
x_test_scaled = scaler.transform(x_test.reshape(-1, flat_width))

# input x has 3D structure [samples, time steps, features]
x_train_scaled = x_train_scaled.reshape(x_train.shape)
x_val_scaled = x_val_scaled.reshape(x_val.shape)
x_test_scaled = x_test_scaled.reshape(x_test.shape)

print(x_train_scaled.shape,y_train.shape,x_val_scaled.shape,y_val.shape,x_test_scaled.shape,y_test.shape)

In [None]:
# `scaler` is re-fitted here on the *training* targets and stays in that state:
# the later `scaler.inverse_transform(test_predict)` call relies on it to map
# predictions back to metres. Validation and test targets are only transformed,
# so all three splits share the scale the network is trained in and no
# validation/test statistics leak into the mapping.
y_train_scaled = scaler.fit_transform(y_train.reshape(-1, 1)).flatten()
y_val_scaled = scaler.transform(y_val.reshape(-1, 1)).flatten()
y_test_scaled = scaler.transform(y_test.reshape(-1, 1)).flatten()

In [None]:
import tensorflow as tf
# NOTE(review): TF1-style graph behaviour is switched on before any Keras object is
# created; presumably this is needed by shap.DeepExplainer further down -- confirm.
tf.compat.v1.disable_v2_behavior()
# Import every Keras symbol from `tensorflow.keras`: mixing it with the standalone
# `keras` package can pair a tf.keras model with layers from a different Keras
# implementation.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras import callbacks

# Build the LSTM model: two stacked LSTM layers followed by a small dense head
# with a single regression output.
model = Sequential()
# input_shape = (time steps, features)
model.add(LSTM(128, return_sequences=True, input_shape=(x_train.shape[1], x_train.shape[2])))
model.add(LSTM(64, return_sequences=False))
model.add(Dense(25))
model.add(Dense(1))

# Compile the model
# early_stopping = callbacks.EarlyStopping(monitor='val_loss',patience=2,mode='min')
model.compile(optimizer='adam', loss='mean_squared_error')


# Train the model. shuffle=False keeps the samples in chronological order.
history=model.fit(x_train_scaled, y_train_scaled, batch_size=8, epochs=30, validation_data=(x_val_scaled, y_val_scaled), verbose=2, shuffle=False)
#                  callbacks=[early_stopping])

model.summary()

In [None]:
# Plot the training and validation loss (MSE) per epoch.
train_loss = history.history['loss']
val_loss = history.history['val_loss']

f, ax = plt.subplots(1, figsize = (8, 6))
ax.plot(train_loss, label='train')
ax.plot(val_loss, label='valid')
# Derive the x-range from the number of recorded epochs instead of hard-coding 29,
# so the plot stays correct if `epochs` changes or early stopping is enabled.
ax.set_xlim([0, max(len(train_loss) - 1, 1)])
ax.set_ylabel('MSE')
ax.set_xlabel('Epochs')
ax.legend()
plt.show()
 

In [None]:
# Predict with the model on the scaled test windows.
# The network was fitted on y_train_scaled, so `test_predict` is in the scaled
# target space; it is inverse-transformed in the next cell.
print(x_test_scaled.shape)
test_predict = model.predict(x_test_scaled)

In [None]:
# The network was trained on targets scaled with the min/max of y_train, so its
# outputs must be mapped back with a scaler fitted on y_train using the same
# feature_range. A dedicated scaler is fitted here so the inverse mapping never
# depends on which array the shared `scaler` object was last fitted on (using
# test-target statistics here would leak them into the forecast).
target_scaler = MinMaxScaler(feature_range=(-1, 0)).fit(y_train.reshape(-1, 1))
inv_test_predict = target_scaler.inverse_transform(test_predict)[:,0]
inv_test_predict.shape

In [None]:
# Score the de-normalised forecast against the unscaled test targets.
test_mse = mean_squared_error(y_test, inv_test_predict)
rmse = np.sqrt(test_mse)
mae = mean_absolute_error(y_test, inv_test_predict)

print('Test RMSE: %.3f' % rmse)
print('Test MAE: %.3f' % mae)

In [None]:
# Dates and observed depth for each split, taken straight from `df`.
# These series get their own names so the NumPy arrays y_train / y_val / y_test
# used for scaling, training and scoring are not overwritten; re-running those
# earlier cells after this one would otherwise fail or use different data.
x_train_ticks = df['Date'].iloc[:train_size]
depth_train = df['Depth_to_Groundwater'].iloc[:train_size]
x_val_ticks = df['Date'].iloc[train_size:val_size]
depth_val = df['Depth_to_Groundwater'].iloc[train_size:val_size]
x_test_ticks = df['Date'].iloc[val_size:]
depth_test = df['Depth_to_Groundwater'].iloc[val_size:]


# Plot the forecast
f, ax = plt.subplots(1,figsize=(15,6))

sns.lineplot(x=x_train_ticks, y=depth_train, ax=ax, label='train')
sns.lineplot(x=x_val_ticks, y=depth_val, ax=ax, label='validate')
sns.lineplot(x=x_test_ticks, y=depth_test, ax=ax, color='orange', label='Ground truth')
# One prediction per test date: inv_test_predict must have as many entries as x_test_ticks.
sns.lineplot(x=x_test_ticks, y=inv_test_predict, ax=ax, color='green', label='Prediction')

ax.set_title(f'LSTM Prediction \n MAE: {mae:.2f}, RMSE: {rmse:.2f}', fontsize=14)
ax.set_xlabel(xlabel='Date', fontsize=14)
ax.set_ylabel(ylabel='Depth to Groundwater', fontsize=14)

plt.show()

I want to check the contribution of each feature using SHAP.

In [None]:
import shap
# Print the installed SHAP version so the run records which release produced the
# explanations below.
print(shap.__version__)

In [None]:
# Build a DeepExplainer for the trained LSTM; the scaled training windows are
# passed as its data argument (the background sample in SHAP terms).
explainer = shap.DeepExplainer(model, x_train_scaled)
# SHAP values for the scaled test windows.
shap_values = explainer.shap_values(x_test_scaled)

In [None]:
# init the JS visualization code
shap.initjs()

# Explain only the exogenous drivers; the lagged target (last input column) is left out.
exogenous_names = ['Temperature','Drainage_Volume','Rainfall','River_Hydrometry']

# NOTE(review): assumes shap_values[0] has shape (samples, time steps, features),
# i.e. a list with one array per model output -- confirm for the installed shap version.
# Slice the *feature* axis (chained `[:][:][:-1]` only trims the sample axis), then
# sum over the time steps to get one contribution per feature and test sample.
exogenous_shap = shap_values[0][:, :, :len(exogenous_names)].sum(axis=1)

shap.force_plot(explainer.expected_value[0], exogenous_shap, feature_names=exogenous_names)

River hydrometry and drainage volume have the largest influence on depth to groundwater.

# 5 Conclusion #
* I used three ML models to conduct multivariate prediction.
* The LSTM model performs best, ahead of Prophet and XGBoostRegressor, yielding an MAE of 0.23 and an RMSE of 0.27. River Hydrometry and Drainage Volume contribute the most to Depth to Groundwater.