In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Image Processing with Neural Network

## Session 19b: Simple RNN
- Weather Data 
- Multiple features

<img src='../../../images/prasami_color_tutorials_small.png' width='400' alt="By Pramod Sharma : pramod.sharma@prasami.com" align="left"/>

In [None]:
###-----------------
### Import Libraries
###-----------------

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime, timedelta

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf

from utils.helper import fn_plot_tf_hist

In [None]:
###-----------------------------
### Notebook-wide configuration
###-----------------------------

# --- Locations ---
inpDir = '../../../input'  # folder holding the input data
outDir = '../output'       # folder where outputs are written

# --- Reproducibility ---
# REMEMBER: remove the fixed seed at the time of promotion to production
RANDOM_STATE = 24
np.random.seed(RANDOM_STATE)      # seed NumPy
tf.random.set_seed(RANDOM_STATE)  # seed TensorFlow as well

# --- Training hyper-parameters ---
EPOCHS = 200      # number of cycles to run
ALPHA = 0.001     # learning rate
TEST_SIZE = 0.2   # fraction of samples kept for testing
BATCH_SIZE = 32
PATIENCE = 20     # early-stopping patience
LR_PATIENCE = 10  # patience before the learning rate is reduced
LR_FACTOR = 0.1   # multiplier applied when the learning rate is reduced

# --- Plot decoration ---
CMAP = plt.cm.coolwarm

params = {
    'legend.fontsize' : 'large',
    'figure.figsize'  : (12,9),
    'axes.labelsize'  : 'x-large',
    'axes.titlesize'  : 'x-large',
    'xtick.labelsize' : 'large',
    'ytick.labelsize' : 'large',
}
plt.rcParams.update(params)  # apply the decoration to every figure

## Helper Function

In [None]:
### Settings so that TensorFlow does not hog all the GPU memory
# Memory growth has to be configured identically on every visible GPU;
# enabling it on the first device only makes TensorFlow fail on multi-GPU hosts.
physical_devices = tf.config.list_physical_devices('GPU')

for gpu in physical_devices:  # no-op when no GPU is present
    tf.config.experimental.set_memory_growth(gpu, True)

## Load Weather Data
Source: [Kaggle](https://www.kaggle.com/muthuj7/weather-dataset)

In [None]:
# Read the weather CSV from the input folder and preview the first rows
dataFilename = 'weatherHistory.csv'
csv_path = os.path.join(inpDir, dataFilename)
data_df = pd.read_csv(csv_path)
data_df.head()

In [None]:
# (rows, columns) of the raw frame
data_df.shape

In [None]:
# Column dtypes and non-null counts
data_df.info()

In [None]:
# Summary statistics of the numeric columns
data_df.describe()

In [None]:
# List the distinct values found in every column
for col_name, col_values in data_df.items():
    print(f'{col_name} : {col_values.unique()}')

### Notes:
- Precip Type has nan

In [None]:
# Rows where the precipitation type is missing
data_df.loc[data_df['Precip Type'].isna()]

In [None]:
# Treat a missing precipitation type as its own category, then confirm nothing is left null
precip_filled = data_df['Precip Type'].fillna('No')
data_df['Precip Type'] = precip_filled
data_df[precip_filled.isnull()]

In [None]:
cat_cols = ['Summary', 'Precip Type']

# One figure per categorical column: horizontal bars, one per category
for col in cat_cols:

    fig, ax = plt.subplots()

    sns.histplot(data_df, y = col, ax = ax)

    ax.set_title(col)
    ax.set_xlabel('Frequency')

In [None]:
num_cols = ['Temperature (C)', 'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)',
            'Wind Bearing (degrees)', 'Visibility (km)', 'Pressure (millibars)']

# Distribution of every numeric column on a shared 2x4 grid
fig, axes = plt.subplots(2, 4)
axes = axes.ravel()

for ax, col in zip(axes, num_cols):
    sns.histplot(data_df, x = col, ax = ax, bins = 50)

# The grid has more panels than there are columns; hide the empty leftovers
for ax in axes[len(num_cols):]:
    ax.set_visible(False)

plt.tight_layout()

In [None]:
# Precipitation type of the rows whose pressure reading is below 500 millibars
data_df.loc[data_df['Pressure (millibars)'] < 500, 'Precip Type']

In [None]:
# Mean pressure per precipitation class.
# Readings of exactly 0 millibars are replaced with NaN further down (i.e. they are
# treated as missing), so they are left out here to keep the class means unbiased.
classmeans = (
    data_df.loc[data_df['Pressure (millibars)'] != 0]
    .pivot_table('Pressure (millibars)', columns='Precip Type', aggfunc='mean')
)
classmeans

In [None]:
# Rows whose pressure reading is exactly zero
data_df[data_df['Pressure (millibars)'].eq(0)]

In [None]:
# Mark zero pressure readings as missing.
# Assign the result back instead of calling replace(..., inplace=True) on the column
# selection: that form is chained assignment, which warns on recent pandas and does
# not modify data_df at all once Copy-on-Write is enabled.
data_df['Pressure (millibars)'] = data_df['Pressure (millibars)'].replace(0, np.nan)

# Should now display an empty frame
data_df.loc[data_df['Pressure (millibars)'] == 0]

In [None]:
# Fill every missing pressure with the mean pressure of the row's own precipitation
# class. The group mean skips NaN, so it is computed from valid readings only.
# This is vectorised, replacing a row-wise apply() that always used the 'rain' mean.
pressure_by_class = data_df.groupby('Precip Type')['Pressure (millibars)'].transform('mean')
data_df['Pressure (millibars)'] = data_df['Pressure (millibars)'].fillna(pressure_by_class)

In [None]:
# Any pressure values still missing after imputation?
data_df.loc[data_df['Pressure (millibars)'].isna()]

In [None]:
# Average temperature for each precipitation class
data_df.groupby('Precip Type', as_index = False)[['Temperature (C)']].mean()

In [None]:
num_cols = ['Temperature (C)', 'Apparent Temperature (C)',
            'Humidity',
            'Wind Speed (km/h)',
            'Wind Bearing (degrees)',
            'Visibility (km)',
            'Pressure (millibars)']

# Box plot of every numeric column on a shared 2x4 grid (outlier inspection)
fig, axes = plt.subplots(2, 4)
axes = axes.ravel()

for ax, col in zip(axes, num_cols):
    sns.boxplot(y=col, data=data_df, ax=ax)

# The grid has more panels than there are columns; hide the empty leftovers
for ax in axes[len(num_cols):]:
    ax.set_visible(False)

plt.tight_layout()
# Show the plot
plt.show()

In [None]:
# Pairwise correlation between the numeric columns
data_df[num_cols].corr()

In [None]:
# Same correlation matrix, drawn as an annotated heat map
corr_matrix = data_df[num_cols].corr()
sns.heatmap(corr_matrix,
            annot=True,
            cmap=plt.cm.Blues,
            linewidths = .2)
plt.show()

### Notes
- We can drop the Summary, Loud Cover, and Daily Summary columns
- Pressure has some 0 values

In [None]:
# Column names currently present in the frame
data_df.columns

In [None]:
# Parse the textual timestamp into timezone-aware (UTC) datetimes
parsed_dates = pd.to_datetime(data_df['Formatted Date'], utc=True)
data_df['datetime'] = parsed_dates

In [None]:
# Columns that are not used by the model
drop_columns = [
    'Daily Summary',
    'Summary',
    'Loud Cover',
    'Wind Speed (km/h)',
    'Wind Bearing (degrees)',
    'Formatted Date',
]
data_df = data_df.drop(columns=drop_columns)
data_df.head()

In [None]:
# Turn the precipitation labels into integer codes; classes_ shows the label order
le = LabelEncoder()
precip_codes = le.fit_transform(data_df['Precip Type'])
data_df['Precip Type'] = precip_codes
le.classes_

In [None]:
# Work on a chronologically ordered copy with a fresh 0..n-1 index
tmp_df = (
    data_df
    .sort_values('datetime', axis=0, ascending=True)
    .reset_index(drop=True)
)
tmp_df.head()

In [None]:
# Short column aliases used in the rest of the notebook
short_names = {
    'Temperature (C)': 'temp',
    'Apparent Temperature (C)': 'app_t',
    'Humidity': 'hum',
    'Pressure (millibars)': 'pres',
    'Precip Type': 'precip',
    'Visibility (km)': 'vis',
}
tmp_df = tmp_df.rename(columns=short_names)
tmp_df.head()

In [None]:
# Un-comment the following lines to work on a date range instead of the full dataset

#startDate = pd.to_datetime('2007-1-1', utc=True)
#endDate = pd.to_datetime('2008-1-1', utc=True)
#tmp_df = tmp_df[(tmp_df['datetime']  >= startDate) & (tmp_df['datetime']  < endDate)]

## Plotting samples

In [None]:
# One scatter panel per variable against time, laid out on a 2x3 grid
fig = plt.figure(figsize = (15,10))

panel_columns = ['temp', 'app_t', 'hum', 'vis', 'pres', 'precip']

for panel_no, column in enumerate(panel_columns, start=1):
    ax = fig.add_subplot(2, 3, panel_no)
    tmp_df.plot(x='datetime', y=column, style=".", ax = ax)

plt.tight_layout()

In [None]:
# Number of consecutive rows grouped into one window when the samples are built
time_step = 24

In [None]:
# Targets: the temperature at every time_step-th row, starting at row `time_step`
y_idx = np.arange(time_step, len(tmp_df), time_step)
y_df = tmp_df.loc[:, ['temp', 'datetime']].iloc[y_idx]
y_df.head()

In [None]:
## Keep only as many rows as fill whole windows (one window per target)

rows_kept = len(y_df) * time_step
tmp_df = tmp_df.iloc[:rows_kept]
tmp_df

In [None]:
# Shapes after trimming; the row count should equal len(y_df) * time_step
tmp_df['temp'].shape, tmp_df.shape

In [None]:
# Number of targets (one per window)
y_df.shape

In [None]:
# Fold every column into a (samples, time_step) matrix: one row per window
window_shape = (y_df.shape[0], time_step)

X_temp = tmp_df['temp'].to_numpy().reshape(window_shape)
X_at = tmp_df['app_t'].to_numpy().reshape(window_shape)
X_hum = tmp_df['hum'].to_numpy().reshape(window_shape)
X_precip = tmp_df['precip'].to_numpy().reshape(window_shape)
X_vis = tmp_df['vis'].to_numpy().reshape(window_shape)
X_pres = tmp_df['pres'].to_numpy().reshape(window_shape)
X_temp.shape

In [None]:
# Keep the first (time_step - 1) readings of every window.
# The width is derived from time_step so it stays in sync with the model input shape.
# NOTE(review): the target for window i is the row at (i + 1) * time_step, so dropping
# the last reading leaves a one-step gap between inputs and target - confirm intended.
n_steps = time_step - 1

X_temp = X_temp[:, :n_steps]
X_at = X_at[:, :n_steps]
X_hum = X_hum[:, :n_steps]
X_precip = X_precip[:, :n_steps]
X_vis = X_vis[:, :n_steps]
X_pres = X_pres[:, :n_steps]
X_temp.shape

## Note

Need to reshape the inputs into the 3D format as expected by the SimpleRNNs, namely [samples, timesteps, features].

In [None]:
# Choose the features fed to the network: (un)comment entries to experiment
selected_features = [
    X_temp,
    X_at,
    X_hum,
    #X_precip,
    #X_vis,
    #X_pres
]

# Stack along a new last axis -> [samples, timesteps, features]
X_data = np.stack(selected_features, axis = 2)
# Single-feature alternative:
#X_data = np.reshape(X_temp, (X_temp.shape[0], X_temp.shape[1], 1))
X_data.shape

In [None]:
# Chronological split: the earliest (1 - TEST_SIZE) share trains, the rest tests
split = int(y_df.shape[0] * (1-TEST_SIZE))

X_train, X_test = X_data[:split], X_data[split:]

targets = y_df['temp'].values
y_train, y_test = targets[:split], targets[split:]

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
h_units = 64 # number of units in the RNN

# (timesteps, features) read straight from the training data, so the model follows
# whichever features were stacked into X_data
input_shape = X_train.shape[1:]

model = tf.keras.models.Sequential()

model.add(tf.keras.layers.Input(shape=input_shape))

# Single recurrent layer; only its final hidden state is passed on
model.add(tf.keras.layers.SimpleRNN(units = h_units,
                                    activation = 'tanh'))

# Regression head: one linear output for the predicted temperature
model.add(tf.keras.layers.Dense(1, activation = 'linear'))

# Use the ALPHA learning rate from the configuration cell instead of the
# optimizer's implicit default
model.compile(loss='mean_squared_error',
              optimizer=tf.keras.optimizers.Adam(learning_rate=ALPHA),
              metrics=[tf.keras.metrics.RootMeanSquaredError()])

In [None]:
# Layer-by-layer overview with parameter counts
model.summary()

In [None]:
# Stop once val_loss has not improved for PATIENCE epochs and roll back to the best weights
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor = 'val_loss',
    patience = PATIENCE,
    mode = 'auto',
    baseline = None,
    restore_best_weights = True,
)

# Multiply the learning rate by LR_FACTOR when val_loss stalls for LR_PATIENCE epochs
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor = 'val_loss',
    factor = LR_FACTOR,
    patience = LR_PATIENCE,
    verbose = 0,
)

In [None]:
# Train the network.
# NOTE(review): the hold-out set (X_test, y_test) doubles as the validation set that
# drives early stopping and learning-rate reduction.
training_callbacks = [early_stopping_callback, reduce_lr]

history = model.fit(
    X_train, y_train,
    validation_data = (X_test, y_test),
    epochs = EPOCHS,
    batch_size = BATCH_SIZE,
    callbacks = training_callbacks,
    verbose = 2,
)

In [None]:
# Training history as a frame, with short metric names, handed to the plotting helper
metric_aliases = {
    'root_mean_squared_error': 'rmse',
    'val_root_mean_squared_error' : 'val_rmse',
}
hist_df = pd.DataFrame(history.history).rename(columns=metric_aliases)

fn_plot_tf_hist(hist_df)
#.8172  (figure left by the original author; its meaning is not recorded here)

In [None]:
# Predict on both partitions and join them into one flat vector (train first, then test)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
y_pred = np.concatenate((y_train_pred.ravel(), y_test_pred.ravel()))

In [None]:
# Reminder of the target frame layout before predictions are attached
y_df.head()

In [None]:
# Targets next to predictions, with the timestamp reduced to a calendar date
res_df = y_df.assign(
    pred = y_pred,
    datetime = y_df['datetime'].dt.date,
)
res_df.head()

In [None]:
# Last rows of the results frame
res_df.tail()

In [None]:
# Actual vs. predicted temperature over time
fig, ax = plt.subplots(figsize = (15,6))

res_df.plot(x='datetime', y=['temp','pred'], ax = ax)

# Vertical marker at the first test sample, spanning the observed temperature range
split_date = res_df['datetime'].iloc[X_train.shape[0]]
temp_low = res_df['temp'].min()
temp_high = res_df['temp'].max()

ax.vlines(split_date, temp_low, temp_high,
          color = 'k', linewidth=3.0, zorder=10, alpha =0.8)

ax.grid()