# Data Ingest

In [None]:
import pandas as pd

# Path of the input CSV inside the Kaggle environment.
FILE_LOCATION = '/kaggle/input/daily-website-visitors/daily-website-visitors.csv'

# Read the CSV with 'Date' as the index; thousands=',' makes pandas treat ','
# as a digit-group separator when parsing numbers.
whole_dataset = pd.read_csv(FILE_LOCATION, index_col='Date', thousands=',')
# Convert the index to datetimes.
whole_dataset.index = pd.to_datetime(whole_dataset.index)
whole_dataset

In [None]:
whole_dataset.info()

In [None]:
whole_dataset.describe()

In [None]:
import matplotlib.pyplot as plt

# One stacked panel per visit-count column, top to bottom in this order.
fig, axs = plt.subplots(3, figsize=(12, 5))

visit_columns = ('First.Time.Visits', 'Unique.Visits', 'Returning.Visits')
for panel, column_name in zip(axs, visit_columns):
    panel.plot(whole_dataset[column_name])
plt.show()

# Preprocessing the data

* Target Attribute: **Returning.Visits** 
We shall predict the **Returning.Visits** given past data.


In [None]:
# Forecasting target: the 'Returning.Visits' column of the loaded dataset.
target_column = whole_dataset['Returning.Visits']
target_column

In [None]:
# Line plot of the target series in a wide, short figure.
target_column.plot(figsize=(15, 3))
plt.show()

# Compute Train and Test Data Boundaries

In [None]:
len(target_column)

In [None]:
# Fraction of the series, taken from its end, that is held out for testing.
TEST_DATA_PERCENTAGE = 0.1

# Position of the first test sample: train is [:BOUNDARY], test is [BOUNDARY:].
TEST_DATA_BOUNDARY_INDEX = int((1 - TEST_DATA_PERCENTAGE) * len(target_column))
# A half-open slice [:k] contains exactly k rows, so the train size is the
# boundary itself (not boundary + 1).
print(f"Train data:\tReturning Visits [:{TEST_DATA_BOUNDARY_INDEX}] ({TEST_DATA_BOUNDARY_INDEX})")
print(f"Test data:\tReturning Visits [{TEST_DATA_BOUNDARY_INDEX}:] ({len(target_column) - TEST_DATA_BOUNDARY_INDEX})")
# The last training value sits one position before the boundary. Use .iloc
# because the Series is indexed by date, not by integer position.
print(f"\nLast target on train data: {target_column.iloc[TEST_DATA_BOUNDARY_INDEX - 1]}")

In [None]:
# Ten values on each side of the train/test boundary, selected by position.
train_tail = target_column.iloc[TEST_DATA_BOUNDARY_INDEX - 10:TEST_DATA_BOUNDARY_INDEX]
test_head = target_column.iloc[TEST_DATA_BOUNDARY_INDEX:TEST_DATA_BOUNDARY_INDEX + 10]
print(f"Train dataset ending values: {train_tail.values}")
print(f"Test dataset starting values: {test_head.values}")

## Window-ize the dataset

In [None]:
from tensorflow.keras.utils import timeseries_dataset_from_array

# Number of past observations fed to a model for each prediction.
WINDOW_SIZE = 3
# Inputs are the series without its last WINDOW_SIZE points; targets are the
# series shifted forward by WINDOW_SIZE. Assuming Keras pairs the window that
# starts at position i with targets[i], each window of WINDOW_SIZE consecutive
# values is matched with the value that immediately follows it.
# NOTE(review): end_index is presumably chosen so that the final training
# target is the value at TEST_DATA_BOUNDARY_INDEX - 1 — confirm against the
# timeseries_dataset_from_array documentation.
train_dataset = timeseries_dataset_from_array(target_column[:-WINDOW_SIZE], 
                                                 target_column[WINDOW_SIZE:], 
                                                 sequence_length=WINDOW_SIZE,
                                                 end_index=TEST_DATA_BOUNDARY_INDEX - 1)
# (number of batches, number of individual windows once unbatched)
len(train_dataset), len(list(train_dataset.unbatch()))

In [None]:
# Raw values around the boundary, next to the last (window, target) pair of the
# last training batch, to check by eye where the training windows stop.
target_column[TEST_DATA_BOUNDARY_INDEX-10:TEST_DATA_BOUNDARY_INDEX+10].values, (list(train_dataset)[-1][0][-1].numpy(), list(train_dataset)[-1][1][-1].numpy())

In [None]:
# The input series starts WINDOW_SIZE points before the boundary while the
# targets start at the boundary, so (assuming window i is paired with
# targets[i]) the first test window is made of the last WINDOW_SIZE training
# values and predicts the value at TEST_DATA_BOUNDARY_INDEX.
test_dataset = timeseries_dataset_from_array(target_column[TEST_DATA_BOUNDARY_INDEX - WINDOW_SIZE:], 
                                                 target_column[TEST_DATA_BOUNDARY_INDEX:], 
                                                 sequence_length=WINDOW_SIZE
                                            )
# (number of batches, number of individual windows once unbatched)
len(test_dataset), len(list(test_dataset.unbatch()))

In [None]:
# Raw values around the boundary, next to the first (window, target) pair of
# the first test batch.
target_column[TEST_DATA_BOUNDARY_INDEX-10:TEST_DATA_BOUNDARY_INDEX+10].values, list(test_dataset)[0][0][0].numpy(), list(test_dataset)[0][1][0].numpy()

In [None]:
# First point in test dataset: first (window, target) pair of the first batch.
first_batch_windows, first_batch_targets = list(test_dataset)[0]
first_batch_windows[0].numpy(), first_batch_targets[0].numpy()

In [None]:
# Last point in test dataset: last (window, target) pair of the last batch.
last_batch_windows, last_batch_targets = list(test_dataset)[-1]
last_batch_windows[-1].numpy(), last_batch_targets[-1].numpy()

## Plot the train and test datasets

In [None]:
import numpy as np 
import matplotlib.dates as mdates

def plot_time_series(predictions=None, start_index=1500):
    """Plot the tail of the training series, the test series and optional predictions.

    Reads the module-level ``target_column`` and ``TEST_DATA_BOUNDARY_INDEX``.

    Args:
        predictions: Optional sequence of predicted values, one per test
            timestep. Drawn in red as a thin line plus small markers.
        start_index: Position in ``target_column`` at which the blue training
            curve starts.
    """
    dates = pd.to_datetime(target_column.index)
    boundary = TEST_DATA_BOUNDARY_INDEX

    _, axis = plt.subplots(1, figsize=(15, 5))
    # Major ticks in January and July, minor ticks every month.
    axis.xaxis.set_major_locator(mdates.MonthLocator(bymonth=(1, 7)))
    axis.xaxis.set_minor_locator(mdates.MonthLocator())
    axis.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%b'))

    # Training portion, from start_index up to the boundary.
    axis.plot(dates[start_index:boundary], target_column[start_index:boundary],
              color='blue')
    # Held-out test portion.
    axis.plot(dates[boundary:], target_column[boundary:],
              color='green', linewidth=0.4)

    if predictions is None:
        return

    test_dates = dates[boundary:]
    axis.plot(test_dates, predictions, linewidth=0.4, color='red')
    axis.scatter(test_dates, predictions, s=0.4, color='red')


plot_time_series()

# Model 0: Baseline model

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Layer
from tensorflow.keras import Model

class NaiveForecastLayer(Model):
    """Naive baseline: forecast the next value as the last value of the window.

    Holds no weights of its own; ``call`` only re-shapes its input.
    """

    def __init__(self):
        super().__init__()

    def call(self, inputs):
        # Take the last entry along axis 1, then restore that axis with size 1.
        most_recent = inputs[:, -1]
        return tf.expand_dims(most_recent, axis=1)


In [None]:
# Instantiate the naive baseline.
baseline_model = NaiveForecastLayer()
# The constructor takes no `name` argument, so the private `_name` attribute is
# overwritten to label the model 'model_0'.
# NOTE(review): relies on a Keras internal — confirm `baseline_model.name`
# actually reports 'model_0' on the installed Keras version.
baseline_model._name = 'model_0'

# Compiled with a MAE metric only; no loss or optimizer is supplied.
baseline_model.compile(metrics=[tf.keras.metrics.MeanAbsoluteError()])

In [None]:
baseline_predictions = baseline_model.predict(test_dataset)

In [None]:
plot_time_series(baseline_predictions.ravel(), start_index=1900)

In [None]:
# Ground truth for evaluation: every value from the test boundary onwards,
# selected by position.
y_true = target_column.iloc[TEST_DATA_BOUNDARY_INDEX:]

len(y_true), y_true

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

def evaluate_predictions(y_true, y_preds):
    """Score predictions against the ground truth with four regression metrics.

    Args:
        y_true: Observed values.
        y_preds: Predicted values, in the same order as ``y_true``.

    Returns:
        dict with keys ``'mae'``, ``'mse'``, ``'rmse'`` and ``'mape'``.
    """
    # RMSE is derived from the MSE, so compute the MSE once up front.
    squared_error = mean_squared_error(y_true, y_preds)
    return {
        'mae': mean_absolute_error(y_true, y_preds),
        'mse': squared_error,
        "rmse": np.sqrt(squared_error),
        "mape": mean_absolute_percentage_error(y_true, y_preds),
    }


evaluate_predictions(y_true, baseline_predictions)

In [None]:
# Scoreboard: one row per evaluated model, one column per metric.
MODEL_METRICS = pd.DataFrame(columns=['mae', 'mse', 'rmse', 'mape'])


def evaluate_model(model):
    """Predict on ``test_dataset``, record the metrics and plot the forecast.

    Args:
        model: Object exposing ``predict`` and ``name``. The name becomes the
            row label in ``MODEL_METRICS``.

    Returns:
        The metrics dict produced by ``evaluate_predictions``.
    """
    test_predictions = model.predict(test_dataset, verbose=0)
    scores = evaluate_predictions(y_true, test_predictions)

    # Add (or overwrite) this model's row in the scoreboard.
    MODEL_METRICS.loc[model.name] = scores
    plot_time_series(test_predictions.ravel(), start_index=1900)
    return scores


In [None]:
evaluate_model(baseline_model)

In [None]:
MODEL_METRICS

# Model 1: Recurrent Network Model (GRU)

In [None]:
from tensorflow.keras.layers import GRU, Dense, Input, Lambda
from tensorflow.keras import Sequential

tf.random.set_seed(42)
model_1 = Sequential([
    Input(shape=(WINDOW_SIZE,)),
    # Recurrent layers expect (batch, timesteps, features). Put the new axis
    # last so the GRU steps through WINDOW_SIZE timesteps of one feature each.
    # With axis=1 it would see a single timestep carrying WINDOW_SIZE features
    # and no recurrence would take place.
    Lambda(lambda x: tf.expand_dims(x, axis=-1)),
    GRU(128, activation="relu"),
    # Single linear output: the forecast for the next value.
    Dense(1)
], name='model_1')

model_1.compile(
    loss=tf.keras.losses.MeanAbsoluteError(),
    optimizer=tf.keras.optimizers.Adam()
)

model_1.summary()

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint
import os

def create_checkpoint_callback(model):
    """Create a callback that saves the best weights (lowest training loss) of ``model``.

    Args:
        model: Keras model whose ``name`` is used to build the checkpoint path.

    Returns:
        A ``ModelCheckpoint`` writing weights only to
        ``models/<model.name>.weights.h5``.
    """
    # Keras 3 requires weights-only checkpoint paths to end in '.weights.h5'.
    # Older Keras versions accept that suffix as well and write HDF5 weights.
    filepath = os.path.join('models', f'{model.name}.weights.h5')
    return ModelCheckpoint(filepath, monitor='loss', save_weights_only=True, save_best_only=True)

model_1.fit(train_dataset, epochs=5, callbacks=[ create_checkpoint_callback(model_1) ])

In [None]:
evaluate_model(model_1)

In [None]:
MODEL_METRICS

# Model 3: Multi-input Model

In [None]:
# Training rows are positions [0, TEST_DATA_BOUNDARY_INDEX). The row at the
# boundary is the first test sample, so it is excluded here to keep test data
# out of the training frame.
unbatched_train_dataset = whole_dataset.iloc[:TEST_DATA_BOUNDARY_INDEX].copy()
unbatched_train_dataset

Data analysis: is there any significant difference per day, per month, or per year?

## Per `Day of Week` grouping

In [None]:
# Group the training rows by the 'Day' column and show the mean of
# 'Returning.Visits' within each group.
dataset_by_day = unbatched_train_dataset.groupby(by=['Day'])
dataset_by_day['Returning.Visits'].mean()

In [None]:
# Calendar order, used to arrange the bars from Monday to Sunday.
DAYS_OF_WEEK = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
mean_returning_by_day = dataset_by_day['Returning.Visits'].mean().to_frame()
mean_returning_by_day.loc[DAYS_OF_WEEK].plot(kind='bar')

In [None]:
# Semi-transparent histograms of 'Returning.Visits', one per 'Day' group.
dataset_by_day['Returning.Visits'].hist(legend=True, alpha=0.5)
plt.show()


In [None]:
import calendar

train_dataset_with_months = unbatched_train_dataset.copy()
# Map each row's timestamp to the name of its month via the calendar module.
train_dataset_with_months['Month.Name'] = [
    calendar.month_name[timestamp.month]
    for timestamp in train_dataset_with_months.index
]
train_dataset_with_months

In [None]:
# Month names in calendar order. Index 0 of calendar.month_name is skipped.
MONTH_NAMES = [calendar.month_name[month_number] for month_number in range(1, 13)]
dataset_group_by_month = train_dataset_with_months.groupby(by='Month.Name')
mean_returning_by_month = dataset_group_by_month['Returning.Visits'].mean()
mean_returning_by_month.loc[MONTH_NAMES]

In [None]:
# Bar chart of the mean returning visits per month, in calendar order.
monthly_mean_frame = dataset_group_by_month['Returning.Visits'].mean().to_frame()
monthly_mean_frame.loc[MONTH_NAMES].plot(kind='bar')
plt.show()

## Prepare the dataset

In [None]:
train_dataset_with_months

In [None]:
# Keep only the two categorical features and the target, as an independent copy.
model_3_columns = ['Day', 'Month.Name', 'Returning.Visits']
dataset2 = train_dataset_with_months[model_3_columns].copy()
dataset2

In [None]:
def windowize_dataset(dataset, window_size=None, column='Returning.Visits'):
    """Add lagged copies of ``column`` to ``dataset`` as new feature columns.

    For every lag ``k`` in ``1..window_size`` a column named
    ``'<column>[t-k]'`` is added. It holds ``column`` shifted down by ``k``
    rows, so its first ``k`` rows are NaN.

    Args:
        dataset: DataFrame containing ``column``. It is modified in place, so
            pass a copy to keep the original untouched.
        window_size: Number of lag columns to add. Defaults to the
            module-level ``WINDOW_SIZE``.
        column: Name of the column to lag.

    Returns:
        The same ``dataset`` object, with the lag columns added.
    """
    if window_size is None:
        window_size = WINDOW_SIZE
    for lag in range(1, window_size + 1):
        dataset[f'{column}[t-{lag}]'] = dataset[column].shift(periods=lag)
    return dataset

# Pass a copy: windowize_dataset adds its lag columns to the frame it receives.
dataset2 = windowize_dataset(dataset2.copy())
dataset2

In [None]:
# Drop rows with missing values (the shifted lag columns are NaN in their
# first rows).
dataset2 = dataset2.dropna()
dataset2

In [None]:
# Names of the lag columns, ordered from the most recent lag (t-1) to the
# oldest (t-WINDOW_SIZE).
rv_cols = [f"Returning.Visits[t-{i+1}]" for i in range(WINDOW_SIZE)]

# Sequence input of the multi-input model: the lagged target values.
dataset2_rv_history_features = dataset2[rv_cols]
dataset2_rv_history_features

In [None]:
# Categorical input of the multi-input model: day name and month name.
dataset2_cat_features =  dataset2[['Day', 'Month.Name']]
dataset2_cat_features

In [None]:
# Despite its name, this holds only the training targets (the
# 'Returning.Visits' column), not a full dataset.
train_dataset2 = dataset2['Returning.Visits']
train_dataset2

## Building the model

In [None]:
from tensorflow.keras.layers import Concatenate, Dropout

tf.random.set_seed(42)
def build_model_3():
    """Build the two-input forecaster.

    One branch runs a GRU over the window of lagged target values. The other
    passes the two encoded categorical features through a dense layer. Both
    are concatenated and fed to a dense regression head.

    Returns:
        An uncompiled Keras ``Model`` named ``"model_3"`` whose inputs are
        ``[sequence_input, categorical_input]`` and whose output is a single
        linear unit.
    """
    seq_input = Input(shape=(WINDOW_SIZE,))
    # Recurrent layers expect (batch, timesteps, features). Add the feature
    # axis last so the GRU steps through WINDOW_SIZE timesteps, not a single
    # timestep that carries WINDOW_SIZE features.
    lambda_layer = Lambda(lambda x: x[..., tf.newaxis])(seq_input)
    # The lag features are supplied newest-first (columns t-1, t-2, ...), so
    # read the sequence backwards to process it in chronological order.
    rnn_layer = GRU(64, activation='relu', go_backwards=True)(lambda_layer)

    # Two categorical features per sample, supplied as numeric codes.
    cat_input = Input(shape=(2,))
    cat_dense_layer = Dense(32, activation='relu')(cat_input)

    concat_layer = Concatenate()([rnn_layer, cat_dense_layer])
    dense_layer1 = Dense(128, activation='relu')(concat_layer)
    dropout_layer = Dropout(0.5)(dense_layer1)
    output_layer = Dense(1, activation='linear')(dropout_layer)

    return Model(inputs=[seq_input, cat_input], outputs=output_layer, name="model_3")

model_3 = build_model_3()
model_3.compile(
    loss=tf.keras.losses.MeanAbsoluteError(),
    optimizer=tf.keras.optimizers.Adam()
)

model_3.summary()

In [None]:
tf.keras.utils.plot_model(model_3)

In [None]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
# Encode day and month names as ordinal codes. Passing explicit category lists
# makes the codes follow the order of DAYS_OF_WEEK and MONTH_NAMES.
X_cat_encoder = OrdinalEncoder(categories = [DAYS_OF_WEEK, MONTH_NAMES])
# Fit on, and transform, the categorical features prepared above.
X_cat_encoded = X_cat_encoder.fit_transform(dataset2_cat_features)
X_cat_encoded, X_cat_encoder.categories_

In [None]:
from tensorflow.data import Dataset

# Inputs are passed in the order the model declares them: lagged values first,
# encoded categorical features second.
model3_history = model_3.fit(x=[dataset2_rv_history_features, X_cat_encoded], y=train_dataset2, epochs=5)
# Plot the quantities recorded in the training history.
pd.DataFrame(model3_history.history).plot()

In [None]:
# Start WINDOW_SIZE rows before the boundary so the first test row has a full
# set of lag values once the NaN rows are dropped.
test_rows_with_context = whole_dataset[TEST_DATA_BOUNDARY_INDEX - WINDOW_SIZE:].copy()
test_dataset2 = windowize_dataset(test_rows_with_context)
test_dataset2['Month.Name'] = [
    calendar.month_name[timestamp.month] for timestamp in test_dataset2.index
]
test_dataset2 = test_dataset2.dropna()
test_dataset2

In [None]:
# Sequence input for the test rows: the same lag columns used for training.
X_test_rv_history_input = test_dataset2[rv_cols]
X_test_rv_history_input

In [None]:
# Encode the test rows' categorical features with the already-fitted encoder.
X_test_cat_input = X_cat_encoder.transform(test_dataset2[['Day', 'Month.Name']])
X_test_cat_input.shape, X_test_cat_input[:5]

In [None]:
# Predict for the test rows; inputs are in the same order as for fit.
model_3_preds = model_3.predict([X_test_rv_history_input, X_test_cat_input])
model_3_preds[:15]

In [None]:
# Ground-truth targets for the test rows of the multi-input model.
y_dataset = test_dataset2['Returning.Visits']
y_dataset

In [None]:
def evaluate_model_predictions(y_true, predictions, model_name):
    """Score ready-made predictions, record them under ``model_name`` and plot them.

    Args:
        y_true: Observed values.
        predictions: Array of predicted values, in the same order as ``y_true``.
        model_name: Row label to use in ``MODEL_METRICS``.

    Returns:
        The metrics dict produced by ``evaluate_predictions``.
    """
    scores = evaluate_predictions(y_true, predictions)

    # Add (or overwrite) the row for this model in the scoreboard.
    MODEL_METRICS.loc[model_name] = scores
    plot_time_series(predictions.ravel(), start_index=1900)
    return scores


evaluate_model_predictions(y_dataset, model_3_preds, 'model_3 (multi-input)')

In [None]:
MODEL_METRICS

# Model 5: Ensemble methods

In [None]:
def build_model_5(n_models, loss_fns):
    """Build and compile an ensemble of identically shaped GRU forecasters.

    The models are only constructed and compiled. This function never calls
    ``fit``.

    Args:
        n_models: Number of models to create for each loss function.
        loss_fns: Iterable of Keras loss identifiers, e.g. ``['mae', 'mse']``.

    Returns:
        A flat list of ``n_models * len(loss_fns)`` compiled models, grouped
        by loss function in the order given.
    """
    models = []
    for loss_fn in loss_fns:
        print(f"Building {n_models} models for {loss_fn} loss...")
        for _ in range(n_models):
            model = Sequential([
                Input(shape=(WINDOW_SIZE,)),
                # Recurrent layers expect (batch, timesteps, features). Add
                # the feature axis last so the GRU steps through WINDOW_SIZE
                # timesteps, not one timestep of WINDOW_SIZE features.
                Lambda(lambda x: tf.expand_dims(x, axis=-1)),
                GRU(128, activation='relu'),
                Dense(1, activation='linear')
            ])

            model.compile(loss=loss_fn, optimizer=tf.keras.optimizers.Adam())
            models.append(model)

    return models


model_5 = build_model_5(n_models=5, loss_fns=['mae', 'mse', 'mape'])
model_5

In [None]:
model_5[0].summary()

In [None]:
# Fit every ensemble member on the same training windows, one after another.
ensemble_size = len(model_5)
for position, model in enumerate(model_5, start=1):
    print(f"Training model {position} out of {ensemble_size} models")
    model.fit(train_dataset, epochs=5, verbose=0)

In [None]:
def ensemble_prediction(models):
    """Collect the test-set predictions of every model in ``models``.

    Args:
        models: Iterable of objects exposing ``predict``.

    Returns:
        A NumPy array that stacks each model's predictions on
        ``test_dataset`` along a new leading axis.
    """
    return np.array([member.predict(test_dataset, verbose=0) for member in models])

model_5_all_preds = ensemble_prediction(model_5)
model_5_all_preds.shape

In [None]:
model_5_all_preds.shape

In [None]:
def aggregate_ensemble_predictions(predictions):
    """Average the predictions of all ensemble members.

    Args:
        predictions: Array-like whose first axis indexes the ensemble members.

    Returns:
        NumPy array holding the mean over the first axis.
    """
    # Pure NumPy avoids a round-trip through a TensorFlow tensor. It also
    # yields a true (floating-point) mean when the input is integer.
    return np.mean(np.asarray(predictions), axis=0)

# Collapse the per-model predictions into one ensemble forecast.
model_5_preds = aggregate_ensemble_predictions(model_5_all_preds)
model_5_preds.shape

In [None]:
evaluate_model_predictions(y_true, model_5_preds, 'model_5 (ensemble)')

In [None]:
MODEL_METRICS