In [None]:
import numpy as np
import pandas as pd
from datetime import datetime

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px

In [None]:
# Load the raw session log into a DataFrame.
# NOTE(review): absolute /content/ path — looks like a Colab upload location; adjust when running elsewhere.
ds = pd.read_csv("/content/2018-2019duplicate.csv")
# Summary statistics of the numeric columns as a quick sanity check.
ds.describe()

Unnamed: 0.1,Unnamed: 0,diffmins
count,614.0,614.0
mean,306.5,20737.237785
std,177.390811,8869.976794
min,0.0,5055.0
25%,153.25,15003.0
50%,306.5,17826.5
75%,459.75,26334.0
max,613.0,44018.0


In [None]:
# Peek at the first two rows to confirm the column layout.
ds.head(2)

Unnamed: 0.1,Unnamed: 0,connect date,diffmins
0,0,4/25/18,18212
1,1,4/26/18,22092


In [None]:
# Keep only the columns used for modelling.
# NOTE(review): the outputs shown in this notebook list only 'connect date' and 'diffmins',
# so 'connect time' appears to be absent from the file and is not carried over — confirm.
df = ds.filter(items=['connect time','connect date','diffmins'],
                axis=1)

In [None]:
# Count missing values per column before any transformation.
df.isnull().sum()

connect date    0
diffmins        0
dtype: int64

In [None]:
# Convert the date strings to pandas datetimes so they can be sorted and grouped chronologically.
# NOTE(review): no explicit format is given; the sample rows look like month/day/2-digit-year — confirm.
df['connect date'] = pd.to_datetime(df['connect date'])
df.head(2)

Unnamed: 0,connect date,diffmins
0,2018-04-25,18212
1,2018-04-26,22092


In [None]:
# Put the rows in chronological order; rebinding the name (instead of sorting in place)
# keeps the cell free of in-place mutation while producing the same sorted frame.
df = df.sort_values(by='connect date')
df.head(2)

Unnamed: 0,connect date,diffmins
0,2018-04-25,18212
1,2018-04-26,22092


In [None]:
# Collapse to one row per date by summing diffmins over all rows that share the same date.
# as_index=False keeps 'connect date' as a regular column rather than the index.
df = df.groupby('connect date', as_index=False)['diffmins'].sum()
df.head(2)

Unnamed: 0,connect date,diffmins
0,2018-04-25,18212
1,2018-04-26,22092


In [None]:
# (rows, columns) of the aggregated frame.
df.shape

(614, 2)

In [None]:
# Pull both columns out as NumPy arrays; these feed the windowing, splitting and plotting cells below.
date = df['connect date'].to_numpy()
duration = df['diffmins'].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Chronological hold-out on the raw (un-windowed) series: the first 80% of the
# points are the training part, the remainder the test part. No shuffling, so
# the time order is preserved.
split_size = int(0.8*len(duration))

X_train, X_test = date[:split_size], date[split_size:]
y_train, y_test = duration[:split_size], duration[split_size:]

In [None]:
HORIZON = 1  # passed as `horizon` to make_windows: number of trailing values per window used as label
WINDOW_SIZE = 7  # passed as `window_size` to make_windows: number of past values used as features

In [None]:
def get_labelled_windows(x, horizon=1):
  """Split each row of a 2-D array into a feature part and a label part.

  Args:
    x: 2-D array whose rows are windows of consecutive values.
    horizon: number of trailing columns of every row that form the label.

  Returns:
    Tuple `(windows, labels)`: all columns except the last `horizon`, and the
    last `horizon` columns.
  """
  cut = -horizon
  features = x[:, :cut]
  targets = x[:, cut:]
  return features, targets

In [None]:
def make_windows(x, window_size=7, horizon=1):
  """Turn a 1-D series into overlapping (window, label) pairs.

  Every run of `window_size + horizon` consecutive values becomes one row; the
  row is then split by `get_labelled_windows` into its first `window_size`
  values (the window) and its last `horizon` values (the label).

  Args:
    x: 1-D NumPy array of values.
    window_size: number of values in each window.
    horizon: number of values in each label.

  Returns:
    Tuple `(windows, labels)` of 2-D arrays with one row per starting position.
  """
  span = window_size + horizon
  # One row per valid starting position in the series.
  starts = np.arange(len(x) - (span - 1))[:, np.newaxis]
  # Column offsets inside a single row.
  offsets = np.arange(span)[np.newaxis, :]
  # Broadcasting start + offset gives the full index matrix in one step.
  window_indexes = offsets + starts
  return get_labelled_windows(x[window_indexes], horizon=horizon)

In [None]:
# Slice the whole duration series into (window, label) pairs.
full_windows, full_labels = make_windows(duration, window_size=WINDOW_SIZE, horizon=HORIZON)

# The two counts should match: one label row per window row.
len(full_windows), len(full_labels)

(607, 607)

In [None]:
# Print the first three pairs to eyeball how windows and labels line up.
for i in range(3):
  print(f"Window: {full_windows[i]} -> Label: {full_labels[i]}")

Window: [18212 22092 17915 16990 11718 19576 23712] -> Label: [26817]
Window: [22092 17915 16990 11718 19576 23712 26817] -> Label: [31076]
Window: [17915 16990 11718 19576 23712 26817 31076] -> Label: [28924]


In [None]:
def make_train_test_splits(windows, labels, test_split=0.2):
  """Split windows and labels chronologically into train and test parts.

  The leading `1 - test_split` share of the rows (rounded down) is the train
  part and the rest is the test part; nothing is shuffled.

  Args:
    windows: sequence of windows.
    labels: sequence of labels, aligned row-for-row with `windows`.
    test_split: fraction of rows placed in the test part (default 0.2).

  Returns:
    Tuple `(train_windows, test_windows, train_labels, test_labels)`.
  """
  cut = int(len(windows) * (1-test_split))
  train_windows, test_windows = windows[:cut], windows[cut:]
  train_labels, test_labels = labels[:cut], labels[cut:]
  return train_windows, test_windows, train_labels, test_labels

In [None]:
# Chronological split of the windowed data using the function's default test_split.
train_windows, test_windows, train_labels, test_labels = make_train_test_splits(full_windows, full_labels)

# Row counts of each part.
len(train_windows), len(test_windows), len(train_labels), len(test_labels)

(485, 122, 485, 122)

In [None]:
# First five training windows; consecutive rows are shifted by one step.
train_windows[:5]

array([[18212, 22092, 17915, 16990, 11718, 19576, 23712],
       [22092, 17915, 16990, 11718, 19576, 23712, 26817],
       [17915, 16990, 11718, 19576, 23712, 26817, 31076],
       [16990, 11718, 19576, 23712, 26817, 31076, 28924],
       [11718, 19576, 23712, 26817, 31076, 28924, 20153]])

In [None]:
# Sizes of the raw (un-windowed) split, for comparison with the windowed split sizes.
len(X_train),len(y_train),len(X_test),len(y_test)

(491, 491, 123, 123)

In [None]:
import plotly.express as px
import plotly.graph_objects as go

XGBoost

In [None]:
import xgboost as xgb
# XGBoost regressor configured with 200 estimators, a maximum tree depth of 1,
# and reg_lambda / gamma both set to 1.
regressor = xgb.XGBRegressor(
    n_estimators=200,
    reg_lambda=1,
    gamma=1,
    max_depth=1
)
# Each window is one feature row and its label the regression target.
regressor.fit(train_windows, train_labels)

Random Forest

In [None]:
# Fitting Random Forest Regression to the dataset
from sklearn.ensemble import RandomForestRegressor
regressor1 = RandomForestRegressor(n_estimators = 10, random_state = 0)
# The windowing step yields labels as a 2-D array with one column per horizon step.
# For a single-step horizon scikit-learn expects a flat (n_samples,) target and warns
# about a column vector otherwise, so flatten in that case; multi-step labels are
# passed through unchanged.
rf_train_targets = train_labels.ravel() if train_labels.shape[1] == 1 else train_labels
regressor1.fit(train_windows, rf_train_targets)

  regressor1.fit(train_windows, train_labels)


In [None]:
# Predict the held-out windows with the XGBoost model (`regressor`).
y_pred = regressor.predict(test_windows)
y_pred

array([14444.481 , 10563.557 , 14574.742 , 15681.512 , 16811.35  ,
       16760.912 , 16935.877 , 15034.36  , 11035.644 , 11185.959 ,
       15046.746 , 16496.318 , 17106.223 , 16141.116 , 16897.271 ,
       14227.164 , 15504.386 , 16286.287 , 15861.55  , 16086.478 ,
       15664.923 , 15938.152 , 15147.952 , 14851.426 , 16864.107 ,
       16315.703 , 15224.803 , 15507.771 , 14894.893 , 16141.116 ,
       17106.06  , 15164.541 , 16221.466 , 16336.173 , 16448.69  ,
       15490.614 , 14607.265 , 13664.211 , 14721.267 , 14794.174 ,
       14909.014 , 15436.398 , 16026.521 , 15508.018 , 15271.672 ,
       14846.52  , 15148.198 , 14794.174 , 14721.267 , 16313.193 ,
       16740.125 , 15861.796 , 15271.672 , 14558.074 , 15434.864 ,
       16378.522 , 16666.973 , 16378.522 , 15769.317 , 15788.643 ,
       16566.27  , 16566.27  , 16566.27  , 15147.952 , 15664.923 ,
       15311.145 , 15498.892 , 16666.973 , 15436.398 , 15148.198 ,
       14204.296 , 15271.672 , 16077.089 , 16451.68  , 15271.6

In [None]:
# Line plot of the raw chronological split: training part and held-out part of
# the actual series drawn as two separate traces.
viz_train = go.Scatter(x=X_train, y=y_train, mode='lines', name='Actual Train')
viz_test = go.Scatter(x=X_test, y=y_test, mode='lines', name='Actual Test')

layout = go.Layout(
    title='Session duration',
    xaxis=dict(title="Date"),
    yaxis=dict(title="session"),
)

fig = go.Figure(data=[viz_train, viz_test], layout=layout)
fig.show()

In [None]:
# Dates belonging to the windowed test labels. The labels are the trailing
# len(test_labels) values of the series, so take the same number of trailing
# dates. X_test must not be used here: it comes from the un-windowed split and
# is one entry longer, which would draw both curves one day too early.
test_dates = date[len(date) - len(test_labels):]

trace1 = go.Scatter(
    x = test_dates,
    y = test_labels[:, 0],
    mode = 'lines',
    name = 'Actual Test'
)

trace2 = go.Scatter(
    x = test_dates,
    y = y_pred,
    mode = 'lines',
    name = 'Predicted Test'
)

layout = go.Layout(
    title = 'Session Duration',
    xaxis = {'title' : "Date"},
    yaxis = {'title' : "duration"}
)
fig = go.Figure(data=[trace1, trace2], layout=layout)
fig.show()

In [None]:
import tensorflow as tf

In [None]:
def evaluate_preds(y_true, y_pred):
  """Compute MAE, MSE, RMSE and MAPE between true and predicted values.

  Both inputs are cast to float32 and passed to the corresponding
  `tf.keras.metrics` functions; RMSE is the square root of the MSE result.
  MASE is not computed.

  NOTE(review): these Keras functions appear to reduce over the last axis only,
  so inputs of different rank (e.g. (n, 1) labels vs (n,) predictions) may
  broadcast against each other — confirm shapes before relying on the numbers.

  Args:
    y_true: ground-truth values.
    y_pred: predicted values.

  Returns:
    Dict with keys "mae", "mse", "rmse" and "mape" holding NumPy values.
  """
  # Metric calculations are done in float32.
  y_true = tf.cast(y_true, dtype=tf.float32)
  y_pred = tf.cast(y_pred, dtype=tf.float32)

  keras_metrics = tf.keras.metrics
  mae = keras_metrics.mean_absolute_error(y_true, y_pred)
  # Squared errors put extra weight on outliers.
  mse = keras_metrics.mean_squared_error(y_true, y_pred)
  rmse = tf.sqrt(mse)
  mape = keras_metrics.mean_absolute_percentage_error(y_true, y_pred)

  tensors = {"mae": mae, "mse": mse, "rmse": rmse, "mape": mape}
  return {metric: tensor.numpy() for metric, tensor in tensors.items()}

In [None]:
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [None]:
# MAPE of the XGBoost predictions against the windowed test labels.
# NOTE(review): scikit-learn is documented to return this as a fraction rather than a percentage — confirm.
mean_absolute_percentage_error(test_labels, y_pred)

0.15237327881963206

In [None]:
#from google.colab import files
#df.to_csv('2018-2019duplicate.csv')
#files.download('2018-2019duplicate.csv')

In [None]:
FUTURE = 10  # number of steps to forecast past the end of the observed series

def make_future_forecast(values, model, into_future, window_size=WINDOW_SIZE) -> list:
  """Forecast `into_future` steps ahead by feeding predictions back into the model.

  Starts from the last `window_size` entries of `values`, predicts the next
  step, appends that prediction to the window, drops the oldest entry and
  repeats. Each step is printed as it is produced.

  Args:
    values: 1-D array of observed values; only its last `window_size` entries are used.
    model: fitted estimator exposing `predict` on a 2-D array with one row per window.
    into_future: number of steps to forecast.
    window_size: number of most recent values fed to the model at each step.

  Returns:
    List with one prediction per forecast step, in chronological order.
  """
  future_forecast = []
  # Honour the `window_size` argument (not the module-level constant) so that
  # callers passing a different size get windows of that size.
  last_window = values[-window_size:]

  for _ in range(into_future):
    # Add a leading batch axis: the model expects a 2-D array with one row per window.
    future_pred = model.predict(np.expand_dims(last_window, axis=0))
    # Collapse the single prediction to a NumPy scalar; `[()]` unwraps a 0-d array
    # and leaves arrays with more than one element untouched.
    prediction = np.squeeze(future_pred)[()]

    print(f"Predicting on: \n {last_window} -> Prediction: {prediction}\n")

    future_forecast.append(prediction)

    # Slide the window: append the new prediction and keep the most recent `window_size` values.
    last_window = np.append(last_window, future_pred)[-window_size:]

  return future_forecast

In [None]:
 # Roll the XGBoost model forward FUTURE steps past the end of the observed series.
 future_forecast = make_future_forecast(values=duration,
                                        model=regressor,
                                        into_future=FUTURE,
                                        window_size=WINDOW_SIZE)

[10875 10764 10546  9854 10764  9423 10679]
Predicting on: 
 [10875 10764 10546  9854 10764  9423 10679] -> Prediction: 15923.4384765625

[10764.         10546.          9854.         10764.
  9423.         10679.         15923.43847656]
Predicting on: 
 [10764.         10546.          9854.         10764.
  9423.         10679.         15923.43847656] -> Prediction: 16148.36328125

[10546.          9854.         10764.          9423.
 10679.         15923.43847656 16148.36328125]
Predicting on: 
 [10546.          9854.         10764.          9423.
 10679.         15923.43847656 16148.36328125] -> Prediction: 15527.896484375

[ 9854.         10764.          9423.         10679.
 15923.43847656 16148.36328125 15527.89648438]
Predicting on: 
 [ 9854.         10764.          9423.         10679.
 15923.43847656 16148.36328125 15527.89648438] -> Prediction: 11734.529296875

[10764.          9423.         10679.         15923.43847656
 16148.36328125 15527.89648438 11734.52929688]
Predicti

In [None]:
# Display the XGBoost forecast values.
future_forecast[:]

[15923.438,
 16148.363,
 15527.896,
 11734.529,
 13931.11,
 9952.189,
 15209.938,
 16071.164,
 17463.717,
 15728.136]

In [None]:
def get_future_dates(start_date, into_future, offset=1):
   """Return consecutive daily dates that follow `start_date`.

   Args:
      start_date: reference date to which a NumPy day timedelta can be added.
      into_future: number of days to generate.
      offset: days between `start_date` and the first generated date.

   Returns:
      Array of dtype datetime64[D] with `into_future` dates, the first one
      being `start_date + offset` days.
   """
   first_day = start_date + np.timedelta64(offset, "D")
   # The stop value is exclusive, so exactly `into_future` days are produced.
   stop_day = first_day + np.timedelta64(into_future, "D")
   return np.arange(first_day, stop_day, dtype="datetime64[D]")

In [None]:
# Last date in the aggregated frame; used below as the starting point for the forecast dates.
last_timestep = df['connect date'].iloc[-1]

last_timestep

Timestamp('2019-12-31 00:00:00')

RF Forecast

In [None]:
# Same rolling forecast as for XGBoost, this time driven by the Random Forest model (`regressor1`).
future_forecast_rf = make_future_forecast(values=duration,
                                          model=regressor1,
                                          into_future=FUTURE,
                                          window_size=WINDOW_SIZE)

[10875 10764 10546  9854 10764  9423 10679]
Predicting on: 
 [10875 10764 10546  9854 10764  9423 10679] -> Prediction: 10468.0

[10764. 10546.  9854. 10764.  9423. 10679. 10468.]
Predicting on: 
 [10764. 10546.  9854. 10764.  9423. 10679. 10468.] -> Prediction: 10706.9

[10546.   9854.  10764.   9423.  10679.  10468.  10706.9]
Predicting on: 
 [10546.   9854.  10764.   9423.  10679.  10468.  10706.9] -> Prediction: 8853.5

[ 9854.  10764.   9423.  10679.  10468.  10706.9  8853.5]
Predicting on: 
 [ 9854.  10764.   9423.  10679.  10468.  10706.9  8853.5] -> Prediction: 10706.9

[10764.   9423.  10679.  10468.  10706.9  8853.5 10706.9]
Predicting on: 
 [10764.   9423.  10679.  10468.  10706.9  8853.5 10706.9] -> Prediction: 8853.5

[ 9423.  10679.  10468.  10706.9  8853.5 10706.9  8853.5]
Predicting on: 
 [ 9423.  10679.  10468.  10706.9  8853.5 10706.9  8853.5] -> Prediction: 11392.5

[10679.  10468.  10706.9  8853.5 10706.9  8853.5 11392.5]
Predicting on: 
 [10679.  10468.  10706.9  8

In [None]:
# Dates for the FUTURE forecast steps, beginning the day after last_timestep (offset defaults to 1).
next_time_steps = get_future_dates(start_date=last_timestep, into_future=FUTURE)

next_time_steps

array(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
       '2020-01-05', '2020-01-06', '2020-01-07', '2020-01-08',
       '2020-01-09', '2020-01-10'], dtype='datetime64[D]')

In [None]:
 # Prepend the last observed date and value so the forecast line starts where the actual series ends.
 # NOTE: not idempotent — every execution of this cell prepends another copy.
 next_time_steps = np.insert(next_time_steps, 0, last_timestep)
 future_forecast = np.insert(future_forecast, 0, duration[-1])

 next_time_steps, future_forecast

(array(['2019-12-31', '2020-01-01', '2020-01-02', '2020-01-03',
        '2020-01-04', '2020-01-05', '2020-01-06', '2020-01-07',
        '2020-01-08', '2020-01-09', '2020-01-10'], dtype='datetime64[D]'),
 array([10679.   , 15923.438, 16148.363, 15527.896, 11734.529, 13931.11 ,
         9952.189, 15209.938, 16071.164, 17463.717, 15728.136],
       dtype=float32))

In [None]:
 # Prepend the last observed value to the Random Forest forecast. The date axis is
 # deliberately NOT extended again: next_time_steps already starts at the last
 # observed date, and inserting it a second time would leave the dates one entry
 # longer than the forecasts and shift the plotted curves by a day.
 future_forecast_rf = np.insert(future_forecast_rf, 0, duration[-1])

 next_time_steps, future_forecast_rf

(array(['2019-12-31', '2019-12-31', '2020-01-01', '2020-01-02',
        '2020-01-03', '2020-01-04', '2020-01-05', '2020-01-06',
        '2020-01-07', '2020-01-08', '2020-01-09', '2020-01-10'],
       dtype='datetime64[D]'),
 array([10679. , 10468. , 10706.9,  8853.5, 10706.9,  8853.5, 11392.5,
         9904.2, 11352.1,  9324.4, 12395.5]))

In [None]:
# Pair every forecast with exactly as many trailing dates as it has values.
# next_time_steps can carry extra leading entries (the last observed date is
# prepended in the cells that prepare each forecast), which would otherwise
# attach forecast values to the wrong dates.
xgb_dates = next_time_steps[max(len(next_time_steps) - len(future_forecast), 0):]
rf_dates = next_time_steps[max(len(next_time_steps) - len(future_forecast_rf), 0):]

# Full observed series.
trace1 = go.Scatter(
    x = date,
    y = duration,
    mode = 'lines',
    name = 'Actual'
)

# XGBoost forecast.
trace2 = go.Scatter(
    x = xgb_dates,
    y = future_forecast,
    mode = 'lines',
    name = 'Forecasted XgB'
)

# Random Forest forecast.
trace3 = go.Scatter(
    x = rf_dates,
    y = future_forecast_rf,
    mode = 'lines',
    name = 'Forecasted RF'
)

layout = go.Layout(
    title = 'Session Duration',
    xaxis = {'title' : "Date"},
    yaxis = {'title' : "Duration"}
)

fig = go.Figure(data=[trace1, trace2,trace3], layout=layout)

# Place the legend inside the plotting area and enlarge its font.
fig.update_layout(
    legend=dict(
        x=.2,
        y=.8,
        traceorder="normal",
        font=dict(
            family="sans-serif",
            size=16,
            color="black"
        ),
    )
)

fig.show()