# Utilization Prediction: Prediction of hourly utilization of the two sites

## Import libraries

In [140]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import os
os.environ["KERAS_BACKEND"] = "torch"

import keras
from keras import Sequential
from keras.layers import Dense

%matplotlib inline

## Load and prepare data

In [158]:
# Load data
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling --
# only load pickle files that come from a trusted source.
charging_data = pd.read_pickle("data/charging_modified.pkl")
weather_data = pd.read_csv("data/weather_modified.csv")

# TODO: more filtering of features
# Drop weather columns that are not used as features.
weather_data = weather_data.drop(columns = ["city", "cloud_cover_description", "pressure", "felt_temperature"])
# Parse the timestamps as UTC and truncate them to the full hour
# ("h" is the current spelling of the hourly alias; the upper-case "H" is deprecated).
weather_data["timestamp"] = pd.to_datetime(weather_data["timestamp"], utc = True).dt.floor(freq = "h")

# Drop charging columns that are not used as features.
charging_data = charging_data.drop(columns = ["kWhPerMinute", "NoChargingTime", "user_kWhRequested", "user_modifiedAt", "doneChargingTime", "NoChargingTimeMinutes", "spaceID", "stationID",
                                              "user_userID", "ChargingTime", "user_paymentRequired_values"])

# Delete rows with null values and renumber the remaining rows 0..n-1,
# so that the row index has no gaps left over from the dropped rows.
charging_data = charging_data.dropna().reset_index(drop = True)

# Store the site identifier as a categorical.
# DataFrame.astype returns a new frame, so the result has to be assigned back
# (previously the converted frame was silently discarded).
charging_data = charging_data.astype({
    "siteID": "category"
})

# check user_paymentRequired_values again?

# One-hot encode the siteID column (one integer indicator column per site).
charging_data = pd.get_dummies(charging_data, columns = ["siteID"], dtype = "int64")
charging_data.head()

Unnamed: 0,id,connectionTime,disconnectTime,kWhDelivered,userID,ChargingTimeMinutes,month,user_requestedDeparture,user_minutesAvailable,user_milesRequested,user_WhPerMile,siteID_1,siteID_2
0,5e23b149f9af8b5fe4b973cf,2020-01-02 13:08:54+00:00,2020-01-02 19:11:15+00:00,25.016,194.0,262.683333,1,"Thu, 02 Jan 2020 20:51:54 GMT",463.0,100.0,250.0,1,0
1,5e23b149f9af8b5fe4b973d0,2020-01-02 13:36:50+00:00,2020-01-02 22:38:21+00:00,33.097,4275.0,401.25,1,"Thu, 02 Jan 2020 23:31:50 GMT",595.0,250.0,280.0,1,0
2,5e23b149f9af8b5fe4b973d1,2020-01-02 13:56:35+00:00,2020-01-03 00:39:22+00:00,6.521,344.0,158.516667,1,"Thu, 02 Jan 2020 14:56:35 GMT",60.0,20.0,400.0,1,0
3,5e23b149f9af8b5fe4b973d2,2020-01-02 13:59:58+00:00,2020-01-02 16:38:39+00:00,2.355,1117.0,78.783333,1,"Thu, 02 Jan 2020 15:04:58 GMT",65.0,20.0,400.0,1,0
4,5e23b149f9af8b5fe4b973d3,2020-01-02 14:00:01+00:00,2020-01-02 22:08:40+00:00,13.375,334.0,257.483333,1,"Thu, 02 Jan 2020 22:24:01 GMT",504.0,40.0,400.0,1,0


## Create feature vector X and labels Y

In [159]:
def split_time_interval(index, start_date, end_date):
    """Split the interval [start_date, end_date] at every full-hour boundary.

    The interval is cut into consecutive segments: the first one starts at
    ``start_date``, every following one starts at a full hour, and the last
    one ends at ``end_date``. Each segment therefore lies inside exactly one
    clock hour and the segment lengths add up to the total duration.

    Parameters
    ----------
    index
        Unused; kept so existing callers keep working.
    start_date : pd.Timestamp
        Start of the interval (connection time).
    end_date : pd.Timestamp
        End of the interval (disconnect time). Expected to be >= start_date.

    Returns
    -------
    list
        ``[minutes, segment_starts]`` where ``minutes`` is a list with the
        length of every segment in (fractional) minutes and
        ``segment_starts`` is a ``pd.DatetimeIndex`` with the start timestamp
        of every segment. Both have the same length.
    """
    one_hour = pd.Timedelta(hours = 1)

    # Full-hour boundaries strictly inside the interval. Filtering with
    # "< end_date" avoids a zero-length trailing segment when the interval
    # ends exactly on the hour; date_range is empty when start and end lie
    # within the same clock hour.
    first_boundary = start_date.floor(freq = "h") + one_hour
    boundaries = pd.date_range(first_boundary, end_date, freq = "h")
    boundaries = boundaries[boundaries < end_date]

    # Segment i runs from segment_starts[i] to segment_ends[i].
    segment_starts = pd.DatetimeIndex([start_date]).append(boundaries)
    segment_ends = boundaries.append(pd.DatetimeIndex([end_date]))

    # Exact duration of each segment; previously the hour containing end_date
    # was counted twice and same-hour sessions were over-counted.
    time_list = ((segment_ends - segment_starts) / pd.Timedelta(minutes = 1)).tolist()

    return [time_list, segment_starts]


def create_minute_table(id, minute_array):
    """Build one (id, minutes, timestamp) triple per entry of ``minute_array``.

    ``minute_array[0]`` holds the minute values and ``minute_array[1]`` the
    matching timestamps; the same ``id`` is attached to every pair.
    """
    minute_values = minute_array[0]
    segment_stamps = minute_array[1]
    return [(id, minute_value, stamp) for minute_value, stamp in zip(minute_values, segment_stamps)]

In [160]:
# Build, for every charging session, its list of (id, minutes, timestamp) triples.
# The columns are iterated directly instead of doing per-row .loc lookups: this
# is faster and does not rely on the row labels also being valid positions
# (the previous `charging_data.index[index]` only worked on a default RangeIndex).
temp = [
    create_minute_table(
        session_id,
        split_time_interval(row_label, connected_at, disconnected_at),
    )
    for row_label, session_id, connected_at, disconnected_at in zip(
        charging_data.index,
        charging_data["id"],
        charging_data["connectionTime"],
        charging_data["disconnectTime"],
    )
]

In [161]:
# Flatten the per-session lists of (id, minutes, timestamp) triples into three
# parallel lists: ids, charging time in minutes and timestamps.
# (Loop variables no longer shadow the builtins `list` and `tuple`.)
ids, charging_times, timestamps = map(
    list,
    zip(*(entry for session_entries in temp for entry in session_entries)),
)

# One row per (session, time segment): session id, charging minutes, timestamp.
charging_time_per_hour = pd.DataFrame(
                            {"id": ids,
                             "charging_time_per_hour": charging_times,
                             "timestamp": timestamps})
# Truncate every timestamp to its full hour so rows can be grouped per hour.
charging_time_per_hour["timestamp"] = charging_time_per_hour["timestamp"].dt.floor(freq = "h")

# Join the hourly rows with the rest of the charging data.
# The result must be named X_temp because the groupby below reads X_temp
# (it was previously assigned to X, which raised a NameError on a fresh kernel).
X_temp = charging_data.merge(charging_time_per_hour, how = "outer", on = "id").drop(
    columns = ["connectionTime", "disconnectTime", "user_requestedDeparture", "id", "userID", "charging_time_per_hour", "month"])

# Aggregate the session features per hourly timestamp.
X = X_temp.groupby("timestamp").agg({"kWhDelivered": "sum",
                                     "ChargingTimeMinutes": "sum",
                                     "user_minutesAvailable": "sum",
                                     "user_milesRequested": "sum",
                                     "user_WhPerMile": "mean"
                                         })

# Extract the month from the timestamp index.
X["month"] = X.index.month.astype("int64")

# Add weather data to the feature vector. The inner join drops hours without
# weather data and repeats hours that have several weather rows.
X = X.reset_index().merge(weather_data, how = "inner", on = "timestamp").reset_index(drop = True)

# roughly 50 EV charging stations per site * 60 minutes
maximum_utilization = 50 * 60

# Total charging minutes per hour; only the numeric column is summed
# (summing the whole frame also "summed" the string id column).
minutes_per_hour = charging_time_per_hour.groupby("timestamp")["charging_time_per_hour"].sum()

# Look the label up by each row's timestamp so that Y[i] belongs to X.iloc[i].
# Previously Y had one entry per charging hour while X had been joined with the
# weather data, so the two were not row-aligned.
# NOTE(review): the label is maximum_utilization minus the charged minutes,
# i.e. it looks like unused capacity rather than utilization -- confirm intent.
Y = (maximum_utilization - X["timestamp"].map(minutes_per_hour)).tolist()
assert len(Y) == len(X), "features and labels must have the same number of rows"
X

Unnamed: 0,timestamp,kWhDelivered,ChargingTimeMinutes,user_minutesAvailable,user_milesRequested,user_WhPerMile,month,temperature,cloud_cover,windspeed,precipitation
0,2018-04-30 15:00:00+00:00,47.808,560.033333,550.0,170.0,350.0,4,14.0,26.0,13.0,0.0
1,2018-04-30 15:00:00+00:00,47.808,560.033333,550.0,170.0,350.0,4,14.0,26.0,9.0,0.0
2,2018-04-30 16:00:00+00:00,47.808,560.033333,550.0,170.0,350.0,4,16.0,28.0,11.0,0.0
3,2018-04-30 17:00:00+00:00,47.808,560.033333,550.0,170.0,350.0,4,16.0,28.0,9.0,0.0
4,2018-04-30 17:00:00+00:00,47.808,560.033333,550.0,170.0,350.0,4,16.0,26.0,9.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
17310,2020-12-31 17:00:00+00:00,10.578,91.333333,960.0,200.0,283.0,12,13.0,34.0,22.0,0.0
17311,2020-12-31 18:00:00+00:00,10.571,202.933333,480.0,40.0,350.0,12,16.0,34.0,26.0,0.0
17312,2020-12-31 19:00:00+00:00,10.571,202.933333,480.0,40.0,350.0,12,17.0,34.0,19.0,0.0
17313,2020-12-31 20:00:00+00:00,10.571,202.933333,480.0,40.0,350.0,12,18.0,34.0,26.0,0.0


## Cross-validation: split data into training and test sets