# Utilization Prediction: Prediction of hourly utilization of the two sites

## Import libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import os
os.environ["KERAS_BACKEND"] = "torch"

import keras
from keras import Sequential
from keras.layers import Dense

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

%matplotlib inline

## Load and prepare data

In [2]:
# Load data
charging_data = pd.read_pickle("data/charging_modified.pkl")
weather_data = pd.read_csv("data/weather_modified.csv")

# TODO: more filtering of features
# drop not important columns in weather data
weather_data.drop(columns = ["city", "cloud_cover_description", "pressure", "felt_temperature"], inplace = True)
weather_data["timestamp"] = pd.to_datetime(weather_data["timestamp"], utc = True).dt.floor(freq = "H")

# drop not important columns in charging data
charging_data.drop(columns = ["kWhPerMinute", "NoChargingTime", "user_kWhRequested", "user_modifiedAt", "doneChargingTime", "NoChargingTimeMinutes", "spaceID", "stationID", 
                              "user_userID", "ChargingTime", "user_paymentRequired_values"], inplace = True)

# Delete rows with null values
charging_data = charging_data.dropna()

# delete duplicate row index
charging_data.reset_index(drop = True, inplace = True)

charging_data = charging_data.astype({
    "kWhDelivered": "int64",
    "ChargingTimeMinutes": "int64",
    "user_minutesAvailable": "int64",
    "user_milesRequested": "int64",
    "user_WhPerMile": "int64",
    "siteID": "int64"
})

weather_data.astype({
    "temperature": "float",
    "cloud_cover": "float",
    "windspeed": "float",
    "precipitation": "float"
})

# check user_paymentRequired_values again

Unnamed: 0,timestamp,temperature,cloud_cover,windspeed,precipitation
0,2018-01-01 08:00:00+00:00,9.0,33.0,9.0,0.0
1,2018-01-01 09:00:00+00:00,9.0,33.0,0.0,0.0
2,2018-01-01 10:00:00+00:00,9.0,21.0,0.0,0.0
3,2018-01-01 11:00:00+00:00,9.0,29.0,0.0,0.0
4,2018-01-01 12:00:00+00:00,8.0,33.0,0.0,0.0
...,...,...,...,...,...
29239,2021-01-01 03:00:00+00:00,13.0,33.0,0.0,0.0
29240,2021-01-01 04:00:00+00:00,12.0,33.0,11.0,0.0
29241,2021-01-01 05:00:00+00:00,12.0,33.0,9.0,0.0
29242,2021-01-01 06:00:00+00:00,11.0,33.0,13.0,0.0


## Create feature vector X and labels Y

In [3]:
# split time interval between connectionTime and disconnectTime in minutes for every hour
def split_time_interval(index, start_date, end_date):
    indices = pd.DatetimeIndex([start_date])
    indices = indices.append(pd.date_range(start_date.ceil(freq = "H"), end_date.floor(freq = "H"), freq = "H"))
    indices = indices.append(pd.DatetimeIndex([end_date]))

    time_list = list(map(lambda x : 60 if x == 0 else x, indices.minute))
    time_list[0] = 60 - time_list[0]
    
    return list((time_list, indices))


# creates table with session id, charging time in minutes per hour and timestamps
def create_minute_table(id, minute_array, siteID):
    return list(zip([id] * len(minute_array[0]), minute_array[0], minute_array[1], [siteID] * len(minute_array[0])))

# groups entires by timestamp
def group_by_timestamp(X):
    X = X.groupby("timestamp").agg({"kWhDelivered": "sum",
                                     "ChargingTimeMinutes": "sum",
                                     "siteID": "mean",
                                     "month": "mean",
                                     "user_minutesAvailable": "sum",
                                     "user_milesRequested": "sum",
                                     "user_WhPerMile": "mean"
                               })
    return X

In [4]:
# concat single minute lists to one big list with lists of tuples containing id, charging times in minutes per hour and timestamps
temp = []

for index in charging_data.index :
    charging_times = split_time_interval(charging_data.index[index], 
                                         charging_data.loc[index, "connectionTime"], 
                                         charging_data.loc[index, "disconnectTime"])
    temp.append(create_minute_table(charging_data.loc[index, "id"], charging_times, charging_data.loc[index, "siteID"]))
    
charging_data.drop("siteID", inplace = True, axis = 1)

# parse list of lists of triples into list of three lists: ids, charging time in minutes per hour and timestamps
ids, charging_times, timestamps, siteIDs = map(list, zip(*[tuple for list in temp for tuple in list]))

# create data.frame with four columns: session id, charging time, starting timestamp in minutes for every hour and siteID
# TODO: id kinda useless
charging_time_per_hour = pd.DataFrame(
                            {"id": ids,
                             "charging_time_per_hour": charging_times,
                             "timestamp": timestamps,
                             "siteID": siteIDs})
charging_time_per_hour.loc[:, "timestamp"] = charging_time_per_hour.loc[:, "timestamp"].dt.floor(freq = "H")

### Labels Y:

In [5]:
# roughly 50 EV charging stations per site * 60 minutes
maximum_utilization = 50 * 60

# divide data set into two sets: one for each site
Y1 = charging_time_per_hour.loc[charging_time_per_hour["siteID"] == 1]
Y2 = charging_time_per_hour.loc[charging_time_per_hour["siteID"] == 2]

print(len(Y1))
print(len(Y2))

Y1 = pd.Series(map(lambda x : x / maximum_utilization, Y1.groupby("timestamp").sum().loc[:, "charging_time_per_hour"]))
Y2 = pd.Series(map(lambda x : x / maximum_utilization, Y2.groupby("timestamp").sum().loc[:, "charging_time_per_hour"]))

Y = pd.concat([Y1, Y2], axis = 0)
Y

281798
118826


0        0.021000
1        0.174667
2        0.300000
3        0.300000
4        0.300000
           ...   
16250    0.271000
16251    0.267000
16252    0.251333
16253    0.209667
16254    0.027333
Length: 31201, dtype: float64

### Feature Vector X:

In [6]:
# join charge time per hour with the rest of the charging data
X_temp = charging_time_per_hour.merge(charging_data, how = "left", on = "id").drop(["connectionTime", "disconnectTime", "user_requestedDeparture", "id", "userID", "charging_time_per_hour"], axis = 1)

# divide data set into two sets: one for each site
X1 = X_temp.loc[X_temp["siteID"] == 1]
X2 = X_temp.loc[X_temp["siteID"] == 2]

# group columns by timestamps
X1 = group_by_timestamp(X1)
X2 = group_by_timestamp(X2)

# combine both subsets together into one
X = pd.concat([X1, X2], axis = 0)
X = X.astype({"month": "int64",
              "siteID": "int64"})

# extract months from timestamp
X["month"] = list(map(lambda x: x.month, X.index))

# add weather data to feature vector
X = X.merge(weather_data, how = "left", on = "timestamp").drop_duplicates(["timestamp", "siteID"]).fillna(0)
X = X.drop("timestamp", axis = 1)
X = pd.get_dummies(X, columns = ["siteID"])
X

Unnamed: 0,kWhDelivered,ChargingTimeMinutes,month,user_minutesAvailable,user_milesRequested,user_WhPerMile,temperature,cloud_cover,windspeed,precipitation,siteID_1,siteID_2
0,53,1860,10,1226,289,319.000000,13.0,34.0,0.0,0.0,True,False
1,247,8734,10,6806,1074,359.800000,14.0,28.0,0.0,0.0,True,False
4,247,8734,10,6806,1074,359.800000,17.0,30.0,0.0,0.0,True,False
5,247,8734,10,6806,1074,359.800000,18.0,34.0,6.0,0.0,True,False
6,247,8734,10,6806,1074,359.800000,20.0,34.0,7.0,0.0,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
34107,258,3166,9,5759,1066,318.071429,0.0,0.0,0.0,0.0,False,True
34108,303,3582,9,5975,1156,328.200000,0.0,0.0,0.0,0.0,False,True
34109,337,4023,9,5603,1177,320.600000,0.0,0.0,0.0,0.0,False,True
34110,158,2271,9,3766,880,331.846154,0.0,0.0,0.0,0.0,False,True


## Cross-validation: split data in training and test set

In [7]:
# Split data into training set and testing set nad normalize
scaler = StandardScaler()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size= 0.3, random_state = 30)

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
type(Y_train)

pandas.core.series.Series

## Prediction model 1: Neural Network

In [8]:
# Build the neural network
network = Sequential()

# Add layers
network.add(Dense(input_shape = (12,), units = 15, activation = "relu"))
network.add(Dense(units = 15, activation = "relu"))
network.add(Dense(units = 1, activation = "sigmoid"))

In [9]:
# Compiling the neural network
network.compile(optimizer = "adam", 
                   loss = "mean_squared_error", 
                   metrics = ["mean_squared_error"])
network.summary()

In [10]:
# Fitting the Neural Network
network.fit(X_train, Y_train, batch_size=50, epochs=100)

Epoch 1/100
[1m437/437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0620 - mean_squared_error: 0.0620
Epoch 2/100
[1m437/437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0104 - mean_squared_error: 0.0104
Epoch 3/100
[1m437/437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0107 - mean_squared_error: 0.0107
Epoch 4/100
[1m437/437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0093 - mean_squared_error: 0.0093
Epoch 5/100
[1m437/437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0090 - mean_squared_error: 0.0090
Epoch 6/100
[1m437/437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0091 - mean_squared_error: 0.0091
Epoch 7/100
[1m437/437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0084 - mean_squared_error: 0.0084
Epoch 8/100
[1m437/437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/s

<keras.src.callbacks.history.History at 0x27ac457f1c0>

### Performance evaluation

In [11]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_log_error

mse_nn  = mean_squared_error(Y_test, network.predict(X_test))
mae_nn  = mean_absolute_error(Y_test, network.predict(X_test))
msle_nn = mean_squared_log_error(Y_test, network.predict(X_test))
print(mse_nn)
print(mae_nn)
print(msle_nn)

[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
0.008337793199207915
0.032110296527003965
0.0018782860018763071


## Prediction Model 2: Polynomial Regression

In [12]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# create polynomial features for training set
poly_features = PolynomialFeatures(degree = 3, interaction_only = False, include_bias = True)
X_poly_train = poly_features.fit_transform(X_train)

# create polynomial features for testing set
X_poly_test = poly_features.fit_transform(X_test)

In [13]:
# fit linear regression model
reg_model = LinearRegression()
reg_model.fit(X_poly_train, Y_train)

### Performance Evaluation

In [14]:
# Evaluate performance
mse_pr    = mean_squared_error(Y_test, reg_model.predict(X_poly_test))
mae_pr    = mean_absolute_error(Y_test, reg_model.predict(X_poly_test))
print(mse_pr)
print(mae_pr)

0.002598256424792001
0.02916174905241051
