In [23]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime

In [24]:
df = pd.read_csv("cleanData/cleanChargingDataFull.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,id,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,siteID,spaceID,stationID,userID_x,parkDuration,WhPerMile,kWhRequested,milesRequested,minutesAvailable,paymentRequired,requestedDeparture,temperature,cloud_cover,precipitation
0,0,51323,2020-11-18 15:36:26+00:00,2020-11-18 16:02:37+00:00,,4.816,2,11900388,2-39-81-4550,7132.0,0 days 00:26:11,274.0,8.22,30.0,480.0,True,"Thu, 19 Nov 2020 07:36:26 GMT",13.15,27.46,0.0
1,1,51324,2020-11-18 16:35:54+00:00,2020-11-18 17:31:08+00:00,,10.027,2,11900388,2-39-81-4550,4903.0,0 days 00:55:14,258.0,51.6,200.0,576.0,True,"Thu, 19 Nov 2020 10:11:54 GMT",13.15,27.46,0.0
2,2,51325,2020-11-18 17:34:02+00:00,2020-11-18 18:45:14+00:00,,24.486,2,11900388,2-39-81-4550,4903.0,0 days 01:11:12,258.0,51.6,200.0,576.0,True,"Thu, 19 Nov 2020 11:10:02 GMT",13.15,27.46,0.0
3,3,51328,2020-11-18 19:52:00+00:00,2020-11-18 20:00:50+00:00,,4.788,2,11900388,2-39-81-4550,1085.0,0 days 00:08:50,283.0,56.6,200.0,589.0,True,"Thu, 19 Nov 2020 13:41:00 GMT",13.15,27.46,0.0
4,4,51329,2020-11-18 20:24:11+00:00,2020-11-18 21:07:15+00:00,,30.849,2,11900388,2-39-81-4550,9284.0,0 days 00:43:04,400.0,40.0,100.0,30.0,True,"Thu, 19 Nov 2020 04:54:11 GMT",13.15,27.46,0.0


We dont need the columns Unnamed or id. We also need to transform the columns connectionTime, disconnectTime, doneCHargingTime, stationID, parkDuration and requestedDeparture; so our model can work with those features.

In [25]:
df = df.drop("Unnamed: 0", axis=1)
df = df.drop("id", axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66450 entries, 0 to 66449
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   connectionTime      66450 non-null  object 
 1   disconnectTime      66450 non-null  object 
 2   doneChargingTime    62362 non-null  object 
 3   kWhDelivered        66450 non-null  float64
 4   siteID              66450 non-null  int64  
 5   spaceID             66450 non-null  object 
 6   stationID           66450 non-null  object 
 7   userID_x            49187 non-null  float64
 8   parkDuration        66450 non-null  object 
 9   WhPerMile           49187 non-null  float64
 10  kWhRequested        49187 non-null  float64
 11  milesRequested      49187 non-null  float64
 12  minutesAvailable    49187 non-null  float64
 13  paymentRequired     49187 non-null  object 
 14  requestedDeparture  49187 non-null  object 
 15  temperature         57547 non-null  float64
 16  clou

We start by transforming the time columns connectionTime, disconnectTime, doneChargingTime, parkDuration and requestedDeparture to datetime/timedelta and then to floats/ints.

In [26]:
df["connectionTime"] = pd.to_datetime(df["connectionTime"])
df["disconnectTime"] = pd.to_datetime(df["disconnectTime"])
df["doneChargingTime"] = pd.to_datetime(df["doneChargingTime"])
df["parkDuration"] = pd.to_timedelta(df["parkDuration"])
df["requestedDeparture"] = pd.to_datetime(df["requestedDeparture"])

We are creating a new  feature called Weekday, where Monday is 0 and Sunday is 6. It will be the weekday of the connectionTime. How we work with it in the NN will be done later

In [27]:
df["weekday"] = df["connectionTime"].dt.weekday
df.head()

Unnamed: 0,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,siteID,spaceID,stationID,userID_x,parkDuration,WhPerMile,kWhRequested,milesRequested,minutesAvailable,paymentRequired,requestedDeparture,temperature,cloud_cover,precipitation,weekday
0,2020-11-18 15:36:26+00:00,2020-11-18 16:02:37+00:00,NaT,4.816,2,11900388,2-39-81-4550,7132.0,0 days 00:26:11,274.0,8.22,30.0,480.0,True,2020-11-19 07:36:26+00:00,13.15,27.46,0.0,2
1,2020-11-18 16:35:54+00:00,2020-11-18 17:31:08+00:00,NaT,10.027,2,11900388,2-39-81-4550,4903.0,0 days 00:55:14,258.0,51.6,200.0,576.0,True,2020-11-19 10:11:54+00:00,13.15,27.46,0.0,2
2,2020-11-18 17:34:02+00:00,2020-11-18 18:45:14+00:00,NaT,24.486,2,11900388,2-39-81-4550,4903.0,0 days 01:11:12,258.0,51.6,200.0,576.0,True,2020-11-19 11:10:02+00:00,13.15,27.46,0.0,2
3,2020-11-18 19:52:00+00:00,2020-11-18 20:00:50+00:00,NaT,4.788,2,11900388,2-39-81-4550,1085.0,0 days 00:08:50,283.0,56.6,200.0,589.0,True,2020-11-19 13:41:00+00:00,13.15,27.46,0.0,2
4,2020-11-18 20:24:11+00:00,2020-11-18 21:07:15+00:00,NaT,30.849,2,11900388,2-39-81-4550,9284.0,0 days 00:43:04,400.0,40.0,100.0,30.0,True,2020-11-19 04:54:11+00:00,13.15,27.46,0.0,2


The datetime columns will be convertet to floats that represent the fraction of the day that has passed (for example 6am would be 0.25)

In [28]:
def get_time_day(args):
  time_min = args.minute
  time_min = time_min+args.hour*60
  time_min = time_min+args.second/60
  time_day = time_min/(60*24)
  return time_day


df["connectionTime"] = df["connectionTime"].apply(lambda x: get_time_day(x))
df["disconnectTime"] = df["disconnectTime"].apply(lambda x: get_time_day(x))
df["doneChargingTime"] = df["doneChargingTime"].apply(lambda x: get_time_day(x))
df["requestedDeparture"] = df["requestedDeparture"].apply(lambda x: get_time_day(x))

df.head()

Unnamed: 0,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,siteID,spaceID,stationID,userID_x,parkDuration,WhPerMile,kWhRequested,milesRequested,minutesAvailable,paymentRequired,requestedDeparture,temperature,cloud_cover,precipitation,weekday
0,0.650301,0.668484,,4.816,2,11900388,2-39-81-4550,7132.0,0 days 00:26:11,274.0,8.22,30.0,480.0,True,0.316968,13.15,27.46,0.0,2
1,0.691597,0.729954,,10.027,2,11900388,2-39-81-4550,4903.0,0 days 00:55:14,258.0,51.6,200.0,576.0,True,0.424931,13.15,27.46,0.0,2
2,0.731968,0.781412,,24.486,2,11900388,2-39-81-4550,4903.0,0 days 01:11:12,258.0,51.6,200.0,576.0,True,0.465301,13.15,27.46,0.0,2
3,0.827778,0.833912,,4.788,2,11900388,2-39-81-4550,1085.0,0 days 00:08:50,283.0,56.6,200.0,589.0,True,0.570139,13.15,27.46,0.0,2
4,0.850127,0.880035,,30.849,2,11900388,2-39-81-4550,9284.0,0 days 00:43:04,400.0,40.0,100.0,30.0,True,0.204294,13.15,27.46,0.0,2


Now we transform parkDuration into minutes

In [29]:
df["parkDuration"] = df["parkDuration"].dt.total_seconds()
df.head()

Unnamed: 0,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,siteID,spaceID,stationID,userID_x,parkDuration,WhPerMile,kWhRequested,milesRequested,minutesAvailable,paymentRequired,requestedDeparture,temperature,cloud_cover,precipitation,weekday
0,0.650301,0.668484,,4.816,2,11900388,2-39-81-4550,7132.0,1571.0,274.0,8.22,30.0,480.0,True,0.316968,13.15,27.46,0.0,2
1,0.691597,0.729954,,10.027,2,11900388,2-39-81-4550,4903.0,3314.0,258.0,51.6,200.0,576.0,True,0.424931,13.15,27.46,0.0,2
2,0.731968,0.781412,,24.486,2,11900388,2-39-81-4550,4903.0,4272.0,258.0,51.6,200.0,576.0,True,0.465301,13.15,27.46,0.0,2
3,0.827778,0.833912,,4.788,2,11900388,2-39-81-4550,1085.0,530.0,283.0,56.6,200.0,589.0,True,0.570139,13.15,27.46,0.0,2
4,0.850127,0.880035,,30.849,2,11900388,2-39-81-4550,9284.0,2584.0,400.0,40.0,100.0,30.0,True,0.204294,13.15,27.46,0.0,2


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66450 entries, 0 to 66449
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   connectionTime      66450 non-null  float64
 1   disconnectTime      66450 non-null  float64
 2   doneChargingTime    62362 non-null  float64
 3   kWhDelivered        66450 non-null  float64
 4   siteID              66450 non-null  int64  
 5   spaceID             66450 non-null  object 
 6   stationID           66450 non-null  object 
 7   userID_x            49187 non-null  float64
 8   parkDuration        66450 non-null  float64
 9   WhPerMile           49187 non-null  float64
 10  kWhRequested        49187 non-null  float64
 11  milesRequested      49187 non-null  float64
 12  minutesAvailable    49187 non-null  float64
 13  paymentRequired     49187 non-null  object 
 14  requestedDeparture  49187 non-null  float64
 15  temperature         57547 non-null  float64
 16  clou

Lastly we need to convert spaceID and stationID into integer

We will create integers for each spaceID and stationID and use them as categorical data.



In [37]:
#Count all unique values, then map each to an integer, starting at 1 and then mapping this into the df
spaceIDs = df["spaceID"].unique()
spaceIDMap = {value: idx + 1 for idx, value in enumerate(spaceIDs)}
df["spaceID"] = df["spaceID"].map(spaceIDMap)
stationIDs = df["stationID"].unique()
stationIDMap = {value: idx + 1 for idx, value in enumerate(stationIDs)}
df["stationID"] = df["stationID"].map(stationIDMap)
df.head()

Unnamed: 0,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,siteID,spaceID,stationID,userID_x,parkDuration,WhPerMile,kWhRequested,milesRequested,minutesAvailable,paymentRequired,requestedDeparture,temperature,cloud_cover,precipitation,weekday
0,0.650301,0.668484,,4.816,2,1,1,7132.0,1571.0,274.0,8.22,30.0,480.0,True,0.316968,13.15,27.46,0.0,2
1,0.691597,0.729954,,10.027,2,1,1,4903.0,3314.0,258.0,51.6,200.0,576.0,True,0.424931,13.15,27.46,0.0,2
2,0.731968,0.781412,,24.486,2,1,1,4903.0,4272.0,258.0,51.6,200.0,576.0,True,0.465301,13.15,27.46,0.0,2
3,0.827778,0.833912,,4.788,2,1,1,1085.0,530.0,283.0,56.6,200.0,589.0,True,0.570139,13.15,27.46,0.0,2
4,0.850127,0.880035,,30.849,2,1,1,9284.0,2584.0,400.0,40.0,100.0,30.0,True,0.204294,13.15,27.46,0.0,2


In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66450 entries, 0 to 66449
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   connectionTime      66450 non-null  float64
 1   disconnectTime      66450 non-null  float64
 2   doneChargingTime    62362 non-null  float64
 3   kWhDelivered        66450 non-null  float64
 4   siteID              66450 non-null  int64  
 5   spaceID             66450 non-null  int64  
 6   stationID           66450 non-null  int64  
 7   userID_x            49187 non-null  float64
 8   parkDuration        66450 non-null  float64
 9   WhPerMile           49187 non-null  float64
 10  kWhRequested        49187 non-null  float64
 11  milesRequested      49187 non-null  float64
 12  minutesAvailable    49187 non-null  float64
 13  paymentRequired     0 non-null      float64
 14  requestedDeparture  49187 non-null  float64
 15  temperature         57547 non-null  float64
 16  clou

ToDO

create feature isWeekDay (need to extract before transforming the time data columns) DONE

transform parkDuration into minutes DONE


Rescale the data                        
create 2 dataframes, one for each site (makes sense economically and seeing the descriptive plots will very likely improve the predictive power)

perform feature selection (based on the impact of each feature)

split data into train/validation/test sets

throw the training data into the NN and see how it performs

adapt hyperparameters

evaluate model