In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split

In [16]:
df = pd.read_csv("cleanData/cleanChargingDataFewNull.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,id,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,siteID,spaceID,stationID,userID_x,parkDuration,WhPerMile,kWhRequested,milesRequested,minutesAvailable,requestedDeparture,temperature,cloud_cover,precipitation
0,1149,52872,2018-10-08 14:44:47+00:00,2018-10-08 16:59:14+00:00,2018-10-08 16:56:20+00:00,10.208,1,AG-1F01,1-1-193-825,,0 days 02:14:27,,,,,,17.28,31.94,0.0
1,1150,52887,2018-10-09 05:08:55+00:00,2018-10-09 14:03:56+00:00,2018-10-09 09:33:24+00:00,11.837,1,AG-1F01,1-1-193-825,,0 days 08:55:01,,,,,,18.26,28.74,0.0
2,1151,52943,2018-10-09 14:26:40+00:00,2018-10-09 18:48:12+00:00,2018-10-09 18:28:29+00:00,12.034,1,AG-1F01,1-1-193-825,383.0,0 days 04:21:32,400.0,15.2,38.0,264.0,"Wed, 10 Oct 2018 02:50:40 GMT",18.26,28.74,0.0
3,1152,52959,2018-10-10 05:15:02+00:00,2018-10-10 14:30:44+00:00,2018-10-10 09:56:11+00:00,11.985,1,AG-1F01,1-1-193-825,,0 days 09:15:42,,,,,,17.14,29.36,0.0
4,1153,53016,2018-10-10 14:52:56+00:00,2018-10-10 16:34:00+00:00,2018-10-10 16:11:04+00:00,7.863,1,AG-1F01,1-1-193-825,,0 days 01:41:04,,,,,,17.14,29.36,0.0


We dont need the columns Unnamed or id. We also need to transform the columns connectionTime, disconnectTime, doneCHargingTime, stationID, parkDuration and requestedDeparture; so our model can work with those features.

In [17]:
df = df.drop("Unnamed: 0", axis=1)
df = df.drop("id", axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54913 entries, 0 to 54912
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   connectionTime      54913 non-null  object 
 1   disconnectTime      54913 non-null  object 
 2   doneChargingTime    54913 non-null  object 
 3   kWhDelivered        54913 non-null  float64
 4   siteID              54913 non-null  int64  
 5   spaceID             54913 non-null  object 
 6   stationID           54913 non-null  object 
 7   userID_x            38975 non-null  float64
 8   parkDuration        54913 non-null  object 
 9   WhPerMile           38975 non-null  float64
 10  kWhRequested        38975 non-null  float64
 11  milesRequested      38975 non-null  float64
 12  minutesAvailable    38975 non-null  float64
 13  requestedDeparture  38975 non-null  object 
 14  temperature         54913 non-null  float64
 15  cloud_cover         54913 non-null  float64
 16  prec

We start by transforming the time columns connectionTime, disconnectTime, doneChargingTime, parkDuration and requestedDeparture to datetime/timedelta and then to floats/ints.

In [18]:
df["connectionTime"] = pd.to_datetime(df["connectionTime"])
df["disconnectTime"] = pd.to_datetime(df["disconnectTime"])
df["doneChargingTime"] = pd.to_datetime(df["doneChargingTime"])
df["parkDuration"] = pd.to_timedelta(df["parkDuration"])
df["requestedDeparture"] = pd.to_datetime(df["requestedDeparture"])

We are creating a new  feature called Weekday, where Monday = 0 and Sunday = 6. It will be the weekday of the connectionTime.


Similarly we are creating a new feature month, where January = 1, Feburary = 2 and so on.

In [19]:
df["weekday"] = df["connectionTime"].dt.weekday
df["month"] = df["connectionTime"].dt.month
df.head()

Unnamed: 0,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,siteID,spaceID,stationID,userID_x,parkDuration,WhPerMile,kWhRequested,milesRequested,minutesAvailable,requestedDeparture,temperature,cloud_cover,precipitation,weekday,month
0,2018-10-08 14:44:47+00:00,2018-10-08 16:59:14+00:00,2018-10-08 16:56:20+00:00,10.208,1,AG-1F01,1-1-193-825,,0 days 02:14:27,,,,,NaT,17.28,31.94,0.0,0,10
1,2018-10-09 05:08:55+00:00,2018-10-09 14:03:56+00:00,2018-10-09 09:33:24+00:00,11.837,1,AG-1F01,1-1-193-825,,0 days 08:55:01,,,,,NaT,18.26,28.74,0.0,1,10
2,2018-10-09 14:26:40+00:00,2018-10-09 18:48:12+00:00,2018-10-09 18:28:29+00:00,12.034,1,AG-1F01,1-1-193-825,383.0,0 days 04:21:32,400.0,15.2,38.0,264.0,2018-10-10 02:50:40,18.26,28.74,0.0,1,10
3,2018-10-10 05:15:02+00:00,2018-10-10 14:30:44+00:00,2018-10-10 09:56:11+00:00,11.985,1,AG-1F01,1-1-193-825,,0 days 09:15:42,,,,,NaT,17.14,29.36,0.0,2,10
4,2018-10-10 14:52:56+00:00,2018-10-10 16:34:00+00:00,2018-10-10 16:11:04+00:00,7.863,1,AG-1F01,1-1-193-825,,0 days 01:41:04,,,,,NaT,17.14,29.36,0.0,2,10


The datetime columns will be convertet to floats that represent the fraction of the day that has passed (for example 6am would be 0.25)

In [20]:
def get_time_day(args):
  time_min = args.minute
  time_min = time_min+args.hour*60
  time_min = time_min+args.second/60
  time_day = time_min/(60*24)
  return time_day


df["connectionTime"] = df["connectionTime"].apply(lambda x: get_time_day(x))
df["disconnectTime"] = df["disconnectTime"].apply(lambda x: get_time_day(x))
df["doneChargingTime"] = df["doneChargingTime"].apply(lambda x: get_time_day(x))
df["requestedDeparture"] = df["requestedDeparture"].apply(lambda x: get_time_day(x))

df.head()

Unnamed: 0,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,siteID,spaceID,stationID,userID_x,parkDuration,WhPerMile,kWhRequested,milesRequested,minutesAvailable,requestedDeparture,temperature,cloud_cover,precipitation,weekday,month
0,0.614433,0.707801,0.705787,10.208,1,AG-1F01,1-1-193-825,,0 days 02:14:27,,,,,,17.28,31.94,0.0,0,10
1,0.214525,0.586065,0.398194,11.837,1,AG-1F01,1-1-193-825,,0 days 08:55:01,,,,,,18.26,28.74,0.0,1,10
2,0.601852,0.783472,0.76978,12.034,1,AG-1F01,1-1-193-825,383.0,0 days 04:21:32,400.0,15.2,38.0,264.0,0.118519,18.26,28.74,0.0,1,10
3,0.218773,0.604676,0.414016,11.985,1,AG-1F01,1-1-193-825,,0 days 09:15:42,,,,,,17.14,29.36,0.0,2,10
4,0.620093,0.690278,0.674352,7.863,1,AG-1F01,1-1-193-825,,0 days 01:41:04,,,,,,17.14,29.36,0.0,2,10


Now we transform parkDuration into minutes

In [21]:
df["parkDuration"] = df["parkDuration"].dt.total_seconds()
df.head()

Unnamed: 0,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,siteID,spaceID,stationID,userID_x,parkDuration,WhPerMile,kWhRequested,milesRequested,minutesAvailable,requestedDeparture,temperature,cloud_cover,precipitation,weekday,month
0,0.614433,0.707801,0.705787,10.208,1,AG-1F01,1-1-193-825,,8067.0,,,,,,17.28,31.94,0.0,0,10
1,0.214525,0.586065,0.398194,11.837,1,AG-1F01,1-1-193-825,,32101.0,,,,,,18.26,28.74,0.0,1,10
2,0.601852,0.783472,0.76978,12.034,1,AG-1F01,1-1-193-825,383.0,15692.0,400.0,15.2,38.0,264.0,0.118519,18.26,28.74,0.0,1,10
3,0.218773,0.604676,0.414016,11.985,1,AG-1F01,1-1-193-825,,33342.0,,,,,,17.14,29.36,0.0,2,10
4,0.620093,0.690278,0.674352,7.863,1,AG-1F01,1-1-193-825,,6064.0,,,,,,17.14,29.36,0.0,2,10


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54913 entries, 0 to 54912
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   connectionTime      54913 non-null  float64
 1   disconnectTime      54913 non-null  float64
 2   doneChargingTime    54913 non-null  float64
 3   kWhDelivered        54913 non-null  float64
 4   siteID              54913 non-null  int64  
 5   spaceID             54913 non-null  object 
 6   stationID           54913 non-null  object 
 7   userID_x            38975 non-null  float64
 8   parkDuration        54913 non-null  float64
 9   WhPerMile           38975 non-null  float64
 10  kWhRequested        38975 non-null  float64
 11  milesRequested      38975 non-null  float64
 12  minutesAvailable    38975 non-null  float64
 13  requestedDeparture  38975 non-null  float64
 14  temperature         54913 non-null  float64
 15  cloud_cover         54913 non-null  float64
 16  prec

Lastly we need to convert spaceID and stationID into integer

We will create integers for each spaceID and stationID and use them as categorical data.



In [23]:
#Count all unique values, then map each to an integer, starting at 1 and then mapping this into the df
spaceIDs = df["spaceID"].unique()
spaceIDMap = {value: idx + 1 for idx, value in enumerate(spaceIDs)}
df["spaceID"] = df["spaceID"].map(spaceIDMap)
stationIDs = df["stationID"].unique()
stationIDMap = {value: idx + 1 for idx, value in enumerate(stationIDs)}
df["stationID"] = df["stationID"].map(stationIDMap)
df.head()

Unnamed: 0,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,siteID,spaceID,stationID,userID_x,parkDuration,WhPerMile,kWhRequested,milesRequested,minutesAvailable,requestedDeparture,temperature,cloud_cover,precipitation,weekday,month
0,0.614433,0.707801,0.705787,10.208,1,1,1,,8067.0,,,,,,17.28,31.94,0.0,0,10
1,0.214525,0.586065,0.398194,11.837,1,1,1,,32101.0,,,,,,18.26,28.74,0.0,1,10
2,0.601852,0.783472,0.76978,12.034,1,1,1,383.0,15692.0,400.0,15.2,38.0,264.0,0.118519,18.26,28.74,0.0,1,10
3,0.218773,0.604676,0.414016,11.985,1,1,1,,33342.0,,,,,,17.14,29.36,0.0,2,10
4,0.620093,0.690278,0.674352,7.863,1,1,1,,6064.0,,,,,,17.14,29.36,0.0,2,10


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54913 entries, 0 to 54912
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   connectionTime      54913 non-null  float64
 1   disconnectTime      54913 non-null  float64
 2   doneChargingTime    54913 non-null  float64
 3   kWhDelivered        54913 non-null  float64
 4   siteID              54913 non-null  int64  
 5   spaceID             54913 non-null  int64  
 6   stationID           54913 non-null  int64  
 7   userID_x            38975 non-null  float64
 8   parkDuration        54913 non-null  float64
 9   WhPerMile           38975 non-null  float64
 10  kWhRequested        38975 non-null  float64
 11  milesRequested      38975 non-null  float64
 12  minutesAvailable    38975 non-null  float64
 13  requestedDeparture  38975 non-null  float64
 14  temperature         54913 non-null  float64
 15  cloud_cover         54913 non-null  float64
 16  prec

We only have NaN values in the columns that we extracted from the User Input, which makes sense. Now we split the dataset into two, depending on the site

In [25]:
SiteOne = df[df['siteID'] == 1]
SiteTwo = df[df['siteID'] == 2]

extracting the feature we want to predict and then we split into train, validation and test set. For now only one for transparency reasons

In [None]:
# define x and Y
X = SiteOne.iloc[:,1:19] # include full feature vector
y = SiteOne["hourlyUtil"]


In [None]:
# Conduct train test split
from sklearn.model_selection import train_test_split

X_train, predictors_test, y_train, target_test = train_test_split(X, y, test_size=0.3, random_state=42)

# now split X_train to achive 50-20-30 split
predictors_train, predictors_hold, target_train, target_hold = train_test_split(X_train, y_train, test_size=(0.2/0.7),random_state=34)

ToDO

create feature isWeekDay (need to extract before transforming the time data columns) DONE

transform parkDuration into minutes DONE


                  
create 2 dataframes, one for each site (makes sense economically and seeing the descriptive plots will very likely improve the predictive power) DONE

Rescale the data

perform feature selection (based on the impact of each feature)

split data into train/validation/test sets

throw the training data into the NN and see how it performs

adapt hyperparameters

evaluate model