In [10]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime

In [36]:
df = pd.read_csv("cleanData/cleanChargingDataFewNull.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,id,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,siteID,spaceID,stationID,userID_x,parkDuration,WhPerMile,kWhRequested,milesRequested,minutesAvailable,requestedDeparture,temperature,cloud_cover,precipitation
0,1149,52872,2018-10-08 15:44:47-07:00,2018-10-08 17:59:14-07:00,2018-10-08 17:56:20-07:00,10.208,1,AG-1F01,1-1-193-825,,0 days 02:14:27,,,,,,17.28,31.94,0.0
1,1150,52887,2018-10-09 06:08:55-07:00,2018-10-09 15:03:56-07:00,2018-10-09 10:33:24-07:00,11.837,1,AG-1F01,1-1-193-825,,0 days 08:55:01,,,,,,18.26,28.74,0.0
2,1151,52943,2018-10-09 15:26:40-07:00,2018-10-09 19:48:12-07:00,2018-10-09 19:28:29-07:00,12.034,1,AG-1F01,1-1-193-825,383.0,0 days 04:21:32,400.0,15.2,38.0,264.0,"Wed, 10 Oct 2018 02:50:40 GMT",18.26,28.74,0.0
3,1152,52959,2018-10-10 06:15:02-07:00,2018-10-10 15:30:44-07:00,2018-10-10 10:56:11-07:00,11.985,1,AG-1F01,1-1-193-825,,0 days 09:15:42,,,,,,17.14,29.36,0.0
4,1153,53016,2018-10-10 15:52:56-07:00,2018-10-10 17:34:00-07:00,2018-10-10 17:11:04-07:00,7.863,1,AG-1F01,1-1-193-825,,0 days 01:41:04,,,,,,17.14,29.36,0.0


We dont need the columns Unnamed or id. We also need to transform the columns connectionTime, disconnectTime, doneCHargingTime, stationID, parkDuration and requestedDeparture; so our model can work with those features.

In [37]:
df = df.drop("Unnamed: 0", axis=1)
df = df.drop("id", axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53474 entries, 0 to 53473
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   connectionTime      53474 non-null  object 
 1   disconnectTime      53474 non-null  object 
 2   doneChargingTime    53474 non-null  object 
 3   kWhDelivered        53474 non-null  float64
 4   siteID              53474 non-null  int64  
 5   spaceID             53474 non-null  object 
 6   stationID           53474 non-null  object 
 7   userID_x            37605 non-null  float64
 8   parkDuration        53474 non-null  object 
 9   WhPerMile           37605 non-null  float64
 10  kWhRequested        37605 non-null  float64
 11  milesRequested      37605 non-null  float64
 12  minutesAvailable    37605 non-null  float64
 13  requestedDeparture  37605 non-null  object 
 14  temperature         53474 non-null  float64
 15  cloud_cover         53474 non-null  float64
 16  prec

We start by transforming the time columns connectionTime, disconnectTime, doneChargingTime, parkDuration and requestedDeparture to datetime/timedelta and then to floats/ints.

In [38]:
df["connectionTime"] = pd.to_datetime(df["connectionTime"], utc=True)
df["disconnectTime"] = pd.to_datetime(df["disconnectTime"], utc=True)
df["doneChargingTime"] = pd.to_datetime(df["doneChargingTime"], utc=True)
df["parkDuration"] = pd.to_timedelta(df["parkDuration"])
df["requestedDeparture"] = pd.to_datetime(df["requestedDeparture"])
dest_timezone = "America/Los_Angeles"
date_columns = ["connectionTime", "disconnectTime", "doneChargingTime"]

for date_column in date_columns:
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')
    df[date_column] = df[date_column].dt.tz_convert(dest_timezone)

We are creating a new  feature called Weekday, where Monday = 0 and Sunday = 6. It will be the weekday of the connectionTime.


Similarly we are creating a new feature month, where January = 1, Feburary = 2 and so on.

In [39]:
df["weekday"] = df["connectionTime"].dt.weekday
df["month"] = df["connectionTime"].dt.month
df["year"] = df["connectionTime"].dt.year
df.head()

Unnamed: 0,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,siteID,spaceID,stationID,userID_x,parkDuration,WhPerMile,kWhRequested,milesRequested,minutesAvailable,requestedDeparture,temperature,cloud_cover,precipitation,weekday,month,year
0,2018-10-08 15:44:47-07:00,2018-10-08 17:59:14-07:00,2018-10-08 17:56:20-07:00,10.208,1,AG-1F01,1-1-193-825,,0 days 02:14:27,,,,,NaT,17.28,31.94,0.0,0,10,2018
1,2018-10-09 06:08:55-07:00,2018-10-09 15:03:56-07:00,2018-10-09 10:33:24-07:00,11.837,1,AG-1F01,1-1-193-825,,0 days 08:55:01,,,,,NaT,18.26,28.74,0.0,1,10,2018
2,2018-10-09 15:26:40-07:00,2018-10-09 19:48:12-07:00,2018-10-09 19:28:29-07:00,12.034,1,AG-1F01,1-1-193-825,383.0,0 days 04:21:32,400.0,15.2,38.0,264.0,2018-10-10 02:50:40,18.26,28.74,0.0,1,10,2018
3,2018-10-10 06:15:02-07:00,2018-10-10 15:30:44-07:00,2018-10-10 10:56:11-07:00,11.985,1,AG-1F01,1-1-193-825,,0 days 09:15:42,,,,,NaT,17.14,29.36,0.0,2,10,2018
4,2018-10-10 15:52:56-07:00,2018-10-10 17:34:00-07:00,2018-10-10 17:11:04-07:00,7.863,1,AG-1F01,1-1-193-825,,0 days 01:41:04,,,,,NaT,17.14,29.36,0.0,2,10,2018


The datetime columns will be convertet to floats that represent the fraction of the day that has passed (for example 6am would be 0.25)

In [40]:
def get_time_day(args):
  time_min = args.minute
  time_min = time_min+args.hour*60
  time_min = time_min+args.second/60
  time_day = time_min/(60*24)
  return time_day


df["connectionTime"] = df["connectionTime"].apply(lambda x: get_time_day(x))
df["disconnectTime"] = df["disconnectTime"].apply(lambda x: get_time_day(x))
df["doneChargingTime"] = df["doneChargingTime"].apply(lambda x: get_time_day(x))
df["requestedDeparture"] = df["requestedDeparture"].apply(lambda x: get_time_day(x))

df.head()

Unnamed: 0,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,siteID,spaceID,stationID,userID_x,parkDuration,WhPerMile,kWhRequested,milesRequested,minutesAvailable,requestedDeparture,temperature,cloud_cover,precipitation,weekday,month,year
0,0.6561,0.749468,0.747454,10.208,1,AG-1F01,1-1-193-825,,0 days 02:14:27,,,,,,17.28,31.94,0.0,0,10,2018
1,0.256192,0.627731,0.439861,11.837,1,AG-1F01,1-1-193-825,,0 days 08:55:01,,,,,,18.26,28.74,0.0,1,10,2018
2,0.643519,0.825139,0.811447,12.034,1,AG-1F01,1-1-193-825,383.0,0 days 04:21:32,400.0,15.2,38.0,264.0,0.118519,18.26,28.74,0.0,1,10,2018
3,0.26044,0.646343,0.455683,11.985,1,AG-1F01,1-1-193-825,,0 days 09:15:42,,,,,,17.14,29.36,0.0,2,10,2018
4,0.661759,0.731944,0.716019,7.863,1,AG-1F01,1-1-193-825,,0 days 01:41:04,,,,,,17.14,29.36,0.0,2,10,2018


Now we transform parkDuration into minutes

In [41]:
df["parkDuration"] = df["parkDuration"].dt.total_seconds()
df.head()

Unnamed: 0,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,siteID,spaceID,stationID,userID_x,parkDuration,WhPerMile,kWhRequested,milesRequested,minutesAvailable,requestedDeparture,temperature,cloud_cover,precipitation,weekday,month,year
0,0.6561,0.749468,0.747454,10.208,1,AG-1F01,1-1-193-825,,8067.0,,,,,,17.28,31.94,0.0,0,10,2018
1,0.256192,0.627731,0.439861,11.837,1,AG-1F01,1-1-193-825,,32101.0,,,,,,18.26,28.74,0.0,1,10,2018
2,0.643519,0.825139,0.811447,12.034,1,AG-1F01,1-1-193-825,383.0,15692.0,400.0,15.2,38.0,264.0,0.118519,18.26,28.74,0.0,1,10,2018
3,0.26044,0.646343,0.455683,11.985,1,AG-1F01,1-1-193-825,,33342.0,,,,,,17.14,29.36,0.0,2,10,2018
4,0.661759,0.731944,0.716019,7.863,1,AG-1F01,1-1-193-825,,6064.0,,,,,,17.14,29.36,0.0,2,10,2018


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53474 entries, 0 to 53473
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   connectionTime      53474 non-null  float64
 1   disconnectTime      53474 non-null  float64
 2   doneChargingTime    53474 non-null  float64
 3   kWhDelivered        53474 non-null  float64
 4   siteID              53474 non-null  int64  
 5   spaceID             53474 non-null  object 
 6   stationID           53474 non-null  object 
 7   userID_x            37605 non-null  float64
 8   parkDuration        53474 non-null  float64
 9   WhPerMile           37605 non-null  float64
 10  kWhRequested        37605 non-null  float64
 11  milesRequested      37605 non-null  float64
 12  minutesAvailable    37605 non-null  float64
 13  requestedDeparture  37605 non-null  float64
 14  temperature         53474 non-null  float64
 15  cloud_cover         53474 non-null  float64
 16  prec

Lastly we need to convert spaceID and stationID into integer

We will create integers for each spaceID and stationID and use them as categorical data.



In [43]:
#Count all unique values, then map each to an integer, starting at 1 and then mapping this into the df
spaceIDs = df["spaceID"].unique()
spaceIDMap = {value: idx + 1 for idx, value in enumerate(spaceIDs)}
df["spaceID"] = df["spaceID"].map(spaceIDMap)
stationIDs = df["stationID"].unique()
stationIDMap = {value: idx + 1 for idx, value in enumerate(stationIDs)}
df["stationID"] = df["stationID"].map(stationIDMap)
df.head()

Unnamed: 0,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,siteID,spaceID,stationID,userID_x,parkDuration,WhPerMile,kWhRequested,milesRequested,minutesAvailable,requestedDeparture,temperature,cloud_cover,precipitation,weekday,month,year
0,0.6561,0.749468,0.747454,10.208,1,1,1,,8067.0,,,,,,17.28,31.94,0.0,0,10,2018
1,0.256192,0.627731,0.439861,11.837,1,1,1,,32101.0,,,,,,18.26,28.74,0.0,1,10,2018
2,0.643519,0.825139,0.811447,12.034,1,1,1,383.0,15692.0,400.0,15.2,38.0,264.0,0.118519,18.26,28.74,0.0,1,10,2018
3,0.26044,0.646343,0.455683,11.985,1,1,1,,33342.0,,,,,,17.14,29.36,0.0,2,10,2018
4,0.661759,0.731944,0.716019,7.863,1,1,1,,6064.0,,,,,,17.14,29.36,0.0,2,10,2018


In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53474 entries, 0 to 53473
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   connectionTime      53474 non-null  float64
 1   disconnectTime      53474 non-null  float64
 2   doneChargingTime    53474 non-null  float64
 3   kWhDelivered        53474 non-null  float64
 4   siteID              53474 non-null  int64  
 5   spaceID             53474 non-null  int64  
 6   stationID           53474 non-null  int64  
 7   userID_x            37605 non-null  float64
 8   parkDuration        53474 non-null  float64
 9   WhPerMile           37605 non-null  float64
 10  kWhRequested        37605 non-null  float64
 11  milesRequested      37605 non-null  float64
 12  minutesAvailable    37605 non-null  float64
 13  requestedDeparture  37605 non-null  float64
 14  temperature         53474 non-null  float64
 15  cloud_cover         53474 non-null  float64
 16  prec

We only have NaN values in the columns that we extracted from the User Input, which makes sense. Now we split the dataset into two, depending on the site

In [45]:
SiteOne = df[df['siteID'] == 1]
SiteTwo = df[df['siteID'] == 2]

extracting the feature we want to predict and then we split into train, validation and test set. For now only one for transparency reasons

In [92]:
# define x and Y
X = SiteOne.iloc[:,1:19] # include full feature vector
y = SiteOne["hourlyUtil"]


KeyError: 'hourlyUtil'

In [2]:
# Conduct train test split
from sklearn.model_selection import train_test_split

X_train, predictors_test, y_train, target_test = train_test_split(X, y, test_size=0.3, random_state=42)

# now split X_train to achive 50-20-30 split
predictors_train, predictors_hold, target_train, target_hold = train_test_split(X_train, y_train, test_size=(0.2/0.7),random_state=34)

ModuleNotFoundError: No module named 'sklearn'

ToDO

create feature isWeekDay (need to extract before transforming the time data columns) DONE

transform parkDuration into minutes DONE


                  
create 2 dataframes, one for each site (makes sense economically and seeing the descriptive plots will very likely improve the predictive power) DONE

Rescale the data

perform feature selection (based on the impact of each feature)

split data into train/validation/test sets

throw the training data into the NN and see how it performs

adapt hyperparameters

evaluate model

In [3]:
import pandas as pd
from datetime import timedelta
data = pd.read_csv("cleanData/cleanChargingDataNoNull.csv")
data["connectionTime"] = pd.to_datetime(data["connectionTime"])
data["disconnectTime"] = pd.to_datetime(data["disconnectTime"])
data["HourlyUtil"] = data["id"]

  data["connectionTime"] = pd.to_datetime(data["connectionTime"])
  data["disconnectTime"] = pd.to_datetime(data["disconnectTime"])


In [4]:
first_row = data.iloc[0]

In [5]:
# Function to disaggregate sessions by hour
def disaggregate_session(row):
    # Generate a list of hourly intervals
    current = row['connectionTime']
    end = row['disconnectTime']
    rows = []
    while current < end:
        # Ensure that intervals stay within the session's bounds
        if current == current.ceil("h"):
            next_hour = min(current + timedelta(hours=1), end)
        else:
            next_hour = min(current.ceil("h"), end)
        rows.append({'id': row['id'], 
                     'connectionTime': current, 
                     'disconnectTime': next_hour})
        current = next_hour
    return rows

In [6]:
dis_rows = disaggregate_session(first_row)
dis_rows = pd.DataFrame(dis_rows)
dis_rows

Unnamed: 0,id,connectionTime,disconnectTime
0,52943,2018-10-09 15:26:40-07:00,2018-10-09 16:00:00-07:00
1,52943,2018-10-09 16:00:00-07:00,2018-10-09 17:00:00-07:00
2,52943,2018-10-09 17:00:00-07:00,2018-10-09 18:00:00-07:00
3,52943,2018-10-09 18:00:00-07:00,2018-10-09 19:00:00-07:00
4,52943,2018-10-09 19:00:00-07:00,2018-10-09 19:48:12-07:00


In [7]:
hourUtil = pd.DataFrame([["2018-10-09 14:00"], ["2018-10-09 15:00"], ["2018-10-09 16:00"], ["2018-10-09 17:00"], ["2018-10-09 18:00"]], columns=["Hour"])
hourUtil["Hour"] = pd.to_datetime(hourUtil["Hour"])
hourUtil["Util"] = [0, 0, 0, 0, 0]

hourUtil["Hour"][0]

Timestamp('2018-10-09 14:00:00')

In [8]:
dis_rows["connectionTime"].tz_localize
dis_rows["connectionTime"][0]

Timestamp('2018-10-09 15:26:40-0700', tz='UTC-07:00')

In [74]:
# Function to aggregate HourlyUtil
def aggregateUtil(data):
    for i in range(len(hourUtil["Hour"])):
        for h in range(len(data["connectionTime"])):
            if(data["connectionTime"][h] <= hourUtil["Hour"][i]):
                hourUtil["Util"][i] += 1


In [75]:
aggregate = aggregateUtil(dis_rows)
aggregate

TypeError: Cannot compare tz-naive and tz-aware timestamps