In [29]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json

In [30]:
# Read the raw charging-session export into a DataFrame and preview the first rows.
raw_sessions_path = "data/charging_sessions.csv"
df = pd.read_csv(raw_sessions_path)
df.head(5)

Unnamed: 0.1,Unnamed: 0,id,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,sessionID,siteID,spaceID,stationID,timezone,userID,userInputs
0,0,5e23b149f9af8b5fe4b973cf,2020-01-02 13:08:54+00:00,2020-01-02 19:11:15+00:00,2020-01-02 17:31:35+00:00,25.016,1_1_179_810_2020-01-02 13:08:53.870034,1,AG-3F30,1-1-179-810,America/Los_Angeles,194.0,"[{'WhPerMile': 250, 'kWhRequested': 25.0, 'mil..."
1,1,5e23b149f9af8b5fe4b973d0,2020-01-02 13:36:50+00:00,2020-01-02 22:38:21+00:00,2020-01-02 20:18:05+00:00,33.097,1_1_193_825_2020-01-02 13:36:49.599853,1,AG-1F01,1-1-193-825,America/Los_Angeles,4275.0,"[{'WhPerMile': 280, 'kWhRequested': 70.0, 'mil..."
2,2,5e23b149f9af8b5fe4b973d1,2020-01-02 13:56:35+00:00,2020-01-03 00:39:22+00:00,2020-01-02 16:35:06+00:00,6.521,1_1_193_829_2020-01-02 13:56:35.214993,1,AG-1F03,1-1-193-829,America/Los_Angeles,344.0,"[{'WhPerMile': 400, 'kWhRequested': 8.0, 'mile..."
3,3,5e23b149f9af8b5fe4b973d2,2020-01-02 13:59:58+00:00,2020-01-02 16:38:39+00:00,2020-01-02 15:18:45+00:00,2.355,1_1_193_820_2020-01-02 13:59:58.309319,1,AG-1F04,1-1-193-820,America/Los_Angeles,1117.0,"[{'WhPerMile': 400, 'kWhRequested': 8.0, 'mile..."
4,4,5e23b149f9af8b5fe4b973d3,2020-01-02 14:00:01+00:00,2020-01-02 22:08:40+00:00,2020-01-02 18:17:30+00:00,13.375,1_1_193_819_2020-01-02 14:00:00.779967,1,AG-1F06,1-1-193-819,America/Los_Angeles,334.0,"[{'WhPerMile': 400, 'kWhRequested': 16.0, 'mil..."


In [31]:
# Set datatypes
# The three timestamp columns are parsed into pandas datetimes.
for timestamp_column in ('connectionTime', 'disconnectTime', 'doneChargingTime'):
    df[timestamp_column] = pd.to_datetime(df[timestamp_column])

# Identifier columns are cast to strings.
for identifier_column in ('sessionID', 'siteID', 'spaceID', 'stationID'):
    df[identifier_column] = df[identifier_column].astype(str)

# userID is deliberately left as it is: for site characteristics, float64 is the better datatype.
#df['userID'] = df['userID'].astype(str)

In [32]:
# Restrict the data to the analysis window: sessions that connected on or after
# start_date and disconnected on or before end_date.
start_date = '2018-04-25'
end_date = '2021-01-01'

in_window = (df['connectionTime'] >= start_date) & (df['disconnectTime'] <= end_date)
# .copy() makes the filtered frame independent of the unfiltered one, so adding
# columns to it below does not raise pandas' SettingWithCopyWarning.
df = df[in_window].copy()

# Name of the weekday on which the session's connection started
df['weekday'] = df['connectionTime'].dt.day_name()

In [33]:
# Rename first column, contains an index
#df = df.rename(columns={df.columns[0]: 'Index'})
#df = df.set_index('Index')

In [34]:
# Columns removed in this cell:
# - sessionID: a composite value of stationID and connectionTime, and therefore redundant
# - timezone: contains only one value, and can therefore be considered metadata
# - first column: cannot be used as the index because of several mistakes
#   (e.g. "index" 1 would occur 9 times) and makes no sense otherwise
#df[df[df.columns[0]] != df.index] # Shows the differences between actual index and supposed index
#df[df[df.columns[0]] == 1] # Shows, how "index" 1 occurs 9 times
redundant_columns = ['sessionID', 'timezone', df.columns[0]]
df = df.drop(columns=redundant_columns)

In [35]:
# Time between the end of charging and the disconnect, as timedelta, minutes and hours
idle_duration = df['disconnectTime'] - df['doneChargingTime']
idle_minutes = idle_duration.dt.total_seconds() / 60.0  # seconds -> minutes
df['NoChargingTime'] = idle_duration
df['NoChargingTimeMinutes'] = idle_minutes
df['NoChargingTimeHours'] = idle_minutes / 60.0  # minutes -> hours

# Time between the connection and the end of charging, as timedelta, minutes and hours
charging_duration = df['doneChargingTime'] - df['connectionTime']
charging_minutes = charging_duration.dt.total_seconds() / 60.0  # seconds -> minutes
df['ChargingTime'] = charging_duration
df['ChargingTimeMinutes'] = charging_minutes
df['ChargingTimeHours'] = charging_minutes / 60.0  # minutes -> hours

# Delivered energy per minute of charging time
# NOTE(review): a charging time of exactly zero minutes yields inf (or NaN for 0/0) here - confirm whether such rows exist
df['kWhPerMinute'] = df['kWhDelivered'].div(charging_minutes)

# Month (as a number) in which the connection started
df['month'] = df['connectionTime'].dt.month

# Flag whether a userID is present (needed for the site characteristics section)
df['userRegistered'] = np.where(df['userID'].notna(), 'registered', 'unregistered')

### Flatten the JSON object `userInputs`

In [36]:
# df_charging['userInputs'] = df_charging['userInputs'].str.replace("'", '"')
# df_charging['userInputs'] = df_charging['userInputs'].str.replace('"[', "'[")
# df_charging['userInputs'] = df_charging['userInputs'].str.replace(']"', "]'")
# df_charging['userInputs'] = df_charging['userInputs'].str.replace("True", "true")
# df_charging['userInputs'] = df_charging['userInputs'].str.replace("False", "false")

# Mapping: Value to be replaced: replacing value
#replacements = {
#    "'": '"', 
#    '"[': "'[",
#    ']"': "]'",
#    "True": "true",
#    "False": "false"
# }

# shorthand to replace multiple values
# for i, j in replacements.items():
#    df_charging['userInputs'] = df_charging['userInputs'].str.replace(i, j)
# Old Version
# for i in range(len(df_charging['userInputs'])):
#     if not pd.isna(df_charging['userInputs'].iloc[i]):
#        df_charging.at[i, 'userInputs'] = json.loads(df_charging['userInputs'].iloc[i])

# Refactored Version
# df_charging['userInputs'] = df_charging['userInputs'].apply(lambda x: json.loads(x) if pd.notna(x) else x)

In [37]:
# The 'userInputs' strings use single quotes and True/False, which json.loads does not accept,
# so they are rewritten into JSON text first.
# regex=False is passed explicitly: patterns such as '"[' are not valid regular expressions,
# and the default of `regex` differs between pandas versions.
json_replacements = [
    ("'", '"'),
    ('"[', "'["),
    (']"', "]'"),
    ("True", "true"),
    ("False", "false"),
]
user_inputs_text = df['userInputs']
for old, new in json_replacements:
    user_inputs_text = user_inputs_text.str.replace(old, new, regex=False)

# Parse every non-missing string; missing values are passed through unchanged
user_inputs_parsed = user_inputs_text.apply(lambda x: json.loads(x) if pd.notna(x) else x)

# Only the first entry of each parsed list is used; rows without a non-empty list get None
first_user_input = user_inputs_parsed.apply(
    lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None
)

# Key inside the entry -> name of the new column.
# 'userID' is intentionally not extracted: an earlier check showed it has the same values
# as the column "userID", so it would be redundant.
user_input_columns = {
    'paymentRequired': 'user_paymentRequired_values',
    'requestedDeparture': 'user_requestedDeparture',
    'modifiedAt': 'user_modifiedAt',
    'minutesAvailable': 'user_minutesAvailable',
    'milesRequested': 'user_milesRequested',
    'kWhRequested': 'user_kWhRequested',
    'WhPerMile': 'user_WhPerMile',
}
for key, column in user_input_columns.items():
    # key=key binds the current key to the lambda at definition time
    df[column] = first_user_input.apply(
        lambda entry, key=key: entry[key] if isinstance(entry, dict) else None
    )

# Drop the 'userInputs' column now that its contents are flattened
df = df.drop(columns=['userInputs'])

In [38]:
# 'user_kWhRequestFulfilment' = delivered kWh minus requested kWh
# (negative values mean less energy was delivered than requested)
df['user_kWhRequestFulfilment'] = df['kWhDelivered'].sub(df['user_kWhRequested'])

In [39]:
# Found in the KPI section: connectionTime is sometimes later than doneChargingTime,
# which gives an impossible negative charging time. Remove those rows.
negative_charging_rows = df.loc[df['ChargingTimeHours'] < 0].index
df = df.drop(index=negative_charging_rows)

# Found in the KPI section: doneChargingTime is sometimes later than disconnectTime,
# which gives an impossible negative no-charging time. Remove those rows as well.
negative_idle_rows = df.loc[df['NoChargingTimeHours'] < 0].index
df = df.drop(index=negative_idle_rows)

In [40]:
# Order the sessions by connection time and replace the index with a fresh 0..n-1 range
df = df.sort_values(by='connectionTime').reset_index(drop=True)

In [41]:
# Preview the first rows of the cleaned and flattened frame
df.head(5)

Unnamed: 0,id,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,siteID,spaceID,stationID,userID,weekday,...,month,userRegistered,user_paymentRequired_values,user_requestedDeparture,user_modifiedAt,user_minutesAvailable,user_milesRequested,user_kWhRequested,user_WhPerMile,user_kWhRequestFulfilment
0,5bc90cb9f9af8b0d7fe77cd3,2018-04-25 13:45:10+00:00,2018-04-26 00:56:16+00:00,2018-04-25 16:44:15+00:00,10.013,2,CA-319,2-39-95-27,,Wednesday,...,4,unregistered,,,,,,,,
1,5bc90cb9f9af8b0d7fe77cd4,2018-04-25 13:45:50+00:00,2018-04-25 23:04:45+00:00,2018-04-25 14:51:44+00:00,5.257,2,CA-489,2-39-79-380,,Wednesday,...,4,unregistered,,,,,,,,
2,5bc90cb9f9af8b0d7fe77cd5,2018-04-25 14:37:06+00:00,2018-04-25 23:55:34+00:00,2018-04-25 16:05:22+00:00,5.177,2,CA-327,2-39-79-379,,Wednesday,...,4,unregistered,,,,,,,,
3,5bc90cb9f9af8b0d7fe77cd6,2018-04-25 14:40:34+00:00,2018-04-25 23:03:12+00:00,2018-04-25 17:40:30+00:00,10.119,2,CA-490,2-39-79-381,,Wednesday,...,4,unregistered,,,,,,,,
4,5bc90cb9f9af8b0d7fe77cd7,2018-04-25 14:43:50+00:00,2018-04-26 01:17:30+00:00,2018-04-25 16:18:28+00:00,7.91,2,CA-303,2-39-139-28,,Wednesday,...,4,unregistered,,,,,,,,


In [42]:

# Persist the prepared frame twice: as a pickle and as a CSV file.
# NOTE(review): to_csv is called with its defaults, so the index is written as an
# unnamed first column - confirm that downstream readers expect this.
pickle_path = 'data/charging_modified.pkl'
csv_path = 'data/charging_session_flattened_json.csv'
df.to_pickle(pickle_path)
df.to_csv(csv_path)