In [49]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json

In [50]:
# Load the raw charging-session export and report its dimensions.
df = pd.read_csv("data/charging_sessions.csv")
r = len(df.index)
c = len(df.columns)
print(f'The dataset has {r} rows and {c} columns')

The dataset has 66450 rows and 13 columns


In [51]:
# Set datatypes
# Parse the three session timestamps into pandas datetimes.
for time_col in ('connectionTime', 'disconnectTime', 'doneChargingTime'):
    df[time_col] = pd.to_datetime(df[time_col])

# Identifier columns are labels rather than quantities, so store them as strings.
for id_col in ('sessionID', 'siteID', 'spaceID', 'stationID'):
    df[id_col] = df[id_col].astype(str)

# 'userID' is intentionally NOT cast to str: per the original note, the
# site-characteristics section works better when it stays float64.
#df['userID'] = df['userID'].astype(str)

In [52]:
# Restrict the data to sessions that lie entirely inside the analysis window:
# connected on/after start_date and disconnected on/before end_date.
start_date = '2018-04-25'
end_date = '2021-01-01'

in_window = (df['connectionTime'] >= start_date) & (df['disconnectTime'] <= end_date)
# .copy() yields an independent frame, so the column assignment below does not
# raise pandas' SettingWithCopyWarning for writing into a filtered slice.
df = df.loc[in_window].copy()

# NOTE(review): the weekday is derived from connectionTime exactly as stored; if
# the timestamps are UTC this is the UTC weekday, not the local one — confirm.
df['weekday'] = df['connectionTime'].dt.day_name()

In [53]:
# Rename first column, contains an index
#df = df.rename(columns={df.columns[0]: 'Index'})
#df = df.set_index('Index')

In [54]:
# Remove columns that carry no additional information:
# - 'sessionID' is a composite of stationID and connectionTime, and therefore redundant.
# - 'timezone' contains only one value, so it is meta-data rather than a feature.
# - The first column looks like an index but cannot be used as one because of
#   several mistakes (e.g. "index" 1 occurs 9 times); it makes no sense otherwise.
#df[df[df.columns[0]] != df.index] # Shows the differences between actual index and supposed index
#df[df[df.columns[0]] == 1] # Shows how "index" 1 occurs 9 times
first_column = df.columns[0]
df = df.drop(columns=['sessionID', 'timezone', first_column])

In [55]:
# Idle time: the span between the end of charging and the disconnect.
df['NoChargingTime'] = df['disconnectTime'].sub(df['doneChargingTime'])
idle_seconds = df['NoChargingTime'].dt.total_seconds()
df['NoChargingTimeMinutes'] = idle_seconds / 60.0             # seconds -> minutes
df['NoChargingTimeHours'] = (idle_seconds / 60.0) / 60.0      # minutes -> hours

# Charging time: the span between the connection and the end of charging.
df['ChargingTime'] = df['doneChargingTime'].sub(df['connectionTime'])
charging_seconds = df['ChargingTime'].dt.total_seconds()
df['ChargingTimeMinutes'] = charging_seconds / 60.0           # seconds -> minutes
df['ChargingTimeHours'] = (charging_seconds / 60.0) / 60.0    # minutes -> hours

# Average delivery rate over the charging span.
# NOTE(review): a ChargingTimeMinutes of 0 produces inf (or NaN for 0/0) here —
# confirm that downstream aggregations handle those values.
df['kWhPerMinute'] = df['kWhDelivered'].div(df['ChargingTimeMinutes'])

# Calendar month (1-12) of the connection timestamp.
df['month'] = df['connectionTime'].dt.month

# Flag whether a userID is present (needed for the site characteristics section).
df['userRegistered'] = np.where(df['userID'].notna(), 'registered', 'unregistered')

### Flatten the JSON object in `userInputs`

In [56]:
# df_charging['userInputs'] = df_charging['userInputs'].str.replace("'", '"')
# df_charging['userInputs'] = df_charging['userInputs'].str.replace('"[', "'[")
# df_charging['userInputs'] = df_charging['userInputs'].str.replace(']"', "]'")
# df_charging['userInputs'] = df_charging['userInputs'].str.replace("True", "true")
# df_charging['userInputs'] = df_charging['userInputs'].str.replace("False", "false")

# Mapping: Value to be replaced: replacing value
#replacements = {
#    "'": '"', 
#    '"[': "'[",
#    ']"': "]'",
#    "True": "true",
#    "False": "false"
# }

# shorthand to replace multiple values
# for i, j in replacements.items():
#    df_charging['userInputs'] = df_charging['userInputs'].str.replace(i, j)
# Old Version
# for i in range(len(df_charging['userInputs'])):
#     if not pd.isna(df_charging['userInputs'].iloc[i]):
#        df_charging.at[i, 'userInputs'] = json.loads(df_charging['userInputs'].iloc[i])

# Refactored Version
# df_charging['userInputs'] = df_charging['userInputs'].apply(lambda x: json.loads(x) if pd.notna(x) else x)

In [57]:
# Rewrite the raw 'userInputs' strings so that json.loads can parse them
# (presumably they are Python-repr style: single quotes and True/False).
# The replacements are applied in this exact order.
# regex=False makes the literal matching explicit: a pattern such as '"[' is not
# a valid regular expression, and pandas versions before 2.0 default to regex=True.
json_replacements = [
    ("'", '"'),
    ('"[', "'["),
    (']"', "]'"),
    ("True", "true"),
    ("False", "false"),
]
for old, new in json_replacements:
    df['userInputs'] = df['userInputs'].str.replace(old, new, regex=False)

# Parse every non-missing string; missing values are passed through unchanged.
df['userInputs'] = df['userInputs'].apply(lambda x: json.loads(x) if pd.notna(x) else x)


def _first_user_input(entries, key):
    """Return ``entries[0][key]`` if ``entries`` is a non-empty list, else None.

    Only the first element of the list is inspected. A KeyError is raised if
    that element does not contain ``key``.
    """
    if isinstance(entries, list) and len(entries) > 0:
        return entries[0][key]
    return None


# New column name -> key inside the first 'userInputs' entry.
# 'userID' is deliberately not extracted: an earlier check showed it holds the
# same values as the top-level 'userID' column, so it would be redundant.
user_input_columns = {
    'user_paymentRequired_values': 'paymentRequired',
    'user_requestedDeparture': 'requestedDeparture',
    'user_modifiedAt': 'modifiedAt',
    'user_minutesAvailable': 'minutesAvailable',
    'user_milesRequested': 'milesRequested',
    'user_kWhRequested': 'kWhRequested',
    'user_WhPerMile': 'WhPerMile',
}
for column, key in user_input_columns.items():
    df[column] = df['userInputs'].apply(_first_user_input, args=(key,))

# The nested column is no longer needed once its fields are flattened.
df = df.drop(columns=['userInputs'])

In [58]:
# 'user_kWhRequestFulfilment' = delivered energy minus requested energy
# (negative when less was delivered than requested).
df['user_kWhRequestFulfilment'] = df['kWhDelivered'].sub(df['user_kWhRequested'])

In [59]:
# Problem detected in KPI section: connectionTime is sometimes later than
# doneChargingTime, which gives an impossible negative charging time.
# Remove those rows.
negative_charging = df['ChargingTimeHours'] < 0
df = df.drop(df.index[negative_charging])

# Problem detected in KPI section: doneChargingTime is sometimes later than
# disconnectTime, which gives an impossible negative idle time.
# Remove those rows as well (evaluated on the already reduced frame).
negative_idle = df['NoChargingTimeHours'] < 0
df = df.drop(df.index[negative_idle])

In [60]:
# Order the sessions chronologically by connection time and renumber the
# index 0..n-1 in that order (the old index is discarded).
df = df.sort_values(by='connectionTime', ignore_index=True)

In [61]:
# Walk the observation window in one-hour steps and report every stretch longer
# than one week during which no session was connected.
start_date = df['connectionTime'].min()
end_date = df['disconnectTime'].max()
# A Timedelta step is accepted by every pandas version; the 'H' string alias is
# deprecated in newer releases.
datetime_range = pd.date_range(start=start_date, end=end_date, freq=pd.Timedelta(hours=1))

min_gap_hours = 7 * 24  # only gaps longer than one week are reported

# Hoist the column lookups out of the loop.
connection_times = df['connectionTime']
disconnect_times = df['disconnectTime']

missing_periods = []
current_period = []

for timestamp in datetime_range:
    # .any() answers "is at least one session connected at this instant?" without
    # materialising a filtered copy of the whole frame on every iteration.
    session_active = ((connection_times <= timestamp) & (disconnect_times >= timestamp)).any()

    if not session_active:
        # The grid is strictly hourly and any covered hour resets current_period,
        # so an uncovered timestamp always extends the running gap.
        current_period.append(timestamp)
        continue

    # A covered hour closes the running gap; keep it only if it is long enough.
    if len(current_period) > min_gap_hours:
        missing_periods.append(current_period)
    current_period = []

# Flush a gap that runs up to the end of the window.
if len(current_period) > min_gap_hours:
    missing_periods.append(current_period)

for period in missing_periods:
    print(f"Missing entries from {period[0]} to {period[-1]} (Duration: {len(period)/24} days)")

Missing entries from 2020-08-04 05:45:10+00:00 to 2020-11-18 20:45:10+00:00 (Duration: 106.66666666666667 days)


In [63]:
# import pandas as pd

# # Assuming your DataFrame is named 'df' and has a datetime index
# # 'connectionTime' and 'disconnectTime' are columns containing datetime values
# # 'kWhDelivered' is the column you want to fill

# # Combine 'connectionTime' and 'disconnectTime' to create a datetime index
# df['datetime'] = pd.to_datetime(df['connectionTime'])
# df.set_index('datetime', inplace=True)

# # Ensure the 'kWhDelivered' column has a numeric data type
# df['kWhDelivered'] = pd.to_numeric(df['kWhDelivered'], errors='coerce')

# # Resample the data to have a continuous time series with daily frequency
# df_resampled = df.resample('D').mean()

# # Identify numeric columns for calculating rolling mean
# numeric_columns = df.select_dtypes(include=['number']).columns

# # Calculate the rolling mean with a specified window size (e.g., 7 days) for numeric columns
# rolling_means = df_resampled[numeric_columns].rolling(window=7, min_periods=1).mean()

# # Combine rolling means with non-numeric columns
# df_filled = df.combine_first(rolling_means)

# # Reset the index if needed
# df_filled.reset_index(inplace=True)

# # Print the DataFrame with the filled values
# print(df_filled)


In [64]:
# Write the prepared DataFrame to disk.
# NOTE(review): with the default index=True the index is written as an unnamed
# first column — confirm that readers of this file expect that.
output_path = 'data/charging_modified.csv'
df.to_csv(output_path)