In [52]:
# Import the DataClient class from acnportal
# from acnportal import acndata

In [53]:
# Create a new client
# client = acndata.DataClient("${API_KEY}")

In [54]:
# Get the data for a single site
# sites = ["caltech", "jpl", "office001"]
# data: dict[str, list] = {
#    site:[] for site in sites
# }
# for site in sites:
#     # Timeseries means that each entry will contain a plotSignal object with the charging data in seconds intervals, i.e., seconds between each entry
#     generator = client.get_sessions(site, timeseries=False)
#     for value in enumerate(generator):
#         data[site].append(value)

The API mentioned above fetches data quite slowly by default, as it retrieves 25 entries and subsequently follows with the next 25 entries. This process is not efficient. As a workaround, we opted to use the web interface (https://ev.caltech.edu/dataset) to download the data and then implemented the following code to read it efficiently.

In [55]:
import pathlib
import json
import pandas as pd

In [56]:
# Loads all json files in the acn_data folder into single dataframes
acn_data_path = pathlib.Path("data/acn_data")
files = acn_data_path.glob("*.json")

dfs: list[pd.DataFrame] = []
for file in files:
    with open(file, "r") as f:
        data = json.load(f)
        df = pd.DataFrame(data["_items"])
        dfs.append(df)
        
# Concatenate the dataframes
df = pd.concat(dfs)

In [57]:
# Print columns information
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 34492 entries, 0 to 16154
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   _id               34492 non-null  object 
 1   clusterID         34492 non-null  object 
 2   connectionTime    34492 non-null  object 
 3   disconnectTime    34492 non-null  object 
 4   doneChargingTime  33202 non-null  object 
 5   kWhDelivered      34492 non-null  float64
 6   sessionID         34492 non-null  object 
 7   siteID            34492 non-null  object 
 8   spaceID           34492 non-null  object 
 9   stationID         34492 non-null  object 
 10  timezone          34492 non-null  object 
 11  userID            19652 non-null  object 
 12  userInputs        19652 non-null  object 
dtypes: float64(1), object(12)
memory usage: 3.7+ MB


In [58]:
# Fix column types
df = df.astype({
    "clusterID": "string",
    "connectionTime": "datetime64[ns]",
    "disconnectTime": "datetime64[ns]",
    "doneChargingTime": "datetime64[ns]",
    "kWhDelivered": "float64",
    "sessionID": "string",
    "siteID": "string",
    "spaceID": "string",
    "stationID": "string",
    "timezone": "string",
    "userID": "string",
    "userInputs": "string"
})

In [59]:
# Print head
df.head()

Unnamed: 0,_id,clusterID,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,sessionID,siteID,spaceID,stationID,timezone,userID,userInputs
0,5bc90cb9f9af8b0d7fe77cd2,39,2018-04-25 11:08:04,2018-04-25 13:20:10,2018-04-25 13:21:10,7.932,2_39_78_362_2018-04-25 11:08:04.400812,2,CA-496,2-39-78-362,America/Los_Angeles,,
1,5bc90cb9f9af8b0d7fe77cd3,39,2018-04-25 13:45:10,2018-04-26 00:56:16,2018-04-25 16:44:15,10.013,2_39_95_27_2018-04-25 13:45:09.617470,2,CA-319,2-39-95-27,America/Los_Angeles,,
2,5bc90cb9f9af8b0d7fe77cd4,39,2018-04-25 13:45:50,2018-04-25 23:04:45,2018-04-25 14:51:44,5.257,2_39_79_380_2018-04-25 13:45:49.962001,2,CA-489,2-39-79-380,America/Los_Angeles,,
3,5bc90cb9f9af8b0d7fe77cd5,39,2018-04-25 14:37:06,2018-04-25 23:55:34,2018-04-25 16:05:22,5.177,2_39_79_379_2018-04-25 14:37:06.460772,2,CA-327,2-39-79-379,America/Los_Angeles,,
4,5bc90cb9f9af8b0d7fe77cd6,39,2018-04-25 14:40:34,2018-04-25 23:03:12,2018-04-25 17:40:30,10.119,2_39_79_381_2018-04-25 14:40:33.638896,2,CA-490,2-39-79-381,America/Los_Angeles,,


In [60]:
# Sort the data by the start time of the session
df = df.sort_values(by="connectionTime")

In [61]:
# Print head
df.head()

Unnamed: 0,_id,clusterID,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,sessionID,siteID,spaceID,stationID,timezone,userID,userInputs
0,5bc90cb9f9af8b0d7fe77cd2,39,2018-04-25 11:08:04,2018-04-25 13:20:10,2018-04-25 13:21:10,7.932,2_39_78_362_2018-04-25 11:08:04.400812,2,CA-496,2-39-78-362,America/Los_Angeles,,
1,5bc90cb9f9af8b0d7fe77cd3,39,2018-04-25 13:45:10,2018-04-26 00:56:16,2018-04-25 16:44:15,10.013,2_39_95_27_2018-04-25 13:45:09.617470,2,CA-319,2-39-95-27,America/Los_Angeles,,
2,5bc90cb9f9af8b0d7fe77cd4,39,2018-04-25 13:45:50,2018-04-25 23:04:45,2018-04-25 14:51:44,5.257,2_39_79_380_2018-04-25 13:45:49.962001,2,CA-489,2-39-79-380,America/Los_Angeles,,
3,5bc90cb9f9af8b0d7fe77cd5,39,2018-04-25 14:37:06,2018-04-25 23:55:34,2018-04-25 16:05:22,5.177,2_39_79_379_2018-04-25 14:37:06.460772,2,CA-327,2-39-79-379,America/Los_Angeles,,
4,5bc90cb9f9af8b0d7fe77cd6,39,2018-04-25 14:40:34,2018-04-25 23:03:12,2018-04-25 17:40:30,10.119,2_39_79_381_2018-04-25 14:40:33.638896,2,CA-490,2-39-79-381,America/Los_Angeles,,


In [62]:
# Check for missing values in terms of no entries for specific days
# Step 1: Determine the Date Range
first_date = df['connectionTime'].min()
last_date = df['disconnectTime'].max()

# Step 2: Generate a Complete Date Range
complete_date_range = pd.date_range(start=first_date, end=last_date, freq='D')

# Step 3: Identify Missing Time Periods
missing_periods = []
current_period = []

for date in complete_date_range:
    # Check if there are no entries for the current date
    no_entries_for_date = df[(df['connectionTime'] <= date) & (df['disconnectTime'] >= date)].empty
    
    if no_entries_for_date:
        if not current_period or date == current_period[-1] + pd.Timedelta(days=1):
            current_period.append(date)
        else:
            missing_periods.append(current_period)
            current_period = [date]
    else:
        # Entries found, start a new period
        if current_period:
            missing_periods.append(current_period)
            current_period = []

# Append the last period if it exists
if current_period:
    missing_periods.append(current_period)

# Step 4: Print Longer Periods (more than 1 week)
for period in missing_periods:
    start_date, end_date = period[0], period[-1]
    duration = (end_date - start_date).days + 1  # Add 1 to include both start and end dates
    if duration > 7:
        print(f"Missing entries from {start_date} to {end_date} (Duration: {duration} days)")

Missing entries from 2019-04-18 11:08:04 to 2019-04-29 11:08:04 (Duration: 12 days)
Missing entries from 2019-05-01 11:08:04 to 2019-05-08 11:08:04 (Duration: 8 days)
Missing entries from 2019-05-22 11:08:04 to 2019-06-12 11:08:04 (Duration: 22 days)
Missing entries from 2019-06-16 11:08:04 to 2019-07-03 11:08:04 (Duration: 18 days)
Missing entries from 2019-07-29 11:08:04 to 2019-08-06 11:08:04 (Duration: 9 days)


In [63]:
# Save the data to a csv file
df.to_csv("data/acn_data.csv", index=True)