In [140]:
import os

import pandas as pd
from acnportal import acndata

In [141]:
# Create a new client.
# The API token is a credential, so it is read from the environment instead of
# being stored in the notebook. Set ACN_DATA_API_TOKEN before running this cell.
api_token = os.environ.get("ACN_DATA_API_TOKEN")
if not api_token:
    raise RuntimeError(
        "ACN_DATA_API_TOKEN is not set; export your ACN-Data API token "
        "in the environment before running this notebook."
    )
client = acndata.DataClient(api_token)

In [142]:
# Download the charging sessions of every listed site (without time series)
sites = ["caltech", "jpl", "office001"]
data: dict[str, list] = {}
for site in sites:
    # Consume the result of get_sessions into a list so it can be indexed and reused
    data[site] = list(client.get_sessions(site, timeseries=False))

In [143]:
# Display the first few entries for each site
N_PREVIEW = 2  # number of sessions shown per site; used for both the label and the slice
for site in sites:
    print(f"First {N_PREVIEW} entries for {site}")
    print(data[site][:N_PREVIEW])

First 2 entries for caltech
[{'_id': '5bc90cb9f9af8b0d7fe77cd2', 'userInputs': None, 'sessionID': '2_39_78_362_2018-04-25 11:08:04.400812', 'stationID': '2-39-78-362', 'spaceID': 'CA-496', 'siteID': '0002', 'clusterID': '0039', 'connectionTime': datetime.datetime(2018, 4, 25, 4, 8, 4, tzinfo=<DstTzInfo 'America/Los_Angeles' PDT-1 day, 17:00:00 DST>), 'disconnectTime': datetime.datetime(2018, 4, 25, 6, 20, 10, tzinfo=<DstTzInfo 'America/Los_Angeles' PDT-1 day, 17:00:00 DST>), 'kWhDelivered': 7.932, 'doneChargingTime': datetime.datetime(2018, 4, 25, 6, 21, 10, tzinfo=<DstTzInfo 'America/Los_Angeles' PDT-1 day, 17:00:00 DST>), 'timezone': 'America/Los_Angeles', 'userID': None}, {'_id': '5bc90cb9f9af8b0d7fe77cd3', 'userInputs': None, 'sessionID': '2_39_95_27_2018-04-25 13:45:09.617470', 'stationID': '2-39-95-27', 'spaceID': 'CA-319', 'siteID': '0002', 'clusterID': '0039', 'connectionTime': datetime.datetime(2018, 4, 25, 6, 45, 10, tzinfo=<DstTzInfo 'America/Los_Angeles' PDT-1 day, 17:00:00

In [144]:
# Build one DataFrame per site of interest and stack them into a single frame
sites_of_interest = ["caltech", "jpl"]
items = [pd.DataFrame(data[site]) for site in sites_of_interest]
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# every per-site frame keeps its own 0-based labels and the index has duplicates.
df = pd.concat(items, ignore_index=True)
r, c = df.shape
print(f'The dataset has {r} rows and {c} columns')

The dataset has 65062 rows and 13 columns


In [145]:
# Fix column types
df = df.astype({
    "clusterID": "string",
    "kWhDelivered": "float64",
    "sessionID": "string",
    "siteID": "string",
    "spaceID": "string",
    "stationID": "string",
    "timezone": "string",
    "userID": "string",
    "userInputs": "string"
})
# sort_values returns a new DataFrame instead of sorting in place, so the
# result has to be assigned back for the ordering to take effect.
df = df.sort_values(by="connectionTime")
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 65062 entries, 0 to 33637
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype                              
---  ------            --------------  -----                              
 0   _id               65062 non-null  object                             
 1   userInputs        47847 non-null  string                             
 2   sessionID         65062 non-null  string                             
 3   stationID         65062 non-null  string                             
 4   spaceID           65062 non-null  string                             
 5   siteID            65062 non-null  string                             
 6   clusterID         65062 non-null  string                             
 7   connectionTime    65062 non-null  datetime64[ns, America/Los_Angeles]
 8   disconnectTime    65062 non-null  datetime64[ns, America/Los_Angeles]
 9   kWhDelivered      65062 non-null  float64                         

In [146]:
# Find long gaps in the data: runs of consecutive hourly timestamps at which
# no session was connected (connectionTime <= t <= disconnectTime).
MIN_GAP_HOURS = 7 * 24  # only gaps longer than one week are reported

start_date = df['connectionTime'].min()
end_date = df['disconnectTime'].max()
# A Timedelta frequency is used instead of the 'H' alias, which newer pandas
# versions deprecate.
datetime_range = pd.date_range(start=start_date, end=end_date, freq=pd.Timedelta(hours=1))

# Look the columns up once instead of on every iteration
connection_times = df['connectionTime']
disconnect_times = df['disconnectTime']

missing_periods = []
current_period = []

for timestamp in datetime_range:
    session_active = ((connection_times <= timestamp) & (disconnect_times >= timestamp)).any()

    if not session_active:
        # The grid is strictly hourly, so an uncovered timestamp always
        # extends the gap that is currently open (or starts a new one).
        current_period.append(timestamp)
        continue

    # A covered timestamp closes the open gap; keep it only if long enough
    if len(current_period) > MIN_GAP_HOURS:
        missing_periods.append(current_period)
    current_period = []

# A gap may still be open when the grid ends
if len(current_period) > MIN_GAP_HOURS:
    missing_periods.append(current_period)

for period in missing_periods:
    print(f"Missing entries from {period[0]} to {period[-1]} (Duration: {len(period)/24} days)")

Missing entries from 2020-08-03 22:08:04-07:00 to 2020-11-18 12:08:04-08:00 (Duration: 106.66666666666667 days)


In [147]:
# Write the combined sessions to disk as CSV, keeping the index column
output_csv = "data/acn_data.csv"
df.to_csv(output_csv, index=True)

By default, the API used above fetches data quite slowly: it retrieves 100 entries at a time, requesting each batch of 100 only after the previous one. This is inefficient for a dataset of this size. As a workaround, we downloaded the data through the web interface (https://ev.caltech.edu/dataset) and wrote the following code to read it efficiently. **Note**: The data was downloaded in JSON format and saved in the `data/acn_data` folder.

import pathlib
import json
import pandas as pd

# Load all JSON files in the acn_data folder into a single DataFrame
acn_data_path = pathlib.Path("data/acn_data")
# Sort the matches so files are processed in a reproducible order; a list can
# also be checked for emptiness, unlike the one-shot glob generator.
files = sorted(acn_data_path.glob("*.json"))
if not files:
    # Fail with an actionable message instead of pd.concat's
    # "No objects to concatenate" error further down.
    raise FileNotFoundError(f"No JSON files found in {acn_data_path}")

dfs: list[pd.DataFrame] = []
for file in files:
    # Read with an explicit encoding rather than the platform default
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
    # The session records are stored under the "_items" key
    df = pd.DataFrame(data["_items"])
    r, c = df.shape
    print(f'The dataset {file} has {r} rows')
    dfs.append(df)

# Concatenate the dataframes; ignore_index=True avoids duplicate index labels
# (each per-file frame otherwise keeps its own 0-based index).
df = pd.concat(dfs, ignore_index=True)
r, c = df.shape
print(f'The dataset has {r} rows and {c} columns')

# Print columns information
df.info()

# Cast every column to an explicit dtype, grouped by target type
string_columns = [
    "clusterID",
    "sessionID",
    "siteID",
    "spaceID",
    "stationID",
    "timezone",
    "userID",
    "userInputs",
]
timestamp_columns = ["connectionTime", "disconnectTime", "doneChargingTime"]
column_dtypes = {
    **dict.fromkeys(string_columns, "string"),
    **dict.fromkeys(timestamp_columns, "datetime64[ns]"),
    "kWhDelivered": "float64",
}
df = df.astype(column_dtypes)

# Preview the first rows, still in load order
df.head()

# Order the sessions chronologically by their connection time
df = df.sort_values(by=["connectionTime"])

# Preview the first rows again, now that they are sorted
df.head()

r, c = len(df.index), len(df.columns)
print(f'The dataset has {r} rows and {c} columns')

# Write the sorted sessions to disk as CSV, keeping the index column
output_csv = "data/acn_data.csv"
df.to_csv(output_csv, index=True)