First conceptual approach: split each row (charging session) of the data set into per-hour pieces for the predictive analytics models, so that the data can later be re-aggregated by hour more easily.

In [17]:
import pandas as pd
from datetime import timedelta

In [18]:
# Location of the cleaned charging-session export, relative to this notebook
CHARGING_DATA_CSV = "../cleanData/cleanChargingDataFull.csv"
rf_data = pd.read_csv(CHARGING_DATA_CSV)

In [19]:
# Parse both session timestamp columns into pandas datetimes
for time_col in ("connectionTime", "disconnectTime"):
    rf_data[time_col] = pd.to_datetime(rf_data[time_col])

In [20]:
# Rows sharing the same connection time and parking space are treated as duplicates
session_key = ["connectionTime", "spaceID"]
rf_data = rf_data.drop_duplicates(subset=session_key, ignore_index=True)
rf_data

Unnamed: 0.1,Unnamed: 0,id,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,siteID,spaceID,stationID,userID_x,parkDuration,WhPerMile,kWhRequested,milesRequested,minutesAvailable,requestedDeparture,temperature,cloud_cover,precipitation
0,0,51323,2020-11-18 15:36:26+00:00,2020-11-18 16:02:37+00:00,,4.816,2,11900388,2-39-81-4550,7132.0,0 days 00:26:11,274.0,8.22,30.0,480.0,"Thu, 19 Nov 2020 07:36:26 GMT",13.15,27.46,0.0
1,1,51324,2020-11-18 16:35:54+00:00,2020-11-18 17:31:08+00:00,,10.027,2,11900388,2-39-81-4550,4903.0,0 days 00:55:14,258.0,51.60,200.0,576.0,"Thu, 19 Nov 2020 10:11:54 GMT",13.15,27.46,0.0
2,2,51325,2020-11-18 17:34:02+00:00,2020-11-18 18:45:14+00:00,,24.486,2,11900388,2-39-81-4550,4903.0,0 days 01:11:12,258.0,51.60,200.0,576.0,"Thu, 19 Nov 2020 11:10:02 GMT",13.15,27.46,0.0
3,3,51328,2020-11-18 19:52:00+00:00,2020-11-18 20:00:50+00:00,,4.788,2,11900388,2-39-81-4550,1085.0,0 days 00:08:50,283.0,56.60,200.0,589.0,"Thu, 19 Nov 2020 13:41:00 GMT",13.15,27.46,0.0
4,4,51329,2020-11-18 20:24:11+00:00,2020-11-18 21:07:15+00:00,,30.849,2,11900388,2-39-81-4550,9284.0,0 days 00:43:04,400.0,40.00,100.0,30.0,"Thu, 19 Nov 2020 04:54:11 GMT",13.15,27.46,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65031,66445,24487,2019-05-13 11:43:12+00:00,2019-05-13 13:54:33+00:00,2019-05-13 12:46:37+00:00,2.308,2,CA-513,2-39-139-567,560.0,0 days 02:11:21,273.0,5.46,20.0,133.0,"Mon, 13 May 2019 21:56:12 GMT",17.19,25.71,0.0
65032,66446,25429,2019-06-13 08:53:42+00:00,2019-06-13 09:35:12+00:00,2019-06-13 09:24:57+00:00,0.908,2,CA-513,2-39-139-567,,0 days 00:41:30,,,,,,18.27,26.95,0.0
65033,66447,28030,2019-09-16 05:55:08+00:00,2019-09-16 07:34:27+00:00,2019-09-16 06:25:44+00:00,0.893,2,CA-513,2-39-139-567,,0 days 01:39:19,,,,,,22.00,29.38,0.0
65034,66448,29515,2019-11-06 07:28:18+00:00,2019-11-06 08:41:56+00:00,2019-11-06 07:58:42+00:00,0.900,2,CA-513,2-39-139-567,,0 days 01:13:38,,,,,,15.88,33.56,0.0


In [10]:
# One frame per charging site, selected with a boolean mask on siteID
is_site_1 = rf_data["siteID"] == 1
is_site_2 = rf_data["siteID"] == 2
rf1_data = rf_data[is_site_1]
rf2_data = rf_data[is_site_2]

**Disaggregate sessions by hour**

In [11]:
# Function to disaggregate sessions by hour
def disaggregate_session(row):
    """Split one session into pieces that each lie within a single clock hour.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'id', 'connectionTime' and 'disconnectTime'.
        Both timestamps must be comparable and support ``.floor("h")``
        (e.g. pandas Timestamps).

    Returns
    -------
    list of dict
        One dict per touched clock hour, in chronological order, with the keys
        'id', 'connectionTime', 'disconnectTime', 'inHourStartTime',
        'inHourEndTime' and 'minutesInHour'. The list is empty when
        'connectionTime' is not strictly earlier than 'disconnectTime'.
    """
    session_start = row['connectionTime']
    session_end = row['disconnectTime']
    rows = []
    current = session_start
    while current < session_end:
        # Next full-hour boundary strictly after `current`, capped at the
        # session end so the last piece stays within the session's bounds.
        next_hour = min(current.floor("h") + timedelta(hours=1), session_end)
        rows.append({'id': row['id'],
                     'connectionTime': session_start,
                     'disconnectTime': session_end,
                     'inHourStartTime': current,
                     'inHourEndTime': next_hour,
                     # total_seconds() keeps the full duration; `.seconds` is only
                     # the whole-seconds component and drops sub-second precision.
                     'minutesInHour': (next_hour - current).total_seconds() / 60})
        current = next_hour
    return rows

In [12]:
# Apply the hourly split row by row to each site's sessions
dis_rf1_data, dis_rf2_data = (
    site_sessions.apply(disaggregate_session, axis=1)
    for site_sessions in (rf1_data, rf2_data)
)

**Re-aggregate data & fill new dataframe**

In [13]:
def agg_rows(p_df, first_year, last_year):
    """Aggregate disaggregated session pieces into one row per clock hour.

    Parameters
    ----------
    p_df : iterable of iterables of dict
        Each inner iterable holds the hourly pieces of one session. Every piece
        must provide 'inHourStartTime' (a timestamp supporting ``.floor('h')``)
        and 'minutesInHour'.
    first_year, last_year : int
        First and last calendar year (both inclusive) of the hourly grid, which
        runs from ``first_year-01-01 00:00 UTC`` to ``last_year-12-31 23:00 UTC``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the hourly timestamps, with the columns 'datetime', 'hour',
        'weekday', 'month', 'year', 'total number of sessions' and
        'total minutes of parking'. Hours without any piece keep 0 / 0.0.

    Raises
    ------
    KeyError
        If a piece starts in an hour that is not part of the hourly grid.
    """
    # Generate a range of hourly timestamps spanning first_year..last_year
    start_time = f'{first_year}-01-01 00:00:00+00:00'
    end_time = f'{last_year}-12-31 23:00:00+00:00'
    hourly_range = pd.date_range(start=start_time, end=end_time, freq='h')

    # Create a DataFrame with the timestamps
    this_df = pd.DataFrame(hourly_range, columns=['datetime'], index=hourly_range)

    # Calendar features derived from the timestamp (cast keeps the int64 dtype
    # regardless of the pandas version)
    this_df['hour'] = this_df['datetime'].dt.hour.astype('int64')
    this_df['weekday'] = this_df['datetime'].dt.weekday.astype('int64')
    this_df['month'] = this_df['datetime'].dt.month.astype('int64')
    this_df['year'] = this_df['datetime'].dt.year.astype('int64')
    this_df['total number of sessions'] = 0
    this_df['total minutes of parking'] = 0.0
    # TODO: 'total kWh requested' is not aggregated yet

    # Accumulate per hour in plain dicts first; one scalar .loc update per piece
    # on the DataFrame is far slower than a single bulk write at the end.
    sessions_per_hour = {}
    minutes_per_hour = {}
    for session in p_df:
        for cur_row in session:
            hour_slot = cur_row["inHourStartTime"].floor('h')
            sessions_per_hour[hour_slot] = sessions_per_hour.get(hour_slot, 0) + 1
            minutes_per_hour[hour_slot] = (
                minutes_per_hour.get(hour_slot, 0.0) + cur_row["minutesInHour"]
            )

    if sessions_per_hour:
        hour_slots = pd.DatetimeIndex(list(sessions_per_hour))
        positions = hourly_range.get_indexer(hour_slots)
        # get_indexer marks labels that are absent from the grid with -1
        missing = hour_slots[positions < 0]
        if len(missing) > 0:
            raise KeyError(
                f"{len(missing)} hour(s) lie outside the range "
                f"{first_year}-{last_year}, e.g. {missing[0]}"
            )
        # Both dicts share the same key order, so values line up with positions
        sessions_col = this_df.columns.get_loc('total number of sessions')
        minutes_col = this_df.columns.get_loc('total minutes of parking')
        this_df.iloc[positions, sessions_col] = list(sessions_per_hour.values())
        this_df.iloc[positions, minutes_col] = list(minutes_per_hour.values())

    return this_df

In [14]:
# Build the hourly table for each site over the years 2018 through 2021
agg_rf1_data = agg_rows(dis_rf1_data, first_year=2018, last_year=2021)
agg_rf2_data = agg_rows(dis_rf2_data, first_year=2018, last_year=2021)

In [None]:
# Preview the first five hourly rows of site 1
agg_rf1_data.head(n=5)

Unnamed: 0,datetime,hour,weekday,month,year,total number of sessions,total minutes of parking
2018-01-01 00:00:00+00:00,2018-01-01 00:00:00+00:00,0,0,1,2018,0,0.0
2018-01-01 01:00:00+00:00,2018-01-01 01:00:00+00:00,1,0,1,2018,0,0.0
2018-01-01 02:00:00+00:00,2018-01-01 02:00:00+00:00,2,0,1,2018,0,0.0
2018-01-01 03:00:00+00:00,2018-01-01 03:00:00+00:00,3,0,1,2018,0,0.0
2018-01-01 04:00:00+00:00,2018-01-01 04:00:00+00:00,4,0,1,2018,0,0.0


: 

**Requirements for the input data set for task 4:**
- Rows need to be aggregated by hour;
- Following columns are needed
    - ID
    - connectionTime
    - disconnectTime
    - doneChargingTime

    Date information:
    - hour
    - weekday 
    - day of month
    - month
    - year

    Aggregated utilization information (**possible target variables**):
    - hourly parking utilization (sum of total minutes of utilization of every parking space in specific hour)
    - hourly charging utilization (sum of total minutes of active charging at every parking space in specific hour)
    - hourly non-charging utilization = hourly parking utilization - hourly charging utilization
    - total kWh requested (sum of total kWh requested in specific hour; only consider kWh of sessions STARTED in specific hour)
    - total number of sessions

    Weather information:
    - temperature
    - cloud cover 
    - precipitation