First conceptual draft: split each row of the data set into hourly segments for the predictive analytics models, so that the data can later be re-aggregated by hour more easily.

In [71]:
import pandas as pd
from datetime import timedelta

In [72]:
# Load the pre-cleaned session data (the file name suggests rows with nulls
# were already dropped upstream; confirm against the cleaning step).
data = pd.read_csv(filepath_or_buffer="cleanData/cleanChargingDataNoNull.csv")


In [73]:
# Parse both session-boundary columns of the loaded CSV into pandas datetimes
# so that timestamp arithmetic and rounding are available on them.
for time_col in ("connectionTime", "disconnectTime"):
    data[time_col] = pd.to_datetime(data[time_col])

In [74]:
# Pick the first session (all columns) as a sample row for trying out the hourly split.
first_row = data.iloc[0, :]

In [75]:
# Function to disaggregate sessions by hour
def disaggregate_session(row):
    """Split one session into consecutive segments that never cross a full hour.

    Parameters
    ----------
    row : mapping-like
        Must provide 'id', 'connectionTime' and 'disconnectTime'.
        'connectionTime' has to support ``.ceil("h")`` and the addition of a
        ``timedelta`` (e.g. a ``pd.Timestamp``); 'disconnectTime' has to be
        comparable with it.

    Returns
    -------
    list of dict
        One dict per segment with the keys 'id', 'connectionTime' and
        'disconnectTime', in chronological order. The list is empty when
        'connectionTime' is not earlier than 'disconnectTime'.
    """
    segment_start = row['connectionTime']
    session_end = row['disconnectTime']
    segments = []
    while segment_start < session_end:
        # Find the next full-hour mark strictly after segment_start.
        # ceil() leaves an exact-hour timestamp unchanged, so in that case
        # step one whole hour forward instead.
        boundary = segment_start.ceil("h")
        if boundary == segment_start:
            boundary = segment_start + timedelta(hours=1)
        # Clip to the session end so the last segment stops with the session.
        segment_end = min(boundary, session_end)
        segments.append({'id': row['id'],
                         'connectionTime': segment_start,
                         'disconnectTime': segment_end})
        segment_start = segment_end
    return segments

In [76]:
# Try the hourly split on the sample session and show the resulting segments.
dis_rows = disaggregate_session(row=first_row)
dis_rows

[{'id': 52943,
  'connectionTime': Timestamp('2018-10-09 14:26:40+0000', tz='UTC'),
  'disconnectTime': Timestamp('2018-10-09 15:00:00+0000', tz='UTC')},
 {'id': 52943,
  'connectionTime': Timestamp('2018-10-09 15:00:00+0000', tz='UTC'),
  'disconnectTime': Timestamp('2018-10-09 16:00:00+0000', tz='UTC')},
 {'id': 52943,
  'connectionTime': Timestamp('2018-10-09 16:00:00+0000', tz='UTC'),
  'disconnectTime': Timestamp('2018-10-09 17:00:00+0000', tz='UTC')},
 {'id': 52943,
  'connectionTime': Timestamp('2018-10-09 17:00:00+0000', tz='UTC'),
  'disconnectTime': Timestamp('2018-10-09 18:00:00+0000', tz='UTC')},
 {'id': 52943,
  'connectionTime': Timestamp('2018-10-09 18:00:00+0000', tz='UTC'),
  'disconnectTime': Timestamp('2018-10-09 18:48:12+0000', tz='UTC')}]