In [31]:
# Imports 
import pandas as pd
from datetime import datetime
from datetime import timedelta
import numpy as np

In [32]:
# Location of the occupancy label file of every room, keyed by dataframe name
occupancy_files = {
    'bathroom_occupancy': 'occupancy/bathroom-occupancy_labels.csv',
    'bedroom_child_occupancy': 'occupancy/bedroom_child-occupancy_labels.csv',
    'bedroom_parents_occupancy': 'occupancy/bedroom_parents-occupancy_labels.csv',
    'hallway_occupancy': 'occupancy/hallway-occupancy_labels.csv',
    'office_occupancy': 'occupancy/office-occupancy_labels.csv'
}

# Load every file into one dict of dataframes.
# The files are ';'-separated; skiprows=[1] drops the file line at 0-based index 1
# (the first line after the header) -- NOTE(review): presumably a non-data row, confirm.
occupancy_dataframes = {
    name: pd.read_csv(path, delimiter=';', skiprows=[1])
    for name, path in occupancy_files.items()
}

# Keep a direct handle on each room's dataframe as well
bathroom_occupancy = occupancy_dataframes['bathroom_occupancy']
bedroom_child_occupancy = occupancy_dataframes['bedroom_child_occupancy']
bedroom_parents_occupancy = occupancy_dataframes['bedroom_parents_occupancy']
hallway_occupancy = occupancy_dataframes['hallway_occupancy']
office_occupancy = occupancy_dataframes['office_occupancy']

# Show the first rows of one room as an example
bathroom_occupancy.head(6)


Unnamed: 0,date,time,total
0,2022-02-21,11:55:39,0
1,2022-02-21,12:59:57,0
2,2022-02-21,13:05:11,0
3,2022-02-21,13:22:18,0
4,2022-02-21,14:24:21,0
5,2022-02-21,14:27:51,0


In [33]:
# Check every room's dataframe for missing values (NaNs).
# Prints one boolean per room, in the order the rooms were loaded:
# True means at least one cell of that dataframe is missing.
for room_dataframe in occupancy_dataframes.values():
    print(room_dataframe.isnull().values.any())


False
False
False
False
False


In [34]:
def bin_occupancy_data(raw_data, bin_size):
    """Resample irregular occupancy measurements into fixed-width time bins.

    Each bin gets the last measurement that falls inside it; bins without any
    measurement inherit the value of the previous bin (forward fill). Counts
    above 1 are capped at 1, so the result only expresses whether the room
    was in use.

    The input dataframe is NOT modified: all work happens on a private,
    time-indexed copy of the 'total' column, so the function can be called
    repeatedly on the same dataframe.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Needs a 'date' column ('%Y-%m-%d' strings), a 'time' column
        ('%H:%M:%S' strings) and a numeric 'total' column.
    bin_size : str or pandas offset
        Bin width, handed to ``DataFrame.resample`` (e.g. '10min').

    Returns
    -------
    pandas.DataFrame
        Indexed by the bin timestamp (index name 'datetime'), with the
        columns 'total', 'datetime', 'time' and 'date'.
    """
    # Combine the separate date and time strings into real timestamps
    timestamps = pd.to_datetime(
        raw_data['date'] + ' ' + raw_data['time'],
        format='%Y-%m-%d %H:%M:%S',
    )

    # Build a fresh frame indexed by timestamp instead of altering raw_data
    measurements = pd.DataFrame(
        {'total': raw_data['total'].to_numpy()},
        index=pd.DatetimeIndex(timestamps, name='datetime'),
    )

    # Order chronologically so "last" in a bin is the latest measurement;
    # a stable sort keeps the row order of identical timestamps
    measurements = measurements.sort_index(kind='mergesort')

    # Take the latest measurement per bin and carry it into empty bins
    binned_data = measurements.resample(bin_size).agg({'total': 'last'}).ffill()

    # Only room usage matters, so cap the number of persons at 1
    binned_data.loc[binned_data['total'] > 1, 'total'] = 1

    # Derive the datetime, time and date columns from the bin timestamps
    binned_data['datetime'] = binned_data.index
    binned_data['time'] = binned_data['datetime'].dt.time
    binned_data['date'] = binned_data['datetime'].dt.date

    return binned_data

In [35]:
# Bin the occupancy labels of every room and store the result on disk
for dataframe_name, dataframe in occupancy_dataframes.items():
    # Resample this room's measurements into 10-minute bins
    binned_data = bin_occupancy_data(dataframe, '10Min')

    # One csv per room; the index is not written to the file
    binned_csv_path = f'binned_occupancy/{dataframe_name}_binned.csv'
    binned_data.to_csv(binned_csv_path, index=False)

In [36]:
# Per room: count, for every time-of-day bin, how often the room was in use
for dataframe_name, dataframe in occupancy_dataframes.items():
    # Resample this room's measurements into 10-minute bins
    binned_data = bin_occupancy_data(dataframe, '10Min')

    # Add up the 'total' values of all bins that share the same time of day
    binned_and_summed_data = binned_data['total'].groupby(binned_data['time']).sum()

    # One csv per room; the time of day is written as the index column
    summed_csv_path = f'binned_and_summed_occupancy/{dataframe_name}_binned_and_summed.csv'
    binned_and_summed_data.to_csv(summed_csv_path, index=True)