In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

## This part of the code is used to partition the datasets of each floor

In [None]:
# Number of zones on each floor, keyed by floor name ("Floor<N>").
# The processing loop looks this up to know how many `z<i>` zones to extract per file.
set_zones = dict(
    Floor1=4,
    Floor2=4,
    Floor3=5,
    Floor4=5,
    Floor5=5,
    Floor6=5,
    Floor7=5,
)

In [None]:
# This function returns the continuous frames of data (runs of consecutive rows exactly one minute apart), keeping only the frames that contain at least 60 rows
def continuous_section_frame(data):
    """Split ``data`` into runs of rows spaced exactly one minute apart.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose index is named ``Date`` and holds timestamps; the index is
        temporarily turned into a ``Date`` column to compute row-to-row gaps.

    Returns
    -------
    list of pd.DataFrame
        The runs, in their original order, that contain at least 60 rows.
        Each run is indexed by ``Date``. An empty input yields an empty list.
    """
    flat = data.reset_index()
    one_minute = np.timedelta64(1, 'm')
    # A row starts a new run when its gap to the previous row is not one minute.
    # The first row always qualifies because its gap is NaT.
    starts = np.flatnonzero((flat.Date.diff() != one_minute).to_numpy())
    indexed = flat.set_index('Date')

    if starts.size == 0:
        return []

    # Each run spans from its start position up to the next start (or the end of the frame).
    bounds = list(starts) + [len(indexed)]
    runs = (indexed.iloc[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:]))
    return [run for run in runs if len(run) >= 60]

def retrieve_dataframe(cont_sections):
    """Stack continuous sections into one frame, tagging rows with their section id.

    Parameters
    ----------
    cont_sections : list of pd.DataFrame
        Sections to stack, in the order they should appear in the result.

    Returns
    -------
    pd.DataFrame
        All rows of all sections, with an extra ``frame_id`` column set to
        ``"frame_<k>"`` where ``k`` is the position of the section in
        ``cont_sections``. An empty list yields an empty DataFrame.

    Notes
    -----
    The input sections are left untouched: the ``frame_id`` column is added to
    a copy (``assign``) rather than written into the caller's frames, which are
    typically slices of a larger frame.
    """
    if len(cont_sections) == 0:
        return pd.DataFrame()

    tagged = [
        section.assign(frame_id=f"frame_{idx}")
        for idx, section in enumerate(cont_sections)
    ]
    # Concatenate once instead of growing the result inside the loop.
    return pd.concat(tagged)

def get_zone_features(zone_nb, df_columns):
    """Return the column names that contain the zone tag ``zone_nb``.

    Matching is a plain substring test, so a tag such as ``"z1"`` also matches
    a column containing ``"z10"``. The order of ``df_columns`` is preserved.
    """
    selected = []
    for column in df_columns:
        if zone_nb in column:
            selected.append(column)
    return selected

In [None]:
from tqdm.notebook import trange, tqdm

# Generate one CSV per (input file, zone): the zone's complete rows, restricted to the
# continuous sections found by `continuous_section_frame` and tagged with the `frame_id`
# of the section each row belongs to.
output_dir = "../datasets/generated"
# `to_csv` does not create missing folders, so make sure the destination exists first.
os.makedirs(output_dir, exist_ok=True)

for filename in tqdm(os.listdir("../datasets")):
    # Only CSV files are processed, and the holidays file is excluded explicitly.
    if not filename.endswith(".csv") or filename == "Thailand_Holidays.csv":
        continue

    df = pd.read_csv(f"../datasets/{filename}", index_col=[0])
    # Parse the timestamp index once per file instead of once per zone.
    df.index = pd.to_datetime(df.index)

    floor_nb = filename.split(".")[0][-1]  # last character before the first "." in the file name
    date_name = filename[:4]  # first four characters of the file name
    name_floor = f"Floor{floor_nb}"
    nb_zones = set_zones[name_floor]

    for i in range(1, nb_zones + 1):
        zone_features = get_zone_features(f"z{i}", df.columns)
        # Build a new frame with the incomplete rows removed. Calling
        # dropna(inplace=True) on a column selection can raise SettingWithCopyWarning.
        df_zone = df[zone_features].dropna()
        cont_sections = continuous_section_frame(df_zone)
        if len(cont_sections) > 0:
            all_df = retrieve_dataframe(cont_sections)
            print(all_df.index[-1] - all_df.index[0], all_df.shape, f"{date_name}_z{i}_{name_floor}.csv")
            all_df.to_csv(f"{output_dir}/{date_name}_z{i}_{name_floor}.csv")