In [50]:
from pathlib import Path
import pickle
import zipfile

import numpy as np
import pandas as pd

In [51]:
# File names of the training archive and the stream-mapping CSV, both of
# which live in the same dataset folder.
dataset_dir = '../datasets/bts_site_b_train/'
dataset_zip = 'train.zip'
mapping_csv = 'mapper_TrainOnly.csv'

# Full paths used by the loading cells below.
dataset_path = Path(dataset_dir, dataset_zip)
mapping_path = Path(dataset_dir, mapping_csv)

In [52]:
# Load the mapping file; the CSV's first column is used as the row index.
mapping_df = pd.read_csv(mapping_path, index_col=0)

# Building B only (filter currently disabled)
# mapping_df = mapping_df[mapping_df['Building'] == 'B']

# Ignore streams not saved to file.
# The marker is matched as a literal substring (regex=False). na=True makes
# rows with a missing Filename count as "not saved", so they are dropped too,
# which is what the previous `== False` comparison did implicitly.
not_saved = mapping_df['Filename'].str.contains('FILE NOT SAVED', regex=False, na=True)
mapping_df = mapping_df[~not_saved]

mapping_df.head()

Unnamed: 0,Building,StreamID,Filename,strBrickLabel
0,A,9ba955fa_5960_4c9b_b73a_10156da7d083,trainAll_0.pkl,Operating_Mode_Status
2,A,8fd6e75b_88bc_4992_b420_77389969b3c4,trainAll_1.pkl,Mode_Command
3,A,8db6eaa9_bd6c_4f7e_aed0_a47e4e192a6c,trainAll_2.pkl,Active_Power_Sensor
4,A,b2338dec_110a_45cc_8358_1171aaef2c45,trainAll_3.pkl,System_Status
5,A,ec5ff874_0af2_49d8_a6a0_21ea3d077dc8,trainAll_4.pkl,Maintenance_Mode_Command


In [54]:
# Build a Filename -> (Building, StreamID) lookup once, instead of scanning
# mapping_df twice for every archive member. drop_duplicates keeps the first
# row per Filename, i.e. the same row the previous `.iloc[0]` lookup selected.
unique_files = mapping_df.drop_duplicates(subset='Filename')
stream_info = dict(zip(
    unique_files['Filename'],
    zip(unique_files['Building'], unique_files['StreamID']),
))

# One summary dict per pickled stream found in the archive.
streams = []

with zipfile.ZipFile(dataset_path, "r") as f:
    for name in f.namelist():
        # Skip directory entries and any non-pickle members.
        if not name.endswith('.pkl'):
            continue

        # NOTE(review): unpickling can execute arbitrary code; only run this
        # cell on an archive from a trusted source.
        data = pickle.loads(f.read(name))

        # Zip member names use '/' separators; keep only the base file name,
        # which is what the mapping's Filename column is compared against.
        filename = name.split('/')[-1]

        # Raises KeyError naming the file if it has no row in mapping_df
        # (previously an IndexError from `.iloc[0]` on an empty selection).
        building, stream_id = stream_info[filename]

        # The container types of data['t'] / data['v'] are not visible here,
        # so the builtin min/max of the original are kept unchanged.
        timestamps = data['t']
        values = data['v']

        streams.append({
            'filename': filename,
            'building': building,
            'stream_id': stream_id,
            'brick_class': data['y'],
            'data_points': len(timestamps),
            'min_time': min(timestamps),
            'max_time': max(timestamps),
            'min_value': min(values),
            'mean_value': np.mean(values),
            'median_value': np.median(values),
            'max_value': max(values),
        })

  data = pickle.loads(pkl_data)


In [55]:
# Turn the per-stream summary dicts into a table (one row per stream) and
# preview the first five rows.
summary_df = pd.DataFrame(data=streams)
summary_df.head(n=5)

Unnamed: 0,filename,building,stream_id,brick_class,data_points,min_time,max_time,min_value,mean_value,median_value,max_value
0,trainAll_2183.pkl,A,980d7c24_3626_47f0_9430_582c827ee507,Alarm,12939,2021-01-01 00:09:56.621,2021-03-31 22:51:21.721,0.0,0.0,0.0,0.0
1,trainAll_5834.pkl,A,3b9ab701_ec53_491f_a943_d82b3e90f130,Mode_Status,12902,2021-01-01 00:02:30.508,2021-03-31 22:54:07.521,0.0,0.203069,0.0,1.0
2,trainAll_6483.pkl,A,22fd94c6_f39f_49b1_b3af_36b4dffd90a9,Fan_Status,12904,2021-01-01 00:02:45.031,2021-03-31 22:54:18.431,0.0,0.288205,0.0,1.0
3,trainAll_7945.pkl,A,1e352003_b4c7_43f0_8f8b_099d44897fe6,Status,12893,2021-01-01 00:03:01.414,2021-03-31 22:54:35.893,0.0,0.581478,1.0,1.0
4,trainAll_4294.pkl,A,7260e1bc_3b09_4880_8c68_57b50b4f0f93,Alarm,12922,2021-01-01 00:00:18.577,2021-03-31 22:51:45.823,0.0,0.0,0.0,0.0


In [56]:
# Sort by building, then Brick class, then file name before exporting.
# The sorted frame is rebound rather than sorted with inplace=True, so the
# cell reads as a plain assignment and does not mutate the object in place.
summary_df = summary_df.sort_values(by=['building', 'brick_class', 'filename'])

# Write the summary to the current working directory without the index column.
summary_df.to_csv('bts_site_b_train_summary.csv', index=False)