In [10]:
from pathlib import Path
import pickle
import zipfile

import numpy as np
import pandas as pd

import rdflib
from rdflib import Graph
from rdflib.namespace import BRICK

In [11]:
# Every input file lives side by side in this one directory.
dataset_dir = '../datasets/bts_site_b_train/'

# Bare file names, declared together so they are easy to swap out.
dataset_zip = 'train.zip'
mapping_csv = 'mapper_TrainOnly.csv'
building_ttl = 'Site_B.ttl'
brick_ttl = 'Brick_v1.2.1.ttl'

# Resolve each file name against the dataset directory in one pass.
dataset_path, mapping_path, building_model, brick_schema = (
    Path(dataset_dir, file_name)
    for file_name in (dataset_zip, mapping_csv, building_ttl, brick_ttl)
)

In [12]:
# Read the stream mapping table; the first CSV column becomes the index.
mapping_df = pd.read_csv(mapping_path, index_col=0)

# Optional restriction to Building B (currently disabled):
# mapping_df = mapping_df[mapping_df['Building'] == 'B']

# Keep only rows whose stream was actually written to a file. Rows with a
# missing Filename are treated as "not saved" and dropped as well.
is_not_saved = mapping_df['Filename'].str.contains('FILE NOT SAVED', na=True)
mapping_df = mapping_df[~is_not_saved]

mapping_df.head()

Unnamed: 0,Building,StreamID,Filename,strBrickLabel
0,A,9ba955fa_5960_4c9b_b73a_10156da7d083,trainAll_0.pkl,Operating_Mode_Status
2,A,8fd6e75b_88bc_4992_b420_77389969b3c4,trainAll_1.pkl,Mode_Command
3,A,8db6eaa9_bd6c_4f7e_aed0_a47e4e192a6c,trainAll_2.pkl,Active_Power_Sensor
4,A,b2338dec_110a_45cc_8358_1171aaef2c45,trainAll_3.pkl,System_Status
5,A,ec5ff874_0af2_49d8_a6a0_21ea3d077dc8,trainAll_4.pkl,Maintenance_Mode_Command


In [13]:
# Build a filename -> mapping-row lookup once, instead of re-scanning
# mapping_df with a boolean mask three times for every archive member.
# Keeping the first row per filename matches the `.iloc[0]` selection that
# the per-file mask lookups performed.
mapping_by_file = (
    mapping_df
    .drop_duplicates(subset='Filename', keep='first')
    .set_index('Filename')
    .to_dict(orient='index')
)

# One summary record per pickled stream found in the archive.
streams = []

with zipfile.ZipFile(dataset_path, "r") as archive:
    for name in archive.namelist():
        if not name.endswith('.pkl'):
            continue

        # Archive members may be nested in a folder; the mapping table is
        # keyed by the bare file name.
        filename = name.split('/')[-1]

        # Fail before unpickling, with a message that names the culprit.
        mapping_row = mapping_by_file.get(filename)
        if mapping_row is None:
            raise KeyError(
                f'Archive member {name!r} has no matching Filename row in the mapping table'
            )

        # NOTE(security): pickle.loads can execute arbitrary code. Only run
        # this on archives from a trusted source.
        data = pickle.loads(archive.read(name))

        timestamps = data['t']
        values = data['v']

        streams.append({
            'filename': filename,
            'building': mapping_row['Building'],
            'stream_id': mapping_row['StreamID'],
            # Label stored inside the pickle vs. label from the mapping CSV.
            'brick_class_pkl': data['y'],
            'brick_class_csv': mapping_row['strBrickLabel'],
            'data_points': len(timestamps),
            'min_time': min(timestamps),
            'max_time': max(timestamps),
            'min_value': min(values),
            'mean_value': np.mean(values),
            'median_value': np.median(values),
            'max_value': max(values),
        })

  data = pickle.loads(pkl_data)


In [14]:
# Turn the per-stream records into a table: one row per stream, one column
# per record key.
summary_df = pd.DataFrame(data=streams)
summary_df.head()

Unnamed: 0,filename,building,stream_id,brick_class_pkl,brick_class_csv,data_points,min_time,max_time,min_value,mean_value,median_value,max_value
0,trainAll_2183.pkl,A,980d7c24_3626_47f0_9430_582c827ee507,Alarm,Alarm,12939,2021-01-01 00:09:56.621,2021-03-31 22:51:21.721,0.0,0.0,0.0,0.0
1,trainAll_5834.pkl,A,3b9ab701_ec53_491f_a943_d82b3e90f130,Mode_Status,Mode_Status,12902,2021-01-01 00:02:30.508,2021-03-31 22:54:07.521,0.0,0.203069,0.0,1.0
2,trainAll_6483.pkl,A,22fd94c6_f39f_49b1_b3af_36b4dffd90a9,Fan_Status,Fan_Status,12904,2021-01-01 00:02:45.031,2021-03-31 22:54:18.431,0.0,0.288205,0.0,1.0
3,trainAll_7945.pkl,A,1e352003_b4c7_43f0_8f8b_099d44897fe6,Status,Status,12893,2021-01-01 00:03:01.414,2021-03-31 22:54:35.893,0.0,0.581478,1.0,1.0
4,trainAll_4294.pkl,A,7260e1bc_3b09_4880_8c68_57b50b4f0f93,Alarm,Alarm,12922,2021-01-01 00:00:18.577,2021-03-31 22:51:45.823,0.0,0.0,0.0,0.0


In [15]:
# Parse the building model (Turtle syntax) into an in-memory RDF graph.
g = Graph().parse(source=building_model, format='turtle')

In [28]:
def get_brick_class(stream_id, graph):
    """Return the Brick class name recorded for a stream in an RDF graph.

    The lookup has two steps: find the subject whose ``senaps#stream_id``
    equals ``stream_id``, then read that subject's ``rdf:type``.

    Args:
        stream_id: Stream identifier, matched as an RDF literal.
        graph: Graph to query. Every lookup goes through this argument, not
            through any module-level graph.

    Returns:
        The type URI with the Brick namespace prefix removed, or ``None``
        when the stream is not in the graph or has no ``rdf:type``.
    """
    stream_id_predicate = rdflib.term.URIRef('http://senaps.io/schema/1.0/senaps#stream_id')
    stream_literal = rdflib.term.Literal(stream_id)
    subject = graph.value(predicate=stream_id_predicate, object=stream_literal, any=True)

    # Unknown stream: nothing to look up a type for.
    if subject is None:
        return None

    type_predicate = rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')
    brick_class = graph.value(subject, type_predicate)

    if brick_class is None:
        return None

    return brick_class.replace('https://brickschema.org/schema/Brick#', '')

In [29]:
# Look up each stream's Brick class in the building model graph `g`.
brick_classes_from_ttl = summary_df['stream_id'].apply(
    lambda stream_id: get_brick_class(stream_id, g)
)
summary_df['brick_class_ttl'] = brick_classes_from_ttl
summary_df.head()

Unnamed: 0,filename,building,stream_id,brick_class_pkl,brick_class_ttl,brick_class_csv,data_points,min_time,max_time,min_value,mean_value,median_value,max_value,brick_definition
8302,trainAll_0.pkl,A,9ba955fa_5960_4c9b_b73a_10156da7d083,Operating_Mode_Status,,Operating_Mode_Status,12945,2021-01-01 00:09:44.728,2021-03-31 22:51:10.692,0.0,0.999382,1.0,1.0,Measures the temperature of water
8453,trainAll_1.pkl,A,8fd6e75b_88bc_4992_b420_77389969b3c4,Mode_Command,,Mode_Command,12924,2021-01-01 00:00:33.778,2021-03-31 22:52:05.233,0.0,0.0,0.0,0.0,Measures the temperature of water
7767,trainAll_10.pkl,A,4c7f5f97_30d2_40e8_baf9_cba39f01b504,Alarm,,Alarm,12944,2021-01-01 00:09:36.428,2021-03-31 22:50:59.736,0.0,0.0,0.0,0.0,Measures the temperature of water
5986,trainAll_100.pkl,A,5696de06_220a_488c_a454_c8afb26138a1,Electrical_Power_Sensor,,Electrical_Power_Sensor,12952,2021-01-01 00:09:41.171,2021-03-31 22:51:05.775,0.0,4085.557674,4088.0,4205.0,Measures the temperature of water
3461,trainAll_1000.pkl,A,a705c5f2_3f26_4251_99aa_45023a69be97,Alarm,,Alarm,12925,2021-01-01 00:00:35.556,2021-03-31 22:52:07.245,0.0,0.0,0.0,0.0,Measures the temperature of water


In [30]:
# Parse the Brick schema file into its own graph. No format is passed, so
# rdflib chooses the parser itself.
b = Graph().parse(source=brick_schema)

In [33]:
def get_brick_definition(brick_class, graph):
    """Look up the ``skos:definition`` of a Brick class.

    Args:
        brick_class: Class name without the namespace prefix, or ``None``.
        graph: Graph queried for the definition.

    Returns:
        The value stored under ``skos:definition`` for ``BRICK[brick_class]``,
        or ``None`` when ``brick_class`` is ``None`` or nothing is stored.
    """
    if brick_class is not None:
        definition_predicate = rdflib.term.URIRef(
            'http://www.w3.org/2004/02/skos/core#definition'
        )
        return graph.value(BRICK[brick_class], definition_predicate)
    return None

In [34]:
# Attach the definition found in the Brick schema graph `b` for each class
# that was resolved from the building model.
brick_definitions = summary_df['brick_class_ttl'].apply(
    lambda brick_class: get_brick_definition(brick_class, b)
)
summary_df['brick_definition'] = brick_definitions
summary_df.head()

Unnamed: 0,filename,building,stream_id,brick_class_pkl,brick_class_ttl,brick_class_csv,data_points,min_time,max_time,min_value,mean_value,median_value,max_value,brick_definition
8302,trainAll_0.pkl,A,9ba955fa_5960_4c9b_b73a_10156da7d083,Operating_Mode_Status,,Operating_Mode_Status,12945,2021-01-01 00:09:44.728,2021-03-31 22:51:10.692,0.0,0.999382,1.0,1.0,
8453,trainAll_1.pkl,A,8fd6e75b_88bc_4992_b420_77389969b3c4,Mode_Command,,Mode_Command,12924,2021-01-01 00:00:33.778,2021-03-31 22:52:05.233,0.0,0.0,0.0,0.0,
7767,trainAll_10.pkl,A,4c7f5f97_30d2_40e8_baf9_cba39f01b504,Alarm,,Alarm,12944,2021-01-01 00:09:36.428,2021-03-31 22:50:59.736,0.0,0.0,0.0,0.0,
5986,trainAll_100.pkl,A,5696de06_220a_488c_a454_c8afb26138a1,Electrical_Power_Sensor,,Electrical_Power_Sensor,12952,2021-01-01 00:09:41.171,2021-03-31 22:51:05.775,0.0,4085.557674,4088.0,4205.0,
3461,trainAll_1000.pkl,A,a705c5f2_3f26_4251_99aa_45023a69be97,Alarm,,Alarm,12925,2021-01-01 00:00:35.556,2021-03-31 22:52:07.245,0.0,0.0,0.0,0.0,


In [39]:
# Order rows by building, then by label, then by file name.
sort_keys = ['building', 'brick_class_pkl', 'brick_class_ttl', 'filename']
summary_df = summary_df.sort_values(by=sort_keys)

# Move the TTL-derived class to column position 5 so the label columns sit
# together.
ttl_column = summary_df.pop('brick_class_ttl')
summary_df.insert(5, 'brick_class_ttl', ttl_column)
summary_df.head()

Unnamed: 0,filename,building,stream_id,brick_class_pkl,brick_class_csv,brick_class_ttl,data_points,min_time,max_time,min_value,mean_value,median_value,max_value,brick_definition
3078,trainAll_1010.pkl,A,985db5d1_04e4_41cc_b954_68a6bc41f309,Active_Power_Sensor,Active_Power_Sensor,,10648,2021-01-01 00:13:50.687,2021-03-31 22:55:08.042,0.0,0.43248,0.4634,0.5265,
1444,trainAll_1062.pkl,A,b95f79bb_0df0_4395_a3aa_f49beda0d401,Active_Power_Sensor,Active_Power_Sensor,,7101,2021-01-01 00:14:07.051,2021-03-31 22:55:21.914,0.0,1.273096,1.1124,6.3675,
360,trainAll_1064.pkl,A,49cc1fa6_431c_40e0_b151_1adf32169dcd,Active_Power_Sensor,Active_Power_Sensor,,10671,2021-01-01 00:13:50.408,2021-03-31 22:55:07.875,0.0,0.714675,0.637,1.19,
205,trainAll_1071.pkl,A,4c15ddb0_d74e_4938_a774_99c92be49f23,Active_Power_Sensor,Active_Power_Sensor,,11659,2021-01-01 00:03:26.957,2021-03-31 22:55:02.216,0.0,1.460795,1.8466,2.1414,
7439,trainAll_1085.pkl,A,a7918557_0c82_4341_ae95_52024debeedb,Active_Power_Sensor,Active_Power_Sensor,,12912,2021-01-01 00:02:07.587,2021-03-31 22:53:47.224,0.0,19.191632,18.6096,36.5542,


In [40]:
# Write the summary next to the notebook, without the DataFrame index column.
summary_csv = 'bts_site_b_train_summary.csv'
summary_df.to_csv(summary_csv, index=False)