In [691]:
from pathlib import Path
import brickschema
import numpy as np
import pandas as pd
import rdflib
from rdflib import Namespace
from rdflib.namespace import RDFS, SKOS, BRICK


In [None]:
# rdflib Namespace for Senaps terms; indexing it builds the full URIRef for a term.
SENAPS = Namespace("http://senaps.io/schema/1.0/senaps#")
# Displayed as a sanity check of the resulting URI.
SENAPS['stream_id']

In [None]:
# Display the URIRef of skos:definition — the predicate get_brick_definition reads.
SKOS['definition']

In [None]:
# Display the SKOS namespace object itself (exploratory).
SKOS

In [695]:
# All inputs live in one dataset directory, given relative to this notebook.
dataset_dir = '../../datasets/bts_site_b_train/'

# Zipped dataset; only the path is built here.
dataset_zip = 'train.zip'
dataset_path = Path(dataset_dir) / dataset_zip

# CSV mapping file (read later into mapping_df).
mapping_csv = 'mapper_TrainOnly.csv'
mapping_path = Path(dataset_dir) / mapping_csv

# Building model in Turtle format (alternative file kept for reference).
# building_ttl = 'Site_B_tim.ttl'
building_ttl = 'Site_B.ttl'
building_model = Path(dataset_dir) / building_ttl

# Brick ontology file stored alongside the dataset.
brick_ttl = 'Brick_v1.2.1.ttl'
brick_schema = Path(dataset_dir) / brick_ttl

In [696]:
# Parse the building model and the Brick ontology file that ships with the dataset.
g_building = brickschema.Graph().load_file(building_model)
g_brick = brickschema.Graph().load_file(brick_schema)
# Second reference ontology, used below to re-check each class against a newer Brick.
# NOTE(review): load_brick_nightly presumably downloads the nightly Brick build and
# therefore needs network access — confirm against the brickschema documentation.
# g_brick_latest = brickschema.Graph(load_brick=True)
g_brick_latest = brickschema.Graph(load_brick_nightly=True)

# ANALYSIS

In [697]:
def sparql_to_df(g, q, **kwargs):
    """Run a SPARQL query on ``g`` and return its distinct rows as a DataFrame.

    Parameters
    ----------
    g : object exposing ``query(q, **kwargs)``
        The graph to query.
    q : str
        SPARQL query text.
    **kwargs
        Forwarded unchanged to ``g.query`` (for example ``initBindings``).

    Returns
    -------
    pandas.DataFrame
        One row per distinct solution, with plain-string column names. When
        the result exposes its projected variables (``res.vars``), every
        projected variable gets a column, in SELECT order, even if no row
        binds it or the result is empty.
    """
    res = g.query(q, **kwargs)

    # Built from the bindings alone, the frame only gets columns for variables
    # bound in at least one row, so an OPTIONAL that never matches (or an empty
    # result) would silently drop columns that callers index by name.
    projected = getattr(res, 'vars', None)
    if projected:
        df = pd.DataFrame(res.bindings, columns=list(projected))
    else:
        df = pd.DataFrame(res.bindings)

    # Column labels may be variable objects rather than str; normalise them.
    df.columns = df.columns.map(str)

    # The original row labels are kept (no reset_index), as before.
    return df.drop_duplicates()

In [None]:
def get_brick_entities(g):
    """List every entity in ``g`` whose type lies in the ``brick:`` namespace.

    Each row carries the entity, its Brick class and, where present, its
    ``senaps:stream_id``, a unit IRI starting with ``unit:`` (``named_unit``)
    and a unit given as ``[ brick:value ... ]`` (``anonymous_unit``).
    The prefixes used in the query must be bound on ``g``.

    Returns the DataFrame produced by ``sparql_to_df``.
    """
    query = '''
    SELECT ?entity ?brick_class ?stream_id ?named_unit ?anonymous_unit WHERE {
        ?entity a ?brick_class .
        OPTIONAL { ?entity senaps:stream_id ?stream_id } .
        OPTIONAL { ?entity brick:hasUnit ?named_unit .
                    filter ( strstarts(str(?named_unit),str(unit:)) ) } .
        OPTIONAL { ?entity brick:hasUnit [ brick:value ?anonymous_unit ] } .
        filter ( strstarts(str(?brick_class),str(brick:)) ) .
    }
    '''
    entities = sparql_to_df(g, query)
    return entities

# Preview the result for the building model.
get_brick_entities(g_building)

In [699]:
# Working table for the rest of the analysis: one row per distinct query solution.
df = get_brick_entities(g_building)

In [700]:
def class_in_brick(cls, g):
    """Return True when ``g`` holds at least one triple with ``cls`` as subject.

    This is a membership test on the wildcard pattern ``(cls, None, None)``;
    it does not check that ``cls`` is actually declared as a class.
    """
    subject_pattern = (cls, None, None)
    return subject_pattern in g

In [None]:
# Flag each row by whether its class appears in the Brick schema shipped with the dataset.
df['class_in_provided_brick'] = df['brick_class'].apply(class_in_brick, args=(g_brick,))

provided_recognised = df[df['class_in_provided_brick'] == True]
provided_unrecognised = df[df['class_in_provided_brick'] == False]

print(len(provided_recognised), 'recognised by provided Brick schema:')
print(provided_recognised.head())
print(len(provided_unrecognised), 'not recognised by provided Brick schema:')
print(provided_unrecognised.head())

In [None]:
# Same check against the latest Brick graph. The printed labels now say "latest";
# they were copied from the previous cell and wrongly read "provided".
df['class_in_latest_brick'] = df['brick_class'].apply(class_in_brick, args=(g_brick_latest,))

latest_recognised = df[df['class_in_latest_brick'] == True]
latest_unrecognised = df[df['class_in_latest_brick'] == False]

print(len(latest_recognised), 'recognised by latest Brick schema:')
print(latest_recognised.head())
print(len(latest_unrecognised), 'not recognised by latest Brick schema:')
print(latest_unrecognised.head())

In [703]:
def get_brick_definition(cls, g, g_alt=None):
    """Return the ``skos:definition`` of ``cls``, or of its nearest defined ancestor.

    The lookup starts at ``cls`` and follows ``rdfs:subClassOf`` one step at a
    time in ``g`` until a definition is found. Where a class has several
    superclasses, only the one returned by ``g.value`` is followed.

    Parameters
    ----------
    cls : rdflib term or None
        Class to describe. ``None`` yields ``None``.
    g : rdflib-style graph
        Graph searched first.
    g_alt : rdflib-style graph, optional
        Graph searched from ``cls`` again when ``g`` has no definition.

    Returns
    -------
    The definition value, or ``None`` when neither graph provides one.
    """
    if cls is None:
        return None

    predicate = SKOS['definition']

    # Remember visited classes so a cyclic subClassOf chain cannot loop forever.
    visited = set()
    current = cls
    while current is not None and current not in visited:
        visited.add(current)
        definition = g.value(subject=current, predicate=predicate)
        if definition is not None:
            # Return what was found instead of querying the graph a second time.
            return definition
        current = g.value(subject=current, predicate=RDFS['subClassOf'])

    if g_alt is not None:
        return get_brick_definition(cls, g_alt)

    return None

In [None]:
# Attach a definition to each row's class, looked up in the dataset's Brick schema only.
df['brick_definition'] = df['brick_class'].apply(get_brick_definition, args=(g_brick,))
# Alternatives kept for reference: use the latest Brick, or fall back to it.
# df['brick_definition'] = df['brick_class'].apply(get_brick_definition, args=(g_brick_latest,))
# df['brick_definition'] = df['brick_class'].apply(get_brick_definition, args=(g_brick, g_brick_latest))
df.head()

In [705]:
# Guard against either unit column being absent from the query result.
for optional_unit_col in ('named_unit', 'anonymous_unit'):
    if optional_unit_col not in df.columns:
        df[optional_unit_col] = None

# One unit per row: take the named unit, else fall back to the anonymous one.
df = df.assign(unit=lambda x: x['named_unit'].combine_first(x['anonymous_unit']))

In [706]:
def unit_is_named(r):
    """Classify the unit of row ``r``.

    Returns ``None`` when ``r.unit`` is missing, otherwise ``True`` if
    ``r.named_unit`` is present and ``False`` if it is missing.
    """
    return None if pd.isna(r.unit) else not pd.isna(r.named_unit)

In [None]:
# Row-wise flag: None where there is no unit, otherwise whether the unit is a named one.
df['unit_is_named'] = df.apply(unit_is_named, axis=1)
df.head()

In [708]:
# df.assign(unit_is_named=lambda x: x['unit'].apply(lambda u: u is not None and u.startswith('unit:')))   

In [None]:
# Load the mapping file (its first column becomes the index)
mapping_df = pd.read_csv(mapping_path, index_col=0)

# Building B only
# mapping_df = mapping_df[mapping_df['Building'] == 'B']

# Ignore streams not saved to file
# NOTE(review): a missing Filename presumably also fails the `== False` test,
# so such rows are dropped as well — confirm this is intended.
file_not_saved = mapping_df['Filename'].str.contains('FILE NOT SAVED')
mapping_df = mapping_df[file_not_saved == False]

mapping_df.head()

In [None]:
# Peek at the stream id stored under row label 0 to see its value and type.
df['stream_id'][0]

In [711]:
def stream_exists_in_mapping(s, mapping_df):
    """Tell whether stream id ``s`` appears in ``mapping_df['StreamID']``.

    ``s`` is converted to ``str`` and stripped of surrounding whitespace
    before the lookup. Returns ``None`` when ``s`` is missing.
    """
    if pd.isna(s):
        return None
    known_ids = mapping_df['StreamID'].values
    candidate = str(s).strip()
    return candidate in known_ids

In [None]:
# Per row: None without a stream id, otherwise whether the id occurs in the mapping file.
df['stream_exists_in_mapping'] = df['stream_id'].apply(stream_exists_in_mapping, args=(mapping_df,))
df.head()

In [713]:
def brick_class_in_mapping(s, mapping_df):
    """Tell whether ``s`` (stringified and stripped) occurs in ``mapping_df['StreamID']``.

    Returns ``None`` when ``s`` is missing, otherwise a boolean.

    NOTE(review): despite its name this checks the ``StreamID`` column, exactly
    like ``stream_exists_in_mapping``; it looks like an unfinished draft.
    """
    if pd.isna(s):
        return None
    # The stray no-op expression `mapping_df['StreamID']` that stood here was removed.
    return str(s).strip() in mapping_df['StreamID'].values

In [None]:
# Attach the mapper's Brick label to each row with a left join on the stream id.
stream_labels = mapping_df[['StreamID', 'strBrickLabel']].rename(
    columns={'strBrickLabel': 'brick_class_in_mapping'}
)

df = (
    # Drop the output of a previous run so re-executing this cell cannot create
    # a second column with the same name.
    df.drop(columns=['brick_class_in_mapping'], errors='ignore')
    # Join key: stringified and stripped, the same normalisation that
    # stream_exists_in_mapping applies, so both checks agree on ids with stray whitespace.
    .assign(stream_id_str=lambda d: d['stream_id'].apply(lambda x: str(x).strip()))
    .merge(stream_labels, how='left', left_on='stream_id_str', right_on='StreamID')
    # The helper key columns are not needed after the join.
    .drop(columns=['stream_id_str', 'StreamID'])
)

df.head()

In [None]:
# Compare the model's class (URI fragment only) with the label from the mapping file.
model_class_fragment = df['brick_class'].apply(lambda x: str(x.fragment) if x is not None else None)
mapped_class = df['brick_class_in_mapping']

df['brick_class_is_consistent'] = np.where(
    pd.isna(mapped_class),                 # no mapping label for this row
    None,                                  # -> leave the flag empty
    model_class_fragment == mapped_class,  # otherwise: do the two names agree?
)

df.head()

In [716]:
def defrag_uri(s):
    """Shorten a URIRef to its local name; return anything else unchanged.

    A URIRef containing ``#`` becomes its fragment; otherwise one containing
    ``/`` becomes the text after the last slash. Values that are not URIRefs,
    and URIRefs with neither character, are returned as they are.
    """
    if not isinstance(s, rdflib.term.URIRef):
        return s
    if '#' in s:
        return s.fragment
    if '/' in s:
        return s.rsplit('/', 1)[-1]
    return s

In [717]:
# Shorten every URIRef in the table to its local name, column by column.
# This overwrites df in place: 'brick_class' and 'entity' stop being URIRefs from here on.
for col in df.columns:
    df[col] = df[col].apply(defrag_uri)

In [718]:
# Save the quality table (URIs already shortened above) without the index column.
df.to_csv('model_quality.csv', index=False)

---
# VISUALISATION

In [719]:
# import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

## Brick Entities in Building Model Recognised by Brick Schema

In [None]:
# Entities with their class and recognition flag, ordered for the tables below.
entity_in_provided_brick = (
    df[['brick_class', 'entity', 'class_in_provided_brick']]
    .sort_values(by=['class_in_provided_brick', 'brick_class', 'entity'])
)
entity_in_provided_brick.head()

In [None]:
# Number of entities per recognition flag, with the flag turned into a readable label.
to_plot = (
    entity_in_provided_brick[['brick_class', 'class_in_provided_brick']]
    .groupby('class_in_provided_brick')
    .count()
    .reset_index()
)
to_plot['class_in_provided_brick'] = to_plot['class_in_provided_brick'].apply(lambda x: 'Recognised' if x else 'Unrecognised')
to_plot.head()

In [None]:
# NOTE(review): this import sits mid-notebook; it belongs with the other plotly imports.
import plotly.express as px
# Pie chart of the per-flag counts; after the groupby().count() above,
# the 'brick_class' column holds the number of entities in each group.
fig = px.pie(to_plot, values='brick_class', names='class_in_provided_brick',
             title='Proportion of Brick Entities Recognised by Provided Brick Schema',
             labels={'class_in_provided_brick': 'Class', 'brick_class':'Number of Entities'})
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

In [None]:
# Un-aggregated version: one row per entity, with the flag replaced by a readable label.
to_plot = entity_in_provided_brick[['brick_class', 'class_in_provided_brick']].assign(
    class_in_provided_brick=lambda d: d['class_in_provided_brick'].apply(
        lambda x: 'Recognised' if x else 'Unrecognised'
    )
)
to_plot.head()

In [None]:
# Pie of recognised vs unrecognised entities, above a table listing every entity.
fig = make_subplots(
    rows=2,
    cols=1,
    vertical_spacing=0.03,
    specs=[
        [{"type": "pie"}],
        [{"type": "table"}],
    ],
)

# Count once; the index gives the slice labels and the values give the slice sizes.
recognition_counts = to_plot['class_in_provided_brick'].value_counts()
labels = recognition_counts.index
values = recognition_counts.values

recognition_pie = go.Pie(
    labels=labels,
    values=values,
    textposition='inside',
    textinfo='percent+label',
    name="",
)
fig.add_trace(recognition_pie, row=1, col=1)

# go.Table takes one list per column.
entity_table = go.Table(
    header=dict(
        values=["Brick Class", "Entity ID", "Class Recognised"],
        font=dict(size=10),
        align="left",
    ),
    cells=dict(
        values=[entity_in_provided_brick[k].tolist() for k in entity_in_provided_brick.columns],
        align="left",
    ),
)
fig.add_trace(entity_table, row=2, col=1)

fig.update_layout(
    height=800,
    showlegend=False,
    title_text="Brick Entities in Building Model Recognised by Brick Schema",
)

fig.show()


In [None]:
# Pie on top, then separate tables for recognised and unrecognised entities.
fig = make_subplots(
    rows=3,
    cols=1,
    vertical_spacing=0.03,
    subplot_titles=['Proportion of Entities', 'Recognised Entities', 'Unrecognised Entities'],
    specs=[
        [{"type": "pie"}],
        [{"type": "table"}],
        [{"type": "table"}],
    ],
)

# Count once; the index gives the slice labels and the values give the slice sizes.
recognition_counts = to_plot['class_in_provided_brick'].value_counts()
labels = recognition_counts.index
values = recognition_counts.values

recognition_pie = go.Pie(
    labels=labels,
    values=values,
    textposition='inside',
    textinfo='percent+label',
    name="",
)
fig.add_trace(recognition_pie, row=1, col=1)

# Split the entity list by recognition flag.
entity_in_provided_brick_true = entity_in_provided_brick[entity_in_provided_brick['class_in_provided_brick'] == True]
entity_in_provided_brick_false = entity_in_provided_brick[entity_in_provided_brick['class_in_provided_brick'] == False]

# Both tables show only the first two columns (class and entity).
recognised_table = go.Table(
    header=dict(
        values=["Brick Class", "Entity ID"],
        font=dict(size=10),
        align="left",
    ),
    cells=dict(
        values=[entity_in_provided_brick_true[k].tolist() for k in entity_in_provided_brick_true.columns[:2]],
        align="left",
    ),
)
fig.add_trace(recognised_table, row=2, col=1)

unrecognised_table = go.Table(
    header=dict(
        values=["Brick Class", "Entity ID"],
        font=dict(size=10),
        align="left",
    ),
    cells=dict(
        values=[entity_in_provided_brick_false[k].tolist() for k in entity_in_provided_brick_false.columns[:2]],
        align="left",
    ),
)
fig.add_trace(unrecognised_table, row=3, col=1)

fig.update_layout(
    height=1200,
    showlegend=False,
    title_text="Brick Entities in Building Model Recognised by Brick Schema",
)

fig.show()

In [None]:
# Final layout: two pies side by side, then full-width tables of unrecognised
# and recognised entities.
fig = make_subplots(
    rows=3,
    cols=2,
    vertical_spacing=0.05,
    subplot_titles=['Proportion of Entities', 'Unrecognised by Class', 'Unrecognised Entities', 'Recognised Entities'],
    specs=[
        [{"type": "pie"}, {"type": "pie"}],
        [{"type": "table", 'colspan': 2}, None],
        [{"type": "table", 'colspan': 2}, None],
    ],
)

# Left pie: recognised vs unrecognised entities.
recognition_counts = to_plot['class_in_provided_brick'].value_counts()
labels = recognition_counts.index
values = recognition_counts.values

fig.add_trace(
    go.Pie(
        labels=labels,
        values=values,
        textposition='inside',
        textinfo='percent+label',
        name="",
    ),
    row=1, col=1,
)

# Split the entity list by recognition flag.
entity_in_provided_brick_true = entity_in_provided_brick[entity_in_provided_brick['class_in_provided_brick'] == True]
entity_in_provided_brick_false = entity_in_provided_brick[entity_in_provided_brick['class_in_provided_brick'] == False]

# Right pie: the unrecognised entities broken down by class.
unrecognised_class_counts = entity_in_provided_brick_false['brick_class'].value_counts()
labels = unrecognised_class_counts.index
values = unrecognised_class_counts.values

fig.add_trace(
    go.Pie(
        labels=labels,
        values=values,
        textposition='inside',
        textinfo='percent+label',
        name="",
    ),
    row=1, col=2,
)

# Both tables show only the first two columns (class and entity).
unrecognised_table = go.Table(
    header=dict(
        values=["Brick Class", "Entity ID"],
        font=dict(size=10),
        align="left",
    ),
    cells=dict(
        values=[entity_in_provided_brick_false[k].tolist() for k in entity_in_provided_brick_false.columns[:2]],
        align="left",
    ),
)
fig.add_trace(unrecognised_table, row=2, col=1)

recognised_table = go.Table(
    header=dict(
        values=["Brick Class", "Entity ID"],
        font=dict(size=10),
        align="left",
    ),
    cells=dict(
        values=[entity_in_provided_brick_true[k].tolist() for k in entity_in_provided_brick_true.columns[:2]],
        align="left",
    ),
)
fig.add_trace(recognised_table, row=3, col=1)

fig.update_layout(
    height=1200,
    showlegend=False,
    title_text="Brick Entities in Building Model Recognised by Brick Schema",
    title_x=0.5,
)

fig.show()

In [727]:
# Export whichever figure `fig` currently refers to as an HTML file.
fig.write_html("brick_entities_recognised_by_schema.html")

## Data Sources in Building Model with Associated Units

In [None]:
# Rows that have a stream id, ordered for display, labelled by whether any unit is present.
stream_with_units = (
    df[['brick_class', 'stream_id', 'unit', 'unit_is_named']]
    .dropna(subset=['stream_id'])
    .sort_values(by=['brick_class', 'stream_id'])
    .assign(has_unit=lambda d: d['unit'].apply(lambda x: 'No units' if pd.isna(x) else 'Units'))
)
stream_with_units.head()

In [None]:
# Streams that have no unit at all, in display order.
unit_is_missing = pd.isna(stream_with_units['unit'])
streams_without_units = stream_with_units[unit_is_missing].sort_values(by=['brick_class', 'stream_id'])
streams_without_units.head()

In [None]:
# Pie inputs: number of streams with and without units (counted once).
has_unit_counts = stream_with_units['has_unit'].value_counts()
number_with_units_labels = has_unit_counts.index
number_with_units_values = has_unit_counts.values
print(number_with_units_labels)
print(number_with_units_values)

In [None]:
# Streams that do have a unit, labelled by whether that unit is a named (machine readable) one.
stream_with_named_units = stream_with_units.dropna(subset=['unit']).copy()
# Derive the label from this frame's own rows. It was previously computed on the
# unfiltered stream_with_units and only matched up through implicit index alignment.
stream_with_named_units['has_named_unit'] = stream_with_named_units['unit_is_named'].apply(
    lambda x: 'Machine readable' if x else 'Not machine readable'
)
stream_with_named_units.head()

In [None]:
# Streams whose unit exists but is not a named one (unit_is_named is exactly False).
unit_not_named = stream_with_units['unit_is_named'] == False
streams_with_anonymous_units = stream_with_units[unit_not_named]
streams_with_anonymous_units.head()

In [None]:
# Pie inputs: machine readable vs non machine readable units (counted once).
named_unit_counts = stream_with_named_units['has_named_unit'].value_counts()
number_with_named_units_labels = named_unit_counts.index
number_with_named_units_values = named_unit_counts.values
print(number_with_named_units_labels)
print(number_with_named_units_values)

In [None]:
# Two pies (streams with units; machine readable units) above two full-width tables
# (streams without units; streams with non machine readable units).
fig = make_subplots(
    rows=3, cols=2,
    vertical_spacing=0.05,
    subplot_titles = ['Proportion of Streams with Units', 'Units that are Machine Readable', 'Streams without Units', 'Streams with Non-Machine Readable Units'],
    specs=[[{"type": "pie"}, {"type": "pie"}],
           [{"type": "table", 'colspan': 2}, None],
           [{"type": "table", 'colspan': 2}, None]]
)

fig.add_trace(
    go.Pie(
        labels=number_with_units_labels,
        values=number_with_units_values,
        textposition='inside',
        textinfo='percent+label',
        name="",
    ),
    row=1, col=1
)

# The recomputation of entity_in_provided_brick_true/false that stood here was a
# copy-paste leftover from the previous figure; this figure never uses it.

fig.add_trace(
    go.Pie(
        labels=number_with_named_units_labels,
        values=number_with_named_units_values,
        textposition='inside',
        textinfo='percent+label',
        name="",
    ),
    row=1, col=2
)

# Table of streams without units: first two columns (class and stream id).
fig.add_trace(
    go.Table(
        header=dict(
            values=["Brick Class", "Stream ID"],
            font=dict(size=10),
            align="left"
        ),
        cells=dict(
            values=[streams_without_units[k].tolist() for k in streams_without_units.columns[:2]],
            align = "left")
    ),
    row=2, col=1,
)

# Table of streams with non-named units: first three columns (class, stream id, unit).
fig.add_trace(
    go.Table(
        header=dict(
            values=["Brick Class", "Stream ID", "Units"],
            font=dict(size=10),
            align="left"
        ),
        cells=dict(
            values=[streams_with_anonymous_units[k].tolist() for k in streams_with_anonymous_units.columns[:3]],
            align = "left")
    ),
    row=3, col=1
)

fig.update_layout(
    height=1200,
    showlegend=False,
    title_text="Data Sources in Building Model with Associated Units",
    title_x=0.5,
)

fig.show()

In [735]:
# Export whichever figure `fig` currently refers to as an HTML file.
fig.write_html("data_sources_with_associated_units.html")

## Data Sources in Building Model without Timeseries Data

In [None]:
# Rows that have a stream id, labelled by whether that id occurs in the mapping file.
have_data = (
    df[['brick_class', 'stream_id', 'stream_exists_in_mapping']]
    .dropna(subset=['stream_id'])
    .sort_values(by=['brick_class', 'stream_id'])
    .assign(has_data=lambda d: d['stream_exists_in_mapping'].apply(lambda x: 'Data' if x else 'No data'))
)
have_data.head()

In [None]:
# Pie inputs: number of streams with and without data (counted once).
has_data_counts = have_data['has_data'].value_counts()
number_with_data_labels = has_data_counts.index
number_with_data_values = has_data_counts.values
print(number_with_data_labels)
print(number_with_data_values)

In [None]:
# Per-class count of streams that are absent from the mapping file.
missing_data_by_class = (
    have_data[have_data['stream_exists_in_mapping'] == False]
    .groupby('brick_class')
    .count()
)
missing_data_by_class.head()

In [None]:
# Pie inputs: streams absent from the mapping file, broken down by class (counted once).
missing_class_counts = have_data[have_data['stream_exists_in_mapping'] == False]['brick_class'].value_counts()
missing_data_by_class_labels = missing_class_counts.index
missing_data_by_class_values = missing_class_counts.values
print(missing_data_by_class_labels)
print(missing_data_by_class_values)

In [740]:
# NOTE: the suffix mirrors stream_exists_in_mapping, not "missing":
# *_true holds streams that DO occur in the mapping file, *_false holds those that do not.
missing_data_true = have_data[have_data['stream_exists_in_mapping'] == True]
missing_data_false = have_data[have_data['stream_exists_in_mapping'] == False]

In [None]:
# Two pies (share of streams with data; missing streams by class) above two
# full-width tables (streams missing from the mapping; streams present in it).
fig = make_subplots(
    rows=3,
    cols=2,
    vertical_spacing=0.05,
    subplot_titles=['Proportion of Data Sources', 'Missing by Class', 'Data Sources with Missing Timeseries Data', 'Data Sources with Available Timeseries Data'],
    specs=[
        [{"type": "pie"}, {"type": "pie"}],
        [{"type": "table", 'colspan': 2}, None],
        [{"type": "table", 'colspan': 2}, None],
    ],
)

data_share_pie = go.Pie(
    labels=number_with_data_labels,
    values=number_with_data_values,
    textposition='inside',
    textinfo='percent+label',
    name="",
)
fig.add_trace(data_share_pie, row=1, col=1)

missing_by_class_pie = go.Pie(
    labels=missing_data_by_class_labels,
    values=missing_data_by_class_values,
    textposition='inside',
    textinfo='percent+label',
    name="",
)
fig.add_trace(missing_by_class_pie, row=1, col=2)

# Both tables show only the first two columns (class and stream id).
missing_table = go.Table(
    header=dict(
        values=["Brick Class", "Stream ID"],
        font=dict(size=10),
        align="left",
    ),
    cells=dict(
        values=[missing_data_false[k].tolist() for k in missing_data_false.columns[:2]],
        align="left",
    ),
)
fig.add_trace(missing_table, row=2, col=1)

available_table = go.Table(
    header=dict(
        values=["Brick Class", "Stream ID"],
        font=dict(size=10),
        align="left",
    ),
    cells=dict(
        values=[missing_data_true[k].tolist() for k in missing_data_true.columns[:2]],
        align="left",
    ),
)
fig.add_trace(available_table, row=3, col=1)

fig.update_layout(
    height=1200,
    showlegend=False,
    title_text="Data Sources in Building Model without Timeseries Data",
    title_x=0.5,
)

fig.show()

In [742]:
# Export whichever figure `fig` currently refers to as an HTML file.
fig.write_html("data_sources_without_data.html")

## Data Sources with Inconsistent Brick Class between Model and Mapper

In [None]:
# Rows that have a mapper label, labelled by whether the model's class agrees with it.
class_consistency = (
    df[['brick_class', 'brick_class_in_mapping', 'entity', 'brick_class_is_consistent']]
    .dropna(subset=['brick_class_in_mapping'])
    .sort_values(by=['brick_class', 'brick_class_in_mapping', 'entity'])
    .assign(consistency=lambda d: d['brick_class_is_consistent'].apply(lambda x: 'Consistent' if x else 'Inconsistent'))
)
class_consistency.head()

In [None]:
# Pie inputs: consistent vs inconsistent rows (counted once).
consistency_counts = class_consistency['consistency'].value_counts()
number_consistency_labels = consistency_counts.index
number_consistency_values = consistency_counts.values
print(number_consistency_labels)
print(number_consistency_values)

In [None]:
# Keep only the rows where model and mapper disagree (flag is exactly False).
is_inconsistent = class_consistency['brick_class_is_consistent'] == False
inconsistent_classes = class_consistency[is_inconsistent].copy()
inconsistent_classes.head()

In [None]:
# Pie inputs: inconsistent rows broken down by the model's class (counted once).
inconsistent_class_counts = inconsistent_classes['brick_class'].value_counts()
consistency_by_class_labels = inconsistent_class_counts.index
consistency_by_class_values = inconsistent_class_counts.values
print(consistency_by_class_labels)
print(consistency_by_class_values)

In [None]:
# Two pies (consistent vs inconsistent; inconsistent by class) above one
# full-width table listing the inconsistent rows.
fig = make_subplots(
    rows=2,
    cols=2,
    vertical_spacing=0.05,
    subplot_titles=['Proportion of Data Sources', 'Inconsistent by Class', 'Data Sources with Inconsistent Brick Class'],
    specs=[
        [{"type": "pie"}, {"type": "pie"}],
        [{"type": "table", 'colspan': 2}, None],
    ],
)

consistency_pie = go.Pie(
    labels=number_consistency_labels,
    values=number_consistency_values,
    textposition='inside',
    textinfo='percent+label',
    name="",
)
fig.add_trace(consistency_pie, row=1, col=1)

inconsistent_by_class_pie = go.Pie(
    labels=consistency_by_class_labels,
    values=consistency_by_class_values,
    textposition='inside',
    textinfo='percent+label',
    name="",
)
fig.add_trace(inconsistent_by_class_pie, row=1, col=2)

# First three columns: class in model, class in mapper, entity.
inconsistent_table = go.Table(
    header=dict(
        values=["Brick Class in Model", "Brick Class in Mapper", "Entity ID"],
        font=dict(size=10),
        align="left",
    ),
    cells=dict(
        values=[inconsistent_classes[k].tolist() for k in inconsistent_classes.columns[:3]],
        align="left",
    ),
)
fig.add_trace(inconsistent_table, row=2, col=1)

fig.update_layout(
    height=800,
    showlegend=False,
    title_text="Data Sources with Inconsistent Brick Class between Model and Mapper",
    title_x=0.5,
)

fig.show()

In [748]:
# Export whichever figure `fig` currently refers to as an HTML file.
fig.write_html("data_sources_with_inconsistent_class.html")

---

# SCRATCH

In [None]:
# Scratch: subjects whose brick:hasUnit object is a node carrying a brick:value
# (?a = subject, ?b = that value).
q = '''
SELECT ?a ?b
WHERE {
    ?a brick:hasUnit [ brick:value ?b ] .
}
'''
sparql_to_df(g_building, q)

In [None]:
# Scratch: subjects whose unit ?b has an rdf:type that is `unit:` itself or one of
# its subclasses via rdfs:subClassOf* (the earlier "blank node units" comment here
# was copied from the previous cell).
# NOTE(review): `unit:` with no local name is the bare namespace IRI, so this
# presumably matches nothing — confirm what was intended.
q = '''
SELECT ?a ?b
WHERE {
    ?a brick:hasUnit ?b .
    ?b rdf:type/rdfs:subClassOf* unit: .
}
'''
sparql_to_df(g_building, q)

In [None]:
# Scratch: subjects whose unit IRI starts with the `unit:` namespace (named units).
q = '''
SELECT ?a ?b
WHERE {
    ?a brick:hasUnit ?b .
    filter ( strstarts(str(?b),str(unit:)) ) .
}
'''
sparql_to_df(g_building, q)

In [None]:
# Scratch: list the prefix bindings of the building graph; the SPARQL above relies on
# the brick:, unit: and senaps: prefixes being bound.
for ns_prefix, namespace in g_building.namespaces():
    print(f'{ns_prefix}: {namespace}')

In [245]:
# Scratch: write the current table under a second file name, without the index column.
# NOTE(review): the execution count (245) predates the analysis cells above, so this
# is presumably a leftover from an earlier session.
df.to_csv('model_quality_defrag.csv', index=False)

In [None]:
def get_anonymous_units(s, g):
    """Return the ``brick:value`` units attached to entity ``s`` via ``brick:hasUnit``.

    ``s`` is pre-bound to ``?entity`` through ``initBindings``. The result is
    the DataFrame from ``sparql_to_df``, one row per unit found.
    """
    query = '''
    SELECT ?units
    WHERE {
        ?entity brick:hasUnit [ brick:value ?units ] .
    }
    '''
    units = sparql_to_df(g, query, initBindings={'entity': s})
    return units

# Scratch: each cell of the new column holds the whole DataFrame returned for that entity.
# NOTE(review): if the URI-shortening cell has run, df['entity'] holds plain strings
# rather than URIRefs, so the binding presumably matches nothing — confirm.
df['anonymous_units'] = df['entity'].apply(get_anonymous_units, args=(g_building,))

df.head()

In [None]:
def get_named_units(s, g):
    """Return every ``brick:hasUnit`` object attached to entity ``s``.

    ``s`` is pre-bound to ``?entity`` through ``initBindings``. The namespace
    filter inside the query is commented out, so the result is not limited to
    IRIs in the ``unit:`` namespace. The result is the DataFrame from
    ``sparql_to_df``.
    """
    query = '''
    SELECT ?named_units
    WHERE {
        ?entity brick:hasUnit ?named_units .
        # filter ( strstarts(str(?b),str(unit:)) ) .
    }
    '''
    units = sparql_to_df(g, query, initBindings={'entity': s})
    return units

# Scratch: each cell of the new column holds the whole DataFrame returned for that entity.
df['named_units'] = df['entity'].apply(get_named_units, args=(g_building,))

df.head()

In [None]:
# Scratch: for each row, does the model's stream id occur in the mapping file?
# The missing-value test is now made on the raw value. It used to be
# pd.isna(str(x).strip()), which is never true because str() always returns a string,
# so rows without a stream id were reported as False instead of <NA>.
# Ids are stringified and stripped before the lookup, as stream_exists_in_mapping does.
known_stream_ids = set(mapping_df['StreamID'])
df['stream_exists_in_mapping'] = [
    pd.NA if pd.isna(stream_id) else str(stream_id).strip() in known_stream_ids
    for stream_id in df['stream_id']
]

# Show the rows that have no stream id at all.
df[pd.isna(df['stream_id'])]