# Avantis Mapping Instructions

The following describes the process for mapping the current Avantis Classes to the new Classification system. 

## SQL connection
The first step connects to the Avantis SQL Server database and retrieves the list of entities.

In [29]:
import pyodbc
import pandas as pd
import os
from sqlalchemy.engine import URL
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Build the ODBC connection string from credentials held in environment
# variables so that no secret is stored in the notebook. A missing variable
# raises KeyError here, before any connection is attempted.
connect = f"DSN=Avantis6-P;UID={os.environ['Avantis_User']};PWD={os.environ['Avantis_Pass']}"
connection_url = URL.create("mssql+pyodbc", query={"odbc_connect": connect})

engine = create_engine(connection_url)

# One row per distinct entity: its id and description, the parent entity
# reached through MELINK -> MAINTENT2, the category (contname), the class
# (entclsid) and the linked SUSPEND key.
# The WHERE clause keeps rows with okcost = 1 whose SUSPEND audit timestamp is
# NULL or not later than 1900-01-01.
# NOTE(review): presumably this selects costable entities that are not
# suspended -- confirm against the Avantis schema.
SQL1 = """SELECT Distinct MAINTENT.id as [Entity_number],
[MAINTENT].[aenm] as [Description],
MAINTENT2.id as [Parent],
MAINTENT2.aenm as [Parent_Description],
[contname] as [Category],
[entclsid] as [Class],
SUSPEND.suspoi as [Suspended]

FROM  [AvantisP].[mc].[MAINTENT]
	  Left Join [AvantisP].[mc].[ENTCLASS] on MAINTENT.entclsref_oi = ENTCLASS.entcloi
	  Left Join [AvantisP].[mc].CATVAL ON MAINTENT.cat1_oi = CATVAL.cvoi
	  Left Join MC.SUSPEND ON MAINTENT.susp_oi = SUSPEND.suspoi
	  Left Join MC.MELINK ON MAINTENT.mtnoi = MELINK.mtnchild_oi
	  Left Join MC.MAINTENT MAINTENT2 ON MELINK.mtnparn_oi = MAINTENT2.mtnoi
WHERE
    (SUSPEND.audt_updted_dttm <= '1900-01-01 23:59:59' OR SUSPEND.audt_updted_dttm IS NULL) AND
    (MAINTENT.okcost = 1)
"""


df = pd.read_sql(SQL1,engine)
# Drop rows that have no entity number. notna() treats None and NaN alike and
# does not depend on how numpy compares an object array with None.
# .copy() makes the filtered frame independent of the query result, so the
# column assignments made in later cells do not raise SettingWithCopyWarning.
df = df[df['Entity_number'].notna()].copy()
df.head(10)

Unnamed: 0,Entity_number,Description,Parent,Parent_Description,Category,Class,Suspended
0,\tFCL_ELS_CBL_001L,"Electrical Power Line,4.16KV,From BUS-00B1-A t...",FCL_ELS_4.16KV_LINES,"Electrical Power Line,4.16KV",Air Handling Unit,Electrical Power Line,
1,\tFCL_ELS_CBL_002D,"Electrical Power Line,4.16KV,From BUS-00B2-A t...",FCL_ELS_4.16KV_LINES,"Electrical Power Line,4.16KV",Air Handling Unit,Electrical Power Line,
2,\tFCL_ELS_CBL_002F,"Electrical Power Line,4.16KV,From BUS-00B2 to ...",FCL_ELS_4.16KV_LINES,"Electrical Power Line,4.16KV",Air Handling Unit,Electrical Power Line,
3,\tFCL_ELS_CBL_002H,"Electrical Power Line,4.16KV,From BUS-00B1 to ...",FCL_ELS_4.16KV_LINES,"Electrical Power Line,4.16KV",Air Handling Unit,Electrical Power Line,
4,\tFCL_ELS_CBL_002L,"Electrical Power Line,4.16KV,From BUS-00B2-A t...",FCL_ELS_4.16KV_LINES,"Electrical Power Line,4.16KV",Air Handling Unit,Electrical Power Line,
5,\tFCL_ELS_CBL_00B2-A,"Electrical Power Line,4.16KV,From BUS-00B2 to ...",FCL_ELS_4.16KV_LINES,"Electrical Power Line,4.16KV",Air Handling Unit,Electrical Power Line,
6,\tFCL_ELS_CBL_0100,"Electrical Power Line,From BUS-00L1 to Transfo...",FCL_ELS_27.6KV_LINES,"Electrical Power Line,27.6KV",Air Handling Unit,Electrical Power Line,
7,\tFCL_ELS_CBL_0200,"Electrical Power Line,From BUS-00L2 to Transfo...",FCL_ELS_27.6KV_LINES,"Electrical Power Line,27.6KV",Air Handling Unit,Electrical Power Line,
8,\tFCL_ELS_CBL_02CF,"Electrical Power Line,4.16KV,From BUS-00B2-A t...",FCL_ELS_4.16KV_LINES,"Electrical Power Line,4.16KV",Air Handling Unit,Electrical Power Line,
9,17513,"Piping, New Wet Well",TAB-DEW-WEL,Centrate Wet Wells,Piping,Piping,


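Here is where we capture the 13040 tag code, if one is present in the entity number. For entity numbers made of four dash-separated segments (for example `TAB-RSP-FSL-1221D`), the tag code is the third segment (`FSL`).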

In [30]:
import re

# Read the 13040 code table and index its descriptions by code.
# If a code appears more than once, the last description read wins.
df_13040 = pd.read_excel('13040 Codes.xlsx', sheet_name='Append1')
dict_13040 = {
    code: description
    for code, description in zip(df_13040['CODE'], df_13040['DESCRIPTION'])
}

def extract_code(entity):
    """Return the third of exactly four dash-separated segments of ``entity``.

    ``entity`` is converted with ``str()`` first, so any value is accepted.
    The text must consist of four non-empty segments separated by three
    dashes; any other shape (fewer or more segments, an empty segment,
    ``None``) yields ``None``.
    """
    candidate = re.match(r'^[^-]+-[^-]+-([^-]+)-[^-]+$', str(entity))
    if candidate is None:
        return None
    return candidate[1]

def code_in_dict(code):
    """Return ``code`` unchanged when it is a key of ``dict_13040``, else ``None``."""
    if code not in dict_13040:
        return None
    return code

# Extracted_Code: raw third segment of the entity number (None when the
# number does not have the four-segment shape).
df['Extracted_Code'] = df['Entity_number'].map(extract_code)
# Matched_Code: the extracted code, kept only when it is a known 13040 code.
df['Matched_Code'] = df['Extracted_Code'].map(code_in_dict)
df.head(10)

Unnamed: 0,Entity_number,Description,Parent,Parent_Description,Category,Class,Suspended,Extracted_Code,Matched_Code
0,\tFCL_ELS_CBL_001L,"Electrical Power Line,4.16KV,From BUS-00B1-A t...",FCL_ELS_4.16KV_LINES,"Electrical Power Line,4.16KV",Air Handling Unit,Electrical Power Line,,,
1,\tFCL_ELS_CBL_002D,"Electrical Power Line,4.16KV,From BUS-00B2-A t...",FCL_ELS_4.16KV_LINES,"Electrical Power Line,4.16KV",Air Handling Unit,Electrical Power Line,,,
2,\tFCL_ELS_CBL_002F,"Electrical Power Line,4.16KV,From BUS-00B2 to ...",FCL_ELS_4.16KV_LINES,"Electrical Power Line,4.16KV",Air Handling Unit,Electrical Power Line,,,
3,\tFCL_ELS_CBL_002H,"Electrical Power Line,4.16KV,From BUS-00B1 to ...",FCL_ELS_4.16KV_LINES,"Electrical Power Line,4.16KV",Air Handling Unit,Electrical Power Line,,,
4,\tFCL_ELS_CBL_002L,"Electrical Power Line,4.16KV,From BUS-00B2-A t...",FCL_ELS_4.16KV_LINES,"Electrical Power Line,4.16KV",Air Handling Unit,Electrical Power Line,,,
5,\tFCL_ELS_CBL_00B2-A,"Electrical Power Line,4.16KV,From BUS-00B2 to ...",FCL_ELS_4.16KV_LINES,"Electrical Power Line,4.16KV",Air Handling Unit,Electrical Power Line,,,
6,\tFCL_ELS_CBL_0100,"Electrical Power Line,From BUS-00L1 to Transfo...",FCL_ELS_27.6KV_LINES,"Electrical Power Line,27.6KV",Air Handling Unit,Electrical Power Line,,,
7,\tFCL_ELS_CBL_0200,"Electrical Power Line,From BUS-00L2 to Transfo...",FCL_ELS_27.6KV_LINES,"Electrical Power Line,27.6KV",Air Handling Unit,Electrical Power Line,,,
8,\tFCL_ELS_CBL_02CF,"Electrical Power Line,4.16KV,From BUS-00B2-A t...",FCL_ELS_4.16KV_LINES,"Electrical Power Line,4.16KV",Air Handling Unit,Electrical Power Line,,,
9,17513,"Piping, New Wet Well",TAB-DEW-WEL,Centrate Wet Wells,Piping,Piping,,,


## SPARQL connection
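The next cell loads the TWONTO ontology (a Turtle file) and queries it for every labelled subject together with its Avantis category, class and tag-code mapping annotations.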

In [31]:
from rdflib import Graph
import pandas as pd

# Parse the TWONTO ontology (Turtle serialisation) into an in-memory RDFLib graph
g = Graph()
g.parse(source=r"../TWONTO/OWL/TWONTO.ttl", format="turtle")

# Prepare a custom SPARQL query.
# It selects every subject that has an rdfs:label, together with six optional
# tw: mapping annotations (equivalent / superclass for Avantis category,
# Avantis class and tag code). COALESCE turns an absent annotation into "".
# The deprecated-term filter and the "subclass of asset" restriction are
# commented out inside the query text, so they are not applied.
# NOTE(review): each SELECT expression re-uses, after AS, a variable that the
# WHERE clause already binds. rdflib accepted this when the notebook was run,
# but SPARQL 1.1 appears to disallow it -- confirm before running the query
# on another engine.
query = """

PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX tw: <http://www.toronto.ca/TWONTO#>

SELECT DISTINCT ?label ?subject
       (COALESCE(?isEquivalentToCategory, "") AS ?isEquivalentToCategory)
       (COALESCE(?isEquivalentToClass, "") AS ?isEquivalentToClass)
       (COALESCE(?isSuperclassToCategory, "") AS ?isSuperclassToCategory)
       (COALESCE(?isSuperclassToClass, "") AS ?isSuperclassToClass)
       (COALESCE(?is_equivalent_to_tag_code, "") AS ?is_equivalent_to_tag_code)
       (COALESCE(?is_superclass_to_tag_code, "") AS ?is_superclass_to_tag_code)
WHERE {
  ?subject rdfs:label ?label .

#  FILTER NOT EXISTS { ?subject owl:deprecated true }

#  ?assetClass rdfs:label "asset" .
#  ?subject rdfs:subClassOf+ ?assetClass .

  OPTIONAL { ?subject tw:is_equivalent_to_Avantis_category ?isEquivalentToCategory . }
  OPTIONAL { ?subject tw:is_equivalent_to_Avantis_class ?isEquivalentToClass . }
  OPTIONAL { ?subject tw:is_superclass_to_Avantis_category ?isSuperclassToCategory . }
  OPTIONAL { ?subject tw:is_superclass_to_Avantis_class ?isSuperclassToClass . }
  OPTIONAL { ?subject tw:is_equivalent_to_tag_code ?is_equivalent_to_tag_code . }
  OPTIONAL { ?subject tw:is_superclass_to_tag_code ?is_superclass_to_tag_code . }
}
"""

# Execute the query against the loaded graph; `results` is iterated below,
# one row per solution.
results = g.query(query)

def _term_to_text(term):
    """Return ``str(term)`` for a bound, non-empty RDF term, otherwise ``None``.

    The comparison is made on the text rather than on the truth value of the
    term, so a bound literal whose value happens to be falsy (for example a
    numeric 0 or a boolean false) is kept instead of being turned into None.
    """
    if term is None:
        return None
    text = str(term)
    return text if text else None


# Column order matches the variables projected by the SPARQL query.
sparql_columns = [
    "label",
    "subject",
    "isEquivalentToCategory",
    "isEquivalentToClass",
    "isSuperclassToCategory",
    "isSuperclassToClass",
    "is_equivalent_to_tag_code",
    "is_superclass_to_tag_code",
]

# Convert the results to plain strings. The query substitutes "" for a missing
# annotation; here "" (and unbound values) become None so that the lookup
# dictionaries built afterwards can skip them.
data = [
    [_term_to_text(getattr(row, column)) for column in sparql_columns]
    for row in results
]

df_sparql = pd.DataFrame(data, columns=sparql_columns)

df_sparql.head(10)

Unnamed: 0,label,subject,isEquivalentToCategory,isEquivalentToClass,isSuperclassToCategory,isSuperclassToClass,is_equivalent_to_tag_code,is_superclass_to_tag_code
0,new_13040_code_proposed,http://www.toronto.ca/TWONTO#new_13040_code_pr...,,,,,,
1,contains,http://www.toronto.ca/TWONTO#00865,,,,,,
2,created,http://www.toronto.ca/TWONTO#00866,,,,,,
3,fully grounds,http://www.toronto.ca/TWONTO#00867,,,,,,
4,grounds,http://www.toronto.ca/TWONTO#00868,,,,,,
5,grounds,http://www.toronto.ca/TWONTO#00407,,,,,,
6,has boiler heating surface area,http://www.toronto.ca/TWONTO#00869,,,,,,
7,has content part,http://www.toronto.ca/TWONTO#00870,,,,,,
8,has earlier version,http://www.toronto.ca/TWONTO#00871,,,,,,
9,has later version,http://www.toronto.ca/TWONTO#00872,,,,,,


Next, build lookup dictionaries from the query results so that each Avantis class, category, tag code or IRI can be translated to its new class label.

In [32]:
def _labels_keyed_by(column):
    """Map each non-None value of ``df_sparql[column]`` to the label on its row.

    When a value occurs on several rows, the label of the last row wins.
    """
    return {
        key_value: label
        for key_value, label in zip(df_sparql[column], df_sparql['label'])
        if key_value is not None
    }


dict_class = _labels_keyed_by('isEquivalentToClass')
dict_superclass = _labels_keyed_by('isSuperclassToClass')
dict_category = _labels_keyed_by('isEquivalentToCategory')
dict_supercategory = _labels_keyed_by('isSuperclassToCategory')
dict_tag = _labels_keyed_by('is_equivalent_to_tag_code')
dict_supertag = _labels_keyed_by('is_superclass_to_tag_code')
dict_iri = _labels_keyed_by('subject')
# Extra key so that the value 'x' resolves to the label 'unknown'.
# NOTE(review): presumably 'x' is the placeholder used in the manual match
# file when no class applies -- confirm.
dict_iri['x'] = 'unknown'

# Preview the first ten class mappings
for key, value in list(dict_class.items())[:10]:
    print(f"Avantis Class: {key} -> New Class: {value}")

Avantis Class: Vehicle -> New Class: passenger vehicle
Avantis Class: Fan -> New Class: fan
Avantis Class: Structure -> New Class: structure
Avantis Class: Power Supply Unit -> New Class: DC power supply
Avantis Class: Programmable Logic Control -> New Class: PLC
Avantis Class: Remote Processor Unit -> New Class: RPU panel
Avantis Class: Remote Transmission Unit -> New Class: RPU panel
Avantis Class: Uninterruptible Power Supply -> New Class: UPS
Avantis Class: Ultraviolet Disinfection -> New Class: UV disinfection assembly
Avantis Class: Variable Frequency Drive -> New Class: VFD


## Manual Matching
Here we load the manually matched file, in which entities have been assigned a class IRI by hand.

In [33]:
# Load the hand-classified entities
df_manual = pd.read_excel('manualMatch.xlsx', sheet_name='LLM capability Test Dataset')

# Entity numbers are the join key between the Avantis extract and the manual
# file, so both sides get the same normalisation: text form, no surrounding
# whitespace. Previously only `df` was normalised, so a manual key with stray
# whitespace or a non-string type could never match.
df['Entity_number'] = df['Entity_number'].astype(str).str.strip()
manual_ids = df_manual['Entity_number']
# Blank cells stay missing instead of becoming the text 'nan'.
df_manual['Entity_number'] = manual_ids.where(manual_ids.isna(), manual_ids.astype(str).str.strip())
df_manual.head(10)

Unnamed: 0,ID,Entity_number,Description,Parent,Category,Class,Suspended,Tag,Valid_Class,TH Suggestion,Parent_Description
0,104978.0,NX5232A,"Chiller, Silo Bldg Control Room",TAB-ACC-CI,"H.V.A.C.,Chiller",HVAC,,,http://www.toronto.ca/TWONTO#00213,,Chiller and Air Coolers
1,119738.0,TAB-WA1-SQ-1984,"Lanyard 6 Ft- Velasco, Gabriel\t\t\t\t\t\t\t",TAB-WA1-SQ-0001,"PPE,Harness",Safety Equipment,,SQ,http://www.toronto.ca/TWONTO#00333,,WA1 Fall Arrest System
2,59232.0,THC-ACC-HTR-6025,"Heater, Unit, Electric, Heating System, Lower ...",THC-ELS-LP-4042A,"H.V.A.C.,Heater,Unit",HVAC,,HTR,http://www.toronto.ca/TWONTO#00418,,"Lighting Panel, Vortex Gallery, Headworks"
3,119966.0,TAB-WA4-SQ-3430,Fall Limiter - MFLT2/705F,TAB-WA4-SQ-0010,"PPE,Lanyard",Safety Equipment,,SQ,http://www.toronto.ca/TWONTO#00341,,WA4 Fall arrest systems
4,59390.0,THC-ACC-PDIT-6291,"Transmitter, Pressure Differential, Filter F-6...",THC-ACC-F-6291,"Transmitter,Pressure",HVAC,,PDIT,http://www.toronto.ca/TWONTO#00605,,"Filter, Outdoor Air Plenum, 2nd Floor, Mech Ro..."
5,76870.0,THR-PLT-FSL-2222,"Switch, Flow, Low, Water Scour, Aerated Grit T...",THR-PLT-T-0300,"Switch,Flow-Low",Switch,,FSL,http://www.toronto.ca/TWONTO#00367,,"Tank, Grit Removal, Grit Tanks, North Grit Bui..."
6,119721.0,TAB-WA1-SQ-1915,Body Harness Xlarge- Hitish Mistry,TAB-WA1-SQ-0001,"PPE,Harness",Safety Equipment,,SQ,http://www.toronto.ca/TWONTO#00332,,WA1 Fall Arrest System
7,76217.0,THR-MCS-LD-9868,"Come Along, Work Area 2",THR-LD-WA2,"Lifting Device,Chain",Lifting Device,,LD,http://www.toronto.ca/TWONTO#00812,,"Lifting Devices, Chain Hoist, Come Along & Slings"
8,52817.0,TAB-RSP-FSL-1221D,"Flow Switch, Low, Cooling Water Supply to Heat...",TAB-RSP-P-1220,"Switch,Flow",Switch,,FSL,http://www.toronto.ca/TWONTO#00367,,"Pump,Raw Sewage Transfer #2"
9,79780.0,THR-WA1-LD-0051,"Sling, Nylon, Work Area 1 (NYLON BELT SLING SI...",THR-LD-NYLN-WA1,"Lifting Device,Sling",Lifting Device,,LD,http://www.toronto.ca/TWONTO#00929,,"Slings, Nylon, Work Area 1"


In [34]:
# Entity number -> new class label, resolving each manually chosen IRI
# through dict_iri. Blank Excel cells are read as NaN rather than None, so
# missing entity numbers are filtered with pd.notna (an `is not None` test
# lets them through).
dict_manual = {
    entity: label
    for entity, label in zip(df_manual['Entity_number'], df_manual['Valid_Class'].map(dict_iri))
    if pd.notna(entity)
}

# Preview the first ten manual mappings
for key, value in list(dict_manual.items())[:10]:
    print(f"Avantis Manual Entity: {key} -> New Class: {value}")

Avantis Manual Entity: NX5232A -> New Class: chiller
Avantis Manual Entity: TAB-WA1-SQ-1984 -> New Class: fall arrest lanyard
Avantis Manual Entity: THC-ACC-HTR-6025 -> New Class: space heater
Avantis Manual Entity: TAB-WA4-SQ-3430 -> New Class: fall restricting system
Avantis Manual Entity: THC-ACC-PDIT-6291 -> New Class: pressure transmitter
Avantis Manual Entity: THR-PLT-FSL-2222 -> New Class: flow switch
Avantis Manual Entity: TAB-WA1-SQ-1915 -> New Class: fall arrest harness
Avantis Manual Entity: THR-MCS-LD-9868 -> New Class: winch
Avantis Manual Entity: TAB-RSP-FSL-1221D -> New Class: flow switch
Avantis Manual Entity: THR-WA1-LD-0051 -> New Class: synthetic web sling


## Applying the Mapping
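The mapping is now applied in stages, starting with the source least likely to be accurate and ending with the source most likely to be accurate, so that each later stage overrides the earlier ones wherever it has a match.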

In [35]:
df['Valid_Class'] = None

# Stages in ascending order of trust: (report label, source column, lookup).
# Each stage overwrites Valid_Class wherever its lookup has a match and keeps
# the earlier value elsewhere. The manual mapping is applied last so that the
# tag-code stages cannot overwrite a hand-made match (it used to be applied
# before them, although it was reported last).
mapping_stages = [
    ('Supercategory', 'Category', dict_supercategory),
    ('Category', 'Category', dict_category),
    ('SuperClass', 'Class', dict_superclass),
    ('Class', 'Class', dict_class),
    ('Supertag', 'Extracted_Code', dict_supertag),
    ('Tag', 'Extracted_Code', dict_tag),
    ('Manual Mapping', 'Entity_number', dict_manual),
]

for stage_no, (stage_label, source_col, lookup) in enumerate(mapping_stages):
    stage_hits = df[source_col].map(lookup)
    df['Valid_Class'] = stage_hits.fillna(df['Valid_Class'])

    message = f"After mapping {stage_label}, {df['Valid_Class'].notna().sum()/df.shape[0]:.1%} mapped"
    if stage_no > 0:
        # The "changed" figure is the share of rows for which this stage's
        # lookup produced a value, whether or not it differs from the
        # previous one.
        message += f", {stage_hits.notna().sum()/df.shape[0]:.1%} changed"
    print(message)

df.head(10)

After mapping Supercategory, 32.5% mapped
After mapping Category, 64.0% mapped, 31.6% changed
After mapping SuperClass, 66.0% mapped, 3.1% changed
After mapping Class, 90.8% mapped, 79.1% changed
After mapping Supertag, 99.9% mapped, 13.0% changed
After mapping Tag, 100.0% mapped, 64.9% changed
After mapping Manual Mapping, 100.0% mapped, 26.9% changed


Unnamed: 0,Entity_number,Description,Parent,Parent_Description,Category,Class,Suspended,Extracted_Code,Matched_Code,Valid_Class
0,FCL_ELS_CBL_001L,"Electrical Power Line,4.16KV,From BUS-00B1-A t...",FCL_ELS_4.16KV_LINES,"Electrical Power Line,4.16KV",Air Handling Unit,Electrical Power Line,,,,cable segment
1,FCL_ELS_CBL_002D,"Electrical Power Line,4.16KV,From BUS-00B2-A t...",FCL_ELS_4.16KV_LINES,"Electrical Power Line,4.16KV",Air Handling Unit,Electrical Power Line,,,,cable segment
2,FCL_ELS_CBL_002F,"Electrical Power Line,4.16KV,From BUS-00B2 to ...",FCL_ELS_4.16KV_LINES,"Electrical Power Line,4.16KV",Air Handling Unit,Electrical Power Line,,,,cable segment
3,FCL_ELS_CBL_002H,"Electrical Power Line,4.16KV,From BUS-00B1 to ...",FCL_ELS_4.16KV_LINES,"Electrical Power Line,4.16KV",Air Handling Unit,Electrical Power Line,,,,cable segment
4,FCL_ELS_CBL_002L,"Electrical Power Line,4.16KV,From BUS-00B2-A t...",FCL_ELS_4.16KV_LINES,"Electrical Power Line,4.16KV",Air Handling Unit,Electrical Power Line,,,,cable segment
5,FCL_ELS_CBL_00B2-A,"Electrical Power Line,4.16KV,From BUS-00B2 to ...",FCL_ELS_4.16KV_LINES,"Electrical Power Line,4.16KV",Air Handling Unit,Electrical Power Line,,,,cable segment
6,FCL_ELS_CBL_0100,"Electrical Power Line,From BUS-00L1 to Transfo...",FCL_ELS_27.6KV_LINES,"Electrical Power Line,27.6KV",Air Handling Unit,Electrical Power Line,,,,cable segment
7,FCL_ELS_CBL_0200,"Electrical Power Line,From BUS-00L2 to Transfo...",FCL_ELS_27.6KV_LINES,"Electrical Power Line,27.6KV",Air Handling Unit,Electrical Power Line,,,,cable segment
8,FCL_ELS_CBL_02CF,"Electrical Power Line,4.16KV,From BUS-00B2-A t...",FCL_ELS_4.16KV_LINES,"Electrical Power Line,4.16KV",Air Handling Unit,Electrical Power Line,,,,cable segment
9,17513,"Piping, New Wet Well",TAB-DEW-WEL,Centrate Wet Wells,Piping,Piping,,,,pipe segment


In [36]:
# Write the entities that received no class to a CSV file for manual follow-up
df.loc[df['Valid_Class'].isna()].to_csv(path_or_buf='Unmapped_Avantis_Entities.csv', index=False)

In [37]:
# Show the entities that are still unmapped
df.loc[df['Valid_Class'].isna()]

Unnamed: 0,Entity_number,Description,Parent,Parent_Description,Category,Class,Suspended,Extracted_Code,Matched_Code,Valid_Class
10,17800,TAB Power Distribution Projects,52349,TAB Power Distribution,,,,,,
20,21381,TAB Process Control Projects,52349,TAB Power Distribution,,,,,,
22,32752,TAB Process Computers,52349,TAB Power Distribution,,,,,,
53,8831,"All Buildings, Wet Wells WA8",TAB-WA8,Common to WA8 Bldg. Services,Tank,Tank,,,,
68,36233,"TAB Power Failure August 14, 2003",52349,TAB Power Distribution,,,,,,
