In [1]:
import pandas as pd
import sbol2
import os
from rdflib import Graph
from torch_geometric.data import HeteroData
import pandas as pd
import numpy as np
import torch
from torch import Tensor
import subprocess
import tempfile
from rdflib.query import ResultRow



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# All project folders are resolved from the directory the notebook runs in.
current_dir = os.path.abspath('')
_parent_dir = os.path.join(current_dir, '..')

# Folders that live one level above the notebook directory.
data_path = os.path.join(_parent_dir, 'data')
attachments_path = os.path.join(_parent_dir, 'attachments')
pulled_attachments_path = os.path.join(_parent_dir, 'pulled_attachments')
sbol_path = os.path.join(_parent_dir, 'sbol_data')
downloaded_sbol_path = os.path.join(_parent_dir, 'downloaded_sbol')
nt_path = os.path.join(_parent_dir, 'nt_data')

# Sub-folders of the data directory.
original_data_path = os.path.join(data_path, 'original_data')
model_data_path = os.path.join(data_path, 'processed_data', 'replicated_models')

# Helper scripts sit next to the notebook; model outputs use a path relative
# to the working directory rather than to current_dir.
scripts_path = os.path.join(current_dir, 'scripts')
model_output_path = os.path.join('..', 'model_outputs')

In [3]:
def xml_to_nt(xml_path, file_name):
    """Convert one RDF/XML file to N-Triples and return its first numerical value.

    The file ``xml_path/file_name`` is parsed with rdflib, written back out in
    N-Triples format into the module-level ``nt_path`` directory (created if it
    does not exist), and then queried for ``om:hasNumericalValue``.

    Args:
        xml_path: Directory that contains the RDF/XML file.
        file_name: Name of the file inside ``xml_path``.

    Returns:
        The first ``om:hasNumericalValue`` returned by the query as a ``float``,
        or ``None`` when the graph contains no such value (a message is printed
        in that case). Only the first result row is used.
    """
    g = Graph()
    g.parse(os.path.join(xml_path, file_name), format="xml")

    # Swap only the file extension; str.replace would also rewrite any ".xml"
    # that happens to appear in the middle of the name.
    nt_file_name = os.path.splitext(file_name)[0] + ".nt"
    os.makedirs(nt_path, exist_ok=True)
    g.serialize(destination=os.path.join(nt_path, nt_file_name), format="nt")

    sparql_query ='''PREFIX om: <http://www.ontology-of-units-of-measure.org/resource/om-2/>

    SELECT ?numericalValue
    WHERE {
    ?s om:hasNumericalValue ?numericalValue .
    }
    '''
    query_result = g.query(sparql_query)

    for row in query_result:
        if isinstance(row, ResultRow):
            # The first matching row wins; any further values are ignored.
            return float(row.numericalValue)
        # Not a SELECT-style row: show it so the caller can investigate.
        print(row)

    print("No numerical values found.")
    return None
    

# Convert every file in the SBOL folder and keep the value returned for each,
# in the order the directory listing yields them.
y_measures = [
    xml_to_nt(sbol_path, sbol_file_name)
    for sbol_file_name in os.listdir(sbol_path)
]

print(y_measures)



[1.47612675105494, 0.829006829560532, 0.83686023535819, 4.98010165116471, 1.29549588563797, 0.606791920287013, 4.03745286813019, 0.839345051489129, 0.703489414322014, 1.33853534246064, 0.952353993376417, 0.721923972925238, 1.71154051666382, 0.978280359558387, 0.650949515285976, 1.08353101257602, 0.791538491672311, 0.86321231687193, 0.899845153281078, 0.921165774701605, 0.760024369424846, 0.832231632768715, 0.877346628211269, 0.577606497166208, 2.25573904406097, 0.843423457053308, 0.987119590825091, 0.652969325871094, 0.630607951874758, 1.26239993788532, 0.941177138344474, 0.649123761624653, 0.741453154604123, 0.83305465019915, 0.630535417964329, 0.641943138319258, 0.848224896719565, 1.02364330973576, 0.689367982136644, 0.661170401192899, 0.852934413854728, 0.990890318135298, 0.696888912490577, 0.626320235391157, 0.762131601769412, 0.831328281828409, 0.622580118971386, 0.842409455129878, 0.872914302192873, 0.797764510000215, 1.55465096458842, 0.862220731915124, 0.696611419723164, 0.7080

In [4]:
# SBOL classes that are loaded as node types of the heterogeneous graph.
node_classes = [
    "ComponentDefinition", "Sequence", "ModuleDefinition", "Module",
    "FunctionalComponent", "Component", "SequenceAnnotation", "Range",
]

# (source class, target class) of every edge type; the formatted name is
# "<source>_<target>".
_edge_endpoints = [
    ("ComponentDefinition", "Sequence"),
    ("ComponentDefinition", "SequenceAnnotation"),
    ("ComponentDefinition", "Range"),
    ("ModuleDefinition", "ComponentDefinition"),
    ("ModuleDefinition", "ModuleDefinition"),
    ("ComponentDefinition", "ComponentDefinition"),
]
all_edges_formatted = [f"{source}_{target}" for source, target in _edge_endpoints]



In [5]:
def return_heterograph_for_one_nt(nt_file_name, node_names, edge_names, verbose=True):
    """Build a ``HeteroData`` graph from one N-Triples file via ``autordf2gml.py``.

    A config file is written to a temporary directory, ``autordf2gml.py`` is run
    on it, and the CSV files found in the temporary output folders are loaded:
    ``pivoted_df_<node>.csv`` (node features and subject-to-index mappings) and
    ``edge_list_<edge>.csv`` (edges as pairs of subjects).

    Args:
        nt_file_name: Name of the ``.nt`` file inside the module-level ``nt_path``.
        node_names: Node type names; one ``pivoted_df_<name>.csv`` is read for each.
        edge_names: Edge names of the form ``"<source>_<target>"``; one
            ``edge_list_<name>.csv`` is read for each.
        verbose: When true (the default), print the generated file names, each
            edge name and its destination index tensor.

    Returns:
        A ``HeteroData`` object with ``x`` and ``node_id`` for every node type and
        an ``edge_index`` for every ``(source, "has_<target>", target)`` edge type.

    Raises:
        RuntimeError: If the ``autordf2gml.py`` subprocess exits with a non-zero
            return code.
    """
    import sys  # local import: only needed to locate the running interpreter

    with tempfile.TemporaryDirectory() as temp_dir:
        save_path_numeric = os.path.join(temp_dir, "save_path_numeric")
        path = os.path.join(temp_dir, "path")
        os.makedirs(save_path_numeric, exist_ok=True)
        os.makedirs(path, exist_ok=True)

        config_string = f'''
        [InputPath]
        input_path = {os.path.join(nt_path, nt_file_name)}

        [SavePath]
        save_path_numeric_graph = {save_path_numeric}
        save_path_mapping = {path}

        [NLD]
        nld_class = ModuleDefinition

        [EMBEDDING]
        embedding_model = allenai/scibert_scivocab_uncased

        [Nodes]
        classes = ComponentDefinition, Sequence, ModuleDefinition, Module, FunctionalComponent, Component, SequenceAnnotation, Range

        ComponentDefinition = http://sbols.org/v2#ComponentDefinition
        Sequence = http://sbols.org/v2#Sequence
        ModuleDefinition = http://sbols.org/v2#ModuleDefinition
        Module = http://sbols.org/v2#Module
        FunctionalComponent = http://sbols.org/v2#FunctionalComponent
        Component = http://sbols.org/v2#Component
        SequenceAnnotation = http://sbols.org/v2#SequenceAnnotation
        Range = http://sbols.org/v2#Range

        [SimpleEdges]
        edge_names = ComponentDefinition_Sequence, ComponentDefinition_SequenceAnnotation
        ComponentDefinition_Sequence_start_node = ComponentDefinition
        ComponentDefinition_Sequence_properties = http://sbols.org/v2#sequence
        ComponentDefinition_Sequence_end_node = Sequence
        ComponentDefinition_SequenceAnnotation_start_node = ComponentDefinition
        ComponentDefinition_SequenceAnnotation_properties = http://sbols.org/v2#sequenceAnnotation
        ComponentDefinition_SequenceAnnotation_end_node = SequenceAnnotation

        [N-HopEdges]
        edge_names = ComponentDefinition_Range, ModuleDefinition_ComponentDefinition, ModuleDefinition_ModuleDefinition, ComponentDefinition_ComponentDefinition
        ComponentDefinition_Range_start_node = ComponentDefinition
        ComponentDefinition_Range_hop1_properties = http://sbols.org/v2#sequenceAnnotation
        ComponentDefinition_Range_hop2_properties = http://sbols.org/v2#location
        ComponentDefinition_Range_end_node = Range
        ModuleDefinition_ComponentDefinition_start_node = ModuleDefinition
        ModuleDefinition_ComponentDefinition_hop1_properties = http://sbols.org/v2#functionalComponent
        ModuleDefinition_ComponentDefinition_hop2_properties = http://sbols.org/v2#definition
        ModuleDefinition_ComponentDefinition_end_node = ComponentDefinition
        ModuleDefinition_ModuleDefinition_start_node = ModuleDefinition
        ModuleDefinition_ModuleDefinition_hop1_properties = http://sbols.org/v2#module
        ModuleDefinition_ModuleDefinition_hop2_properties = http://sbols.org/v2#definition
        ModuleDefinition_ModuleDefinition_end_node = ModuleDefinition
        ComponentDefinition_ComponentDefinition_start_node = ComponentDefinition
        ComponentDefinition_ComponentDefinition_hop1_properties = http://sbols.org/v2#component
        ComponentDefinition_ComponentDefinition_hop2_properties = http://sbols.org/v2#definition
        ComponentDefinition_ComponentDefinition_end_node = ComponentDefinition

        [N-ArayEdges]
        edge_names = ComponentDefinition_Range
        ComponentDefinition_Range_start_node = ComponentDefinition
        ComponentDefinition_Range_properties = http://sbols.org/v2#sequenceAnnotation, http://sbols.org/v2#location
        ComponentDefinition_Range_end_node = Range

        [N-ArayFeaturePath]
        ComponentDefinition_Range_feature_path = http://sbols.org/v2#sequenceAnnotation, http://sbols.org/v2#location

        [N-ArayFeatureValue]
        ComponentDefinition_Range_feature_value = http://sbols.org/v2#start, http://sbols.org/v2#end
        '''

        config_path = os.path.join(temp_dir, 'config.ini')
        with open(config_path, 'w') as file:
            file.write(config_string)

        # Pass the argument list without shell=True: on POSIX, shell=True with a
        # list runs only the first item and hands the rest to the shell itself.
        # sys.executable keeps the script in the same environment as the kernel.
        # NOTE(review): "autordf2gml.py" is resolved against the current working
        # directory — confirm the notebook is always started from that folder.
        result = subprocess.run(
            [sys.executable, "autordf2gml.py", "--config_path", config_path],
            capture_output=True,
            text=True,
        )
        if result.returncode != 0:
            # Fail here with the script's own output instead of a confusing
            # "file not found" when the CSVs are read below.
            raise RuntimeError(
                f"autordf2gml.py failed for {nt_file_name} "
                f"(exit code {result.returncode}):\n{result.stderr}"
            )

        data = HeteroData()
        # Maps a subject (value of the 'subject' column) to its 'mapping' value,
        # merged over all node types.
        local_indices_map = {}

        if verbose:
            for filename in os.listdir(save_path_numeric):
                print(filename)

        for node_name in node_names:
            node_features_df = pd.read_csv(os.path.join(save_path_numeric, f'pivoted_df_{node_name}.csv'), header=None).astype(float)
            node_tensor = torch.tensor(node_features_df.values, dtype=torch.float)
            id_mapping_df = pd.read_csv(os.path.join(path, f'pivoted_df_{node_name}.csv'))
            subject_mapping_dict = id_mapping_df.set_index('subject')['mapping'].to_dict()
            data[node_name].node_id = torch.arange(len(id_mapping_df))
            data[node_name].x = node_tensor
            # Entries already in local_indices_map take precedence on duplicates.
            local_indices_map = subject_mapping_dict | local_indices_map

        for edge in edge_names:
            edge_df = pd.read_csv(os.path.join(save_path_numeric, f"edge_list_{edge}.csv"), header=None)
            if verbose:
                print(edge)

            # Columns 0 and 1 hold the subjects of the edge endpoints; translate
            # them to indices through the mapping collected above.
            src = torch.tensor([local_indices_map[subject] for subject in edge_df[0].values], dtype=torch.long)
            dst = torch.tensor([local_indices_map[subject] for subject in edge_df[1].values], dtype=torch.long)
            if verbose:
                print(dst)

            name_parts = edge.split("_")
            src_type, dst_type = name_parts[0], name_parts[1]
            data[src_type, f'has_{dst_type}', dst_type].edge_index = torch.stack([src, dst], dim=0)

        return data

In [6]:
# One HeteroData graph per file in nt_path, in directory-listing order.
all_data = [
    return_heterograph_for_one_nt(nt_file_name, node_classes, all_edges_formatted)
    for nt_file_name in os.listdir(nt_path)
]

edge_list_ComponentDefinition_ComponentDefinition.csv
edge_list_ComponentDefinition_Range.csv
edge_list_ComponentDefinition_Sequence.csv
edge_list_ComponentDefinition_SequenceAnnotation.csv
edge_list_ModuleDefinition_ComponentDefinition.csv
edge_list_ModuleDefinition_ModuleDefinition.csv
pivoted_df_Component.csv
pivoted_df_ComponentDefinition.csv
pivoted_df_FunctionalComponent.csv
pivoted_df_Module.csv
pivoted_df_ModuleDefinition.csv
pivoted_df_Range.csv
pivoted_df_Sequence.csv
pivoted_df_SequenceAnnotation.csv
ComponentDefinition_Sequence
tensor([0])
ComponentDefinition_SequenceAnnotation
tensor([0])
ComponentDefinition_Range
tensor([0])
ModuleDefinition_ComponentDefinition
tensor([1, 0])
ModuleDefinition_ModuleDefinition
tensor([1, 3, 0])
ComponentDefinition_ComponentDefinition
tensor([2])
edge_list_ComponentDefinition_ComponentDefinition.csv
edge_list_ComponentDefinition_Range.csv
edge_list_ComponentDefinition_Sequence.csv
edge_list_ComponentDefinition_SequenceAnnotation.csv
edge_li

In [None]:
# Display the list of HeteroData graphs to inspect node and edge shapes.
all_data

[HeteroData(
   [1mComponentDefinition[0m={
     node_id=[3],
     x=[3, 128]
   },
   [1mSequence[0m={
     node_id=[1],
     x=[1, 128]
   },
   [1mModuleDefinition[0m={
     node_id=[4],
     x=[4, 128]
   },
   [1mModule[0m={
     node_id=[3],
     x=[3, 128]
   },
   [1mFunctionalComponent[0m={
     node_id=[2],
     x=[2, 128]
   },
   [1mComponent[0m={
     node_id=[1],
     x=[1, 128]
   },
   [1mSequenceAnnotation[0m={
     node_id=[1],
     x=[1, 128]
   },
   [1mRange[0m={
     node_id=[1],
     x=[1, 128]
   },
   [1m(ComponentDefinition, has_Sequence, Sequence)[0m={ edge_index=[2, 1] },
   [1m(ComponentDefinition, has_SequenceAnnotation, SequenceAnnotation)[0m={ edge_index=[2, 1] },
   [1m(ComponentDefinition, has_Range, Range)[0m={ edge_index=[2, 1] },
   [1m(ModuleDefinition, has_ComponentDefinition, ComponentDefinition)[0m={ edge_index=[2, 2] },
   [1m(ModuleDefinition, has_ModuleDefinition, ModuleDefinition)[0m={ edge_index=[2, 3] },
   [1m(C

In [77]:
from sklearn.preprocessing import StandardScaler
# Targets are attached by position, so both lists must line up one-to-one and
# every target must be a real number before anything is fitted or stored.
if len(all_data) != len(y_measures):
    raise ValueError(
        f"Got {len(all_data)} graphs but {len(y_measures)} target values; "
        "they must match one-to-one."
    )
missing_targets = [i for i, y_value in enumerate(y_measures) if y_value is None]
if missing_targets:
    raise ValueError(f"No target value for graphs at positions {missing_targets}")

# Fit a standardizer on the targets as a single column.
all_y_values_np = np.array(y_measures).reshape(-1, 1)
scaler_y = StandardScaler()
scaler_y.fit(all_y_values_np)

# NOTE(review): scaler_y is fitted but the raw (unscaled) values are what get
# stored on the graphs below — confirm whether scaled targets were intended.
for data, y_value in zip(all_data, y_measures):
    data.y = torch.tensor([y_value], dtype=torch.float)

In [68]:
# Display the graphs again; each one is expected to carry a `y` attribute now.
all_data

[HeteroData(
   y=[1],
   [1mComponentDefinition[0m={
     node_id=[3],
     x=[3, 128]
   },
   [1mSequence[0m={
     node_id=[1],
     x=[1, 128]
   },
   [1mModuleDefinition[0m={
     node_id=[4],
     x=[4, 128]
   },
   [1mModule[0m={
     node_id=[3],
     x=[3, 128]
   },
   [1mFunctionalComponent[0m={
     node_id=[2],
     x=[2, 128]
   },
   [1mComponent[0m={
     node_id=[1],
     x=[1, 128]
   },
   [1mSequenceAnnotation[0m={
     node_id=[1],
     x=[1, 128]
   },
   [1mRange[0m={
     node_id=[1],
     x=[1, 128]
   },
   [1m(ComponentDefinition, has_Sequence, Sequence)[0m={ edge_index=[2, 1] },
   [1m(ComponentDefinition, has_SequenceAnnotation, SequenceAnnotation)[0m={ edge_index=[2, 1] },
   [1m(ComponentDefinition, has_Range, Range)[0m={ edge_index=[2, 1] },
   [1m(ModuleDefinition, has_ComponentDefinition, ComponentDefinition)[0m={ edge_index=[2, 2] },
   [1m(ModuleDefinition, has_ModuleDefinition, ModuleDefinition)[0m={ edge_index=[2, 3] },

In [78]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import HeteroConv, Linear, SAGEConv, GCNConv, GATConv, global_mean_pool

class HeteroGNN_GraphLevel(torch.nn.Module):
    """Heterogeneous GNN that predicts one scalar per graph.

    ``num_layers`` ``HeteroConv`` layers are applied, each followed by ReLU.
    The resulting node embeddings are mean-pooled per graph for every node type
    in the output, concatenated, and passed through a two-layer MLP.

    Args:
        metadata: ``(node_types, edge_types)`` of the graphs; stored on the
            instance. The edge types used by the layers are fixed in
            ``_EDGE_CONV_KINDS``.
        hidden_channels: Output width of every convolution and of the hidden
            MLP layer.
        num_layers: Number of ``HeteroConv`` layers.
    """

    # Edge types handled by every layer, with the operator used for each one.
    # The order is kept stable so that modules are always created in the same
    # sequence.
    _EDGE_CONV_KINDS = (
        (('ComponentDefinition', 'has_Sequence', 'Sequence'), 'sage'),
        (('ComponentDefinition', 'has_SequenceAnnotation', 'SequenceAnnotation'), 'sage'),
        (('ModuleDefinition', 'has_ComponentDefinition', 'ComponentDefinition'), 'gat'),
        (('ModuleDefinition', 'has_ModuleDefinition', 'ModuleDefinition'), 'gcn'),
        (('ComponentDefinition', 'has_ComponentDefinition', 'ComponentDefinition'), 'gcn'),
        (('ComponentDefinition', 'has_Range', 'Range'), 'gat'),
    )

    def __init__(self, metadata, hidden_channels, num_layers):
        super().__init__()
        self.metadata = metadata  # (node_types, edge_types)

        self.convs = torch.nn.ModuleList()
        for _ in range(num_layers):
            conv = HeteroConv({
                edge_type: self._make_conv(kind, hidden_channels)
                for edge_type, kind in self._EDGE_CONV_KINDS
            }, aggr='sum')
            self.convs.append(conv)

        # Final regression MLP.
        # HeteroConv only produces embeddings for node types that are the
        # destination of some edge type, so the concatenated graph embedding
        # has one hidden_channels-wide slice per distinct destination type
        # (5 types here, i.e. 320 for hidden_channels=64).
        dst_node_types = {edge_type[2] for edge_type, _ in self._EDGE_CONV_KINDS}
        total_hidden = hidden_channels * len(dst_node_types)
        self.lin = torch.nn.Sequential(
            Linear(total_hidden, hidden_channels),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.5),
            Linear(hidden_channels, 1)
        )

    @staticmethod
    def _make_conv(kind, hidden_channels):
        """Create the message-passing operator registered under ``kind``."""
        if kind == 'sage':
            return SAGEConv((-1, -1), hidden_channels)
        if kind == 'gat':
            return GATConv((-1, -1), hidden_channels, add_self_loops=False)
        if kind == 'gcn':
            return GCNConv(-1, hidden_channels)
        raise ValueError(f"Unknown conv kind: {kind!r}")

    def forward(self, x_dict, edge_index_dict, batch_dict):
        """Return one prediction per graph in the batch.

        Args:
            x_dict: Node features per node type.
            edge_index_dict: Edge indices per edge type.
            batch_dict: Per node type, the graph index of every node.

        Returns:
            A 1-D tensor with one value per graph.
        """
        for conv in self.convs:
            x_dict = conv(x_dict, edge_index_dict)
            x_dict = {k: F.relu(v) for k, v in x_dict.items()}

        # Number of graphs in the batch, taken from the largest graph index of
        # any node type. Passing it to the pooling keeps every pooled tensor at
        # the same number of rows even when the last graphs of the batch have
        # no node of a given type.
        num_graphs = max(
            (int(b.max()) + 1 for b in batch_dict.values() if b.numel() > 0),
            default=0,
        )

        # Pool all node types, then concatenate
        pooled = [
            global_mean_pool(x_dict[node_type], batch_dict[node_type], size=num_graphs)
            for node_type in x_dict
        ]
        graph_embeddings = torch.cat(pooled, dim=-1)  # shape: [batch_size, total_hidden]
        return self.lin(graph_embeddings).view(-1)     # shape: [batch_size]


[1.47612675105494,
 0.829006829560532,
 0.83686023535819,
 4.98010165116471,
 1.29549588563797,
 0.606791920287013,
 4.03745286813019,
 0.839345051489129,
 0.703489414322014,
 1.33853534246064,
 0.952353993376417,
 0.721923972925238,
 1.71154051666382,
 0.978280359558387,
 0.650949515285976,
 1.08353101257602,
 0.791538491672311,
 0.86321231687193,
 0.899845153281078,
 0.921165774701605,
 0.760024369424846,
 0.832231632768715,
 0.877346628211269,
 0.577606497166208,
 2.25573904406097,
 0.843423457053308,
 0.987119590825091,
 0.652969325871094,
 0.630607951874758,
 1.26239993788532,
 0.941177138344474,
 0.649123761624653,
 0.741453154604123,
 0.83305465019915,
 0.630535417964329,
 0.641943138319258,
 0.848224896719565,
 1.02364330973576,
 0.689367982136644,
 0.661170401192899,
 0.852934413854728,
 0.990890318135298,
 0.696888912490577,
 0.626320235391157,
 0.762131601769412,
 0.831328281828409,
 0.622580118971386,
 0.842409455129878,
 0.872914302192873,
 0.797764510000215,
 1.5546509645

In [79]:
from torch_geometric.loader import DataLoader

loader = DataLoader(all_data, batch_size=10, shuffle=True)

# Move the model to its device first, then hand its parameters to the optimizer.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HeteroGNN_GraphLevel(metadata=all_data[0].metadata(), hidden_channels=64, num_layers=2)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-4)
loss_fn = torch.nn.MSELoss()  # For regression

# NOTE(review): range(1, 100) runs 99 epochs — confirm whether 100 was intended.
for epoch in range(1, 100):
    model.train()
    total_loss = 0

    for batch in loader:
        batch = batch.to(device)  # no-op on CPU
        optimizer.zero_grad()

        out = model(batch.x_dict, batch.edge_index_dict, batch.batch_dict)
        target = batch.y.view(-1)
        # Use the loss object defined above so the criterion is set in one place.
        loss = loss_fn(out, target)
        loss.backward()
        optimizer.step()

        # loss is a per-batch mean; weight it by batch size so the epoch
        # figure below is a per-graph average.
        total_loss += loss.item() * batch.num_graphs

    print(f"Epoch {epoch:03d} | Train Loss: {total_loss / len(loader.dataset):.4f}")




Epoch 001 | Train Loss: 1.3233
Epoch 002 | Train Loss: 1.1156
Epoch 003 | Train Loss: 0.8614
Epoch 004 | Train Loss: 0.7171
Epoch 005 | Train Loss: 0.7806
Epoch 006 | Train Loss: 0.6996
Epoch 007 | Train Loss: 0.7479
Epoch 008 | Train Loss: 0.6480
Epoch 009 | Train Loss: 0.6611
Epoch 010 | Train Loss: 0.6936
Epoch 011 | Train Loss: 0.5270
Epoch 012 | Train Loss: 0.6332
Epoch 013 | Train Loss: 0.5432
Epoch 014 | Train Loss: 0.4797
Epoch 015 | Train Loss: 0.5168
Epoch 016 | Train Loss: 0.5267
Epoch 017 | Train Loss: 0.3859
Epoch 018 | Train Loss: 0.3879
Epoch 019 | Train Loss: 0.4463
Epoch 020 | Train Loss: 0.3327
Epoch 021 | Train Loss: 0.3751
Epoch 022 | Train Loss: 0.4307
Epoch 023 | Train Loss: 0.2717
Epoch 024 | Train Loss: 0.3268
Epoch 025 | Train Loss: 0.3669
Epoch 026 | Train Loss: 0.3611
Epoch 027 | Train Loss: 0.2674
Epoch 028 | Train Loss: 0.2336
Epoch 029 | Train Loss: 0.2312
Epoch 030 | Train Loss: 0.2365
Epoch 031 | Train Loss: 0.2861
Epoch 032 | Train Loss: 0.1222
Epoch 03

In [19]:
# Node types of the first graph: element 0 of its metadata() tuple.
all_data[0].metadata()[0]

['ComponentDefinition',
 'Sequence',
 'ModuleDefinition',
 'Module',
 'FunctionalComponent',
 'Component',
 'SequenceAnnotation',
 'Range']