In [1]:
import pandas as pd
from rdflib import Graph
from itertools import combinations

In [None]:
def getOntology_pairs(path_to_ont):
    """Load a Turtle ontology and return its labelled superclass/subclass pairs.

    The query selects every ``?Subclass rdfs:subClassOf ?Class`` statement in
    which both ends carry an ``rdfs:label`` and ``?Class`` is not a blank node.

    Parameters
    ----------
    path_to_ont :
        Source of the ontology, passed to ``Graph.parse`` with ``format="ttl"``.

    Returns
    -------
    pandas.DataFrame
        Columns ``clsLabel`` (label of the superclass) and ``sbLabel`` (label
        of the subclass), one row per query solution.  When the query has no
        solutions an empty frame with these two columns is returned.
    """
    g = Graph()
    g.parse(path_to_ont, format="ttl")

    # Plain string: nothing is interpolated, so no f-string brace escaping.
    query = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT DISTINCT ?Class ?Subclass ?sbLabel ?clsLabel
    WHERE {
        ?Subclass rdfs:subClassOf ?Class .
            ?Class rdfs:label ?clsLabel .
            ?Subclass rdfs:label ?sbLabel .
            FILTER (!isBlank(?Class))
    }

    """

    qres = g.query(query)

    df = pd.DataFrame(qres.bindings)
    if df.empty:
        # No solutions means no columns at all; selecting the label columns
        # below would fail, so hand back a well-formed empty frame instead.
        return pd.DataFrame(columns=['clsLabel', 'sbLabel'])

    # Remove spaces from the column labels so they can be selected by name.
    # NOTE(review): presumably this also turns the rdflib variable keys into
    # plain strings - confirm before simplifying this line.
    df.columns = df.columns.str.replace(' ', '')
    df = df[['clsLabel', 'sbLabel']].copy()

    return df

In [None]:
# label = 0, meaning for data (A,B) A is NOT_DIRECTLY_RELATED to B.
def create_negativeDF(data_df, superclass_df, subclass_df):
    """Build label-0 (not directly related) pairs by mis-pairing the true pairs.

    The ``clsLabel`` column is reversed, so the subclass in row ``i`` is paired
    with the superclass of row ``n - 1 - i``.  Any resulting pair that also
    appears as a ``(classA, classB)`` row of ``superclass_df`` or
    ``subclass_df`` is a genuine relation and is dropped, e.g. the middle row
    of an odd-length frame, which the reversal leaves unchanged.

    Parameters
    ----------
    data_df : pandas.DataFrame
        True pairs with columns ``clsLabel`` and ``sbLabel``.  Not modified.
    superclass_df, subclass_df : pandas.DataFrame
        Positive examples with columns ``classA`` and ``classB``.

    Returns
    -------
    pandas.DataFrame
        Columns ``classA``, ``classB`` and ``label`` (always 0).  Surviving
        rows keep their index labels from ``data_df``.
    """
    negative_df = data_df.copy()
    negative_df['clsLabel'] = negative_df['clsLabel'].values[::-1]
    negative_df = negative_df.rename(columns={'clsLabel': 'classA', 'sbLabel': 'classB'})

    # Compare whole (classA, classB) tuples.  DataFrame.isin(other_frame)
    # aligns on index and column labels, so it cannot express this anti-join.
    related_pairs = set()
    for related_df in (superclass_df, subclass_df):
        related_pairs.update(zip(related_df['classA'], related_df['classB']))

    candidate_pairs = zip(negative_df['classA'], negative_df['classB'])
    # An explicit bool dtype keeps the mask valid when there are no rows.
    keep_mask = pd.Series(
        [pair not in related_pairs for pair in candidate_pairs], dtype=bool
    ).to_numpy()

    negative_df = negative_df.loc[keep_mask].copy()
    negative_df["label"] = 0
    return negative_df

# label = 1, meaning for data (A,B) A is SUPERCLASS of B.
def create_superclassDF(data_df):
    """Return the pairs as (superclass, subclass) examples tagged with label 1.

    ``clsLabel`` becomes ``classA`` and ``sbLabel`` becomes ``classB``; a
    constant ``label`` column equal to 1 is appended.  ``data_df`` itself is
    left unchanged.
    """
    column_map = {'clsLabel': 'classA', 'sbLabel': 'classB'}
    return data_df.copy().rename(columns=column_map).assign(label=1)

# label = 2, meaning for data (A,B) A is SUBCLASS of B.
def create_subclassDF(data_df):
    """Return the pairs as (subclass, superclass) examples tagged with label 2.

    ``sbLabel`` becomes ``classA`` and ``clsLabel`` becomes ``classB``; the
    columns keep their original positions, only their names change.  A constant
    ``label`` column equal to 2 is appended.  ``data_df`` itself is left
    unchanged.
    """
    column_map = {'clsLabel': 'classB', 'sbLabel': 'classA'}
    return data_df.copy().rename(columns=column_map).assign(label=2)

In [None]:
def create_012DF(data_df):
    """Stack the label-0, label-1 and label-2 example frames built from ``data_df``.

    The frames are concatenated row-wise in the order negative, superclass,
    subclass.  The index is not reset, so index labels repeat across the
    three parts.
    """
    sub_pairs = create_subclassDF(data_df)
    super_pairs = create_superclassDF(data_df)
    unrelated_pairs = create_negativeDF(data_df, super_pairs, sub_pairs)

    return pd.concat([unrelated_pairs, super_pairs, sub_pairs], axis=0)

In [None]:
def getOntology_subclasses(path_to_ont):
    """Load a Turtle ontology and list, per superclass label, its subclass labels.

    Solutions of ``?subclass rdfs:subClassOf ?superclass`` (both ends labelled
    with ``rdfs:label``) are grouped by the superclass label.  Only groups with
    more than one subclass are kept.

    Parameters
    ----------
    path_to_ont :
        Source of the ontology, passed to ``Graph.parse`` with ``format="ttl"``.

    Returns
    -------
    pandas.DataFrame
        Columns ``clsLabel`` (superclass label) and ``sbLabel`` (the group's
        subclass labels joined with ``|``).  When the query has no solutions
        an empty frame with these two columns is returned.
    """
    g = Graph()
    g.parse(path_to_ont, format="ttl")

    # rdfs: is declared explicitly, as in getOntology_pairs, so the query does
    # not depend on whichever prefixes happen to be bound on the graph.
    # NOTE(review): the GROUP_CONCAT alias reuses the name ?sbLabel; this is
    # kept as-is because the returned column name depends on it.
    query = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?clsLabel (GROUP_CONCAT(?sbLabel; separator="|") AS ?sbLabel)
    WHERE {
        ?subclass rdfs:subClassOf ?superclass .

        ?subclass rdfs:label ?sbLabel .
        ?superclass rdfs:label ?clsLabel .
    }
    GROUP BY ?clsLabel
    HAVING (COUNT(?subclass) > 1)  # Only show superclasses with multiple subclasses

    """

    qres = g.query(query)

    df = pd.DataFrame(qres.bindings)
    if df.empty:
        # No solutions means no columns at all; selecting the label columns
        # below would fail, so hand back a well-formed empty frame instead.
        return pd.DataFrame(columns=['clsLabel', 'sbLabel'])

    # Remove spaces from the column labels so they can be selected by name.
    # NOTE(review): presumably this also turns the rdflib variable keys into
    # plain strings - confirm before simplifying this line.
    df.columns = df.columns.str.replace(' ', '')
    df = df[['clsLabel', 'sbLabel']].copy()

    return df

In [None]:
def generate_same_combinations(row):
    """Return every 2-element combination of the ``|``-separated items in ``row``.

    Pairs are tuples in the order produced by ``itertools.combinations``; a
    string with fewer than two items yields an empty list.
    """
    labels = row.split("|")
    return list(combinations(labels, 2))

In [1]:
# label = 3, meaning for data (A,B) A is FROM_THE_SAME_SUPERCLASS as B.
def create_sameDF(data_df_label3, df_length, random_state=None):
    """Build label-3 (same superclass) pairs, each one in both orientations.

    Every ``sbLabel`` entry is expanded with ``generate_same_combinations``.
    The resulting pairs are shuffled, at most ``int(df_length / 2)`` of them
    are kept, and each kept pair ``(A, B)`` is complemented with ``(B, A)``.

    Parameters
    ----------
    data_df_label3 : pandas.DataFrame
        Must provide an ``sbLabel`` column of ``|``-joined sibling labels.
    df_length : int
        Target number of rows; half come from sampled pairs and half from
        their mirrored copies.  Negative values are treated as 0.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the selection can be reproduced.
        The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``classA``, ``classB`` and ``label`` (always 3): the sampled
        pairs first, then their mirrored copies, with a fresh integer index.
    """
    columns = ["classA", "classB", "label"]

    # Collect the pairs directly; entries that yield no pair simply contribute
    # nothing, and an empty input gives an empty (but well-formed) frame.
    pairs = [
        pair
        for joined_labels in data_df_label3["sbLabel"]
        for pair in generate_same_combinations(joined_labels)
    ]
    pair_df = pd.DataFrame(pairs, columns=["classA", "classB"])
    pair_df["label"] = 3
    if pair_df.empty:
        return pair_df[columns]

    half = max(0, int(df_length / 2))
    sampled = (
        pair_df.sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
        .iloc[:half]
    )

    # Swapping the column names mirrors each pair; reselecting the columns
    # makes the resulting order explicit instead of relying on concat alignment.
    mirrored = sampled.rename(columns={"classA": "classB", "classB": "classA"})[columns]

    sameclass_df = pd.concat([sampled[columns], mirrored], ignore_index=True)
    return sameclass_df

In [2]:
def create_trainingDF(path_to_ont):
    """Build the shuffled training frame for the ontology at ``path_to_ont``.

    The frame from ``create_012DF`` is combined with the frame from
    ``create_sameDF``, which is asked for one third as many rows.  The
    combined rows are shuffled and re-indexed from 0.
    """
    pair_labels = getOntology_pairs(path_to_ont)
    sibling_labels = getOntology_subclasses(path_to_ont)

    relation_df = create_012DF(pair_labels)
    # divide by 3 to match length of each labels
    rows_per_label = int(len(relation_df.index) / 3)
    sibling_df = create_sameDF(sibling_labels, rows_per_label)

    combined_df = pd.concat([relation_df, sibling_df], axis=0)
    return combined_df.sample(frac=1).reset_index(drop=True)