In [9]:
import pandas as pd
import networkx as nx
issues = []
df = pd.read_csv("data/onto_x.csv", names=["Class ID", "Preferred Label", "Parents"], header=0)
print("Dataset Preview")
print(df.head())

Dataset Preview
                                Class ID          Preferred Label  \
0          http://entity/CST/HYPOCHLOREM            HYPOCHLOREMIA   
1      http://entity/CST/EXTRAPYR%20SYND  EXTRAPYRAMIDAL SYNDROME   
2  http://entity/CST/VASCULITIS%20KIDNEY        KIDNEY VASCULITIS   
3            http://entity/CST/SKIN/DERM               Dermatoses   
4       http://entity/CST/FIBRO%20KIDNEY          KIDNEY FIBROSIS   

                                             Parents  
0  http://entity/CST/METGEN|http://entity/CST/MAN...  
1                          http://entity/CST/NERMOVE  
2  http://entity/CST/PATHCOLLAGEN|http://entity/C...  
3                                                NaN  
4  http://entity/CST/UG/UT/K/M|http://entity/CST/...  


In [13]:
#Empty Class IDs
empty_ids = df[df["Class ID"].isna() | (df["Class ID"].str.strip() == "")]
print("Empty Class IDs:")
print(empty_ids.to_string(index=False))

Empty Class IDs:
Empty DataFrame
Columns: [Class ID, Preferred Label, Parents]
Index: []


In [14]:
#Empty Labels
empty_labels = df[df["Preferred Label"].isna() | (df["Preferred Label"].str.strip() == "")]
print("Empty Labels:")
print(empty_labels.to_string(index=False))

Empty Labels:
Empty DataFrame
Columns: [Class ID, Preferred Label, Parents]
Index: []


In [11]:
no_parents = df[df['Parents'].isna() | (df['Parents'] == '')]
print("Entities without parents:")
print(no_parents[['Class ID', 'Preferred Label']])


Entities without parents:
                              Class ID  \
3          http://entity/CST/SKIN/DERM   
24         http://entity/CST/SKIN/SBGL   
102              http://entity/CST/IOS   
103    http://entity/CST/STENO%20ESOPH   
105        http://entity/CST/SKIN/SUBQ   
108        http://entity/CST/SKIN/HAIR   
190              http://entity/CST/FBC   
236              http://entity/CST/FET   
283              http://entity/CST/FDU   
344    http://entity/CST/ENDO/PIT/POST   
362              http://entity/CST/TTT   
458              http://entity/CST/OOT   
463              http://entity/CST/MTD   
478         http://entity/CST/SKIN/PIG   
572              http://entity/CST/TTO   
580     http://entity/CST/ENDO/PIT/ANT   
614              http://entity/CST/NAI   
750              http://entity/CST/OTT   
759              http://entity/CST/IBM   
825             http://entity/STY/T071   
868              http://entity/CST/TOT   
871        http://entity/CST/SKIN/NAIL   
874     

In [16]:
graph = {row["Class ID"]: [p.strip() for p in str(row["Parents"]).split("|") if p.strip()] 
         for _, row in df.iterrows()}

all_ids = set(df["Class ID"])

# No Parents
missing_refs = []
for cid, parents in graph.items():
    for parent in parents:
        if parent not in all_ids:
            missing_refs.append((cid, parent))

print("Missing Parent References:")
for cid, parent in missing_refs:
    print(f"- {cid} -> {parent}")


Missing Parent References:
- http://entity/CST/SKIN/DERM -> nan
- http://entity/CST/SKIN/SBGL -> nan
- http://entity/CST/MS -> http://www.w3.org/2002/07/owl#Thing
- http://entity/CST/IOS -> nan
- http://entity/CST/STENO%20ESOPH -> nan
- http://entity/CST/SKIN/SUBQ -> nan
- http://entity/CST/SKIN/HAIR -> nan
- http://entity/CST/DIG -> http://www.w3.org/2002/07/owl#Thing
- http://entity/CST/FBC -> nan
- http://entity/CST/NER -> http://www.w3.org/2002/07/owl#Thing
- http://entity/CST/MAN -> http://www.w3.org/2002/07/owl#Thing
- http://entity/CST/FET -> nan
- http://entity/CST/UG -> http://www.w3.org/2002/07/owl#Thing
- http://entity/CST/FDU -> nan
- http://entity/CST/ENDO/PIT/POST -> nan
- http://entity/CST/TTT -> nan
- http://entity/CST/ENDO -> http://www.w3.org/2002/07/owl#Thing
- http://entity/CST/OOT -> nan
- http://entity/CST/GU -> http://www.w3.org/2002/07/owl#Thing
- http://entity/CST/MTD -> nan
- http://entity/CST/SKIN/PIG -> nan
- http://entity/CST/ANS -> http://www.w3.org/2002/0

In [18]:
import networkx as nx
import pandas as pd

# Build graph safely
G = nx.DiGraph()
for _, row in df.iterrows():
    child = row['Class ID'].strip()
    parents_raw = str(row['Parents']).strip()
    if parents_raw and parents_raw.lower() != 'nan':  
        # Split by '|', strip each parent
        parents = [p.strip() for p in parents_raw.split('|') if p.strip()]
        for parent in parents:
            G.add_edge(child, parent)

# Detect cycles
cycles = list(nx.simple_cycles(G))
if cycles:
    print("Cycles detected (potential infinite loop):")
    print(cycles)
else:
    print("No cycles detected, DFS safe to use.")


Cycles detected (potential infinite loop):
[['http://entity/CST/HEMHMRG', 'http://entity/CST/HEM']]


In [20]:
# Cycles detection

def detect_cycles(graph):
    visited, stack = set(), set()
    cycles = []

    def dfs(node, path):
        if node in stack:
            cycles.append(path + [node])  # cycle found
            return
        if node in visited:
            return
        visited.add(node)
        stack.add(node)
        for p in graph.get(node, []):
            if p in graph:  # only traverse valid parents
                dfs(p, path + [node])
        stack.remove(node)

    for n in graph:
        dfs(n, [])
    return cycles

cycles = detect_cycles(graph)

print("🔄 Cycles found:")
if not cycles:
    print("✅ No cycles detected")
else:
    for cycle in cycles:
        print("\nCycle detected:")
        for cid in cycle:
            row = df[df["Class ID"] == cid].iloc[0]
            print(f"- {cid} | {row['Preferred Label']}")


🔄 Cycles found:

Cycle detected:
- http://entity/CST/LEUKOCYTE%20VACUOL | LEUKOCYTE VACUOLIZATION
- http://entity/CST/HEMWBCABN | WBC ABNORMALITY
- http://entity/CST/HEM | HEMORRHAGE
- http://entity/CST/HEMHMRG | HEMORRHAGE
- http://entity/CST/HEM | HEMORRHAGE
