In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

def parse_duo_ontology(file_path):
    """Extract the non-deprecated classes and object properties of an OWL RDF/XML file.

    Only ``owl:Class`` and ``owl:ObjectProperty`` elements that are direct
    children of the document root are inspected. Terms flagged with
    ``owl:deprecated`` equal to ``true`` are skipped, as are elements that
    carry neither an ``oboInOwl:id`` nor an ``rdfs:label``.

    Parameters
    ----------
    file_path : str or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.

    Returns
    -------
    pandas.DataFrame
        One row per term with the columns ``ID``, ``TYPE`` (``'Class'`` or
        ``'Property'``), ``LABEL`` and ``DEFINITION``, sorted by ``ID``. The
        frame is empty (but still has these four columns) when no term matches.
    str
        If the file cannot be read or parsed, the text
        ``"ERROR PARSING FILE: <reason>"`` is returned instead of a DataFrame.
        Callers must check for this before using the result as a frame.
    """
    # NAMESPACE PREFIXES USED IN THE RDF/XML FILE
    ns = {
        'owl': 'http://www.w3.org/2002/07/owl#',
        'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
        'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
        'obo': 'http://purl.obolibrary.org/obo/',
        'oboInOwl': 'http://www.geneontology.org/formats/oboInOwl#'
    }

    # FIXED COLUMN LAYOUT, SO AN EMPTY RESULT STILL HAS AN 'ID' COLUMN TO SORT ON
    columns = ['ID', 'TYPE', 'LABEL', 'DEFINITION']

    try:
        root = ET.parse(file_path).getroot()
    except Exception as e:
        # KEPT FOR BACKWARD COMPATIBILITY: LOAD FAILURES ARE REPORTED AS A STRING
        return f"ERROR PARSING FILE: {e}"

    ontology_data = []

    # SEARCH FOR BOTH CLASSES AND OBJECT PROPERTIES
    tags_to_find = [
        ('owl:Class', 'Class'),
        ('owl:ObjectProperty', 'Property')
    ]

    for tag, category in tags_to_find:
        for item in root.findall(tag, ns):
            # SKIP DEPRECATED TERMS; AN EMPTY <owl:deprecated/> HAS text == None,
            # SO FALL BACK TO '' INSTEAD OF CALLING .lower() ON None
            deprecated = item.find('owl:deprecated', ns)
            if deprecated is not None and (deprecated.text or '').strip().lower() == 'true':
                continue

            # EXTRACT CORE METADATA
            duo_id_elem = item.find('oboInOwl:id', ns)
            label_elem = item.find('rdfs:label', ns)
            def_elem = item.find('obo:IAO_0000115', ns)

            # SKIP ELEMENTS THAT HAVE NEITHER AN ID NOR A LABEL (E.G. ANONYMOUS CLASSES)
            if duo_id_elem is None and label_elem is None:
                continue

            ontology_data.append({
                'ID': duo_id_elem.text if duo_id_elem is not None else 'N/A',
                'TYPE': category,
                'LABEL': label_elem.text if label_elem is not None else 'N/A',
                'DEFINITION': def_elem.text if def_elem is not None else 'NO DEFINITION PROVIDED'
            })

    # CREATE A DATAFRAME AND SORT BY ID
    df = pd.DataFrame(ontology_data, columns=columns)
    df = df.sort_values(by='ID').reset_index(drop=True)

    return df

# USAGE
# SET file_path TO THE LOCATION OF YOUR DOWNLOADED ONTOLOGY FILE (E.G. 'duo.owl')
file_path = '../access.xml'
df_results = parse_duo_ontology(file_path)

# parse_duo_ontology RETURNS AN ERROR STRING INSTEAD OF A DATAFRAME WHEN THE FILE
# CANNOT BE LOADED; FAIL HERE WITH THAT MESSAGE RATHER THAN LATER ON .head()
if isinstance(df_results, str):
    raise RuntimeError(df_results)

# VIEW THE FIRST FEW ROWS
print(df_results.head(20))

# OPTIONAL: SAVE TO A CLEAN EXCEL OR CSV FILE
# df_results.to_csv('duo_table.csv', index=False)

             ID      TYPE                                            LABEL  \
0   DUO:0000001     Class                              data use permission   
1   DUO:0000004     Class                                   no restriction   
2   DUO:0000005     Class  obsolete general research use and clinical care   
3   DUO:0000006     Class         health or medical or biomedical research   
4   DUO:0000007     Class                        disease specific research   
5   DUO:0000010  Property                                 is restricted to   
6   DUO:0000011     Class     population origins or ancestry research only   
7   DUO:0000012     Class                   research specific restrictions   
8   DUO:0000014     Class                       obsolete research use only   
9   DUO:0000015     Class                      no general methods research   
10  DUO:0000016     Class                             genetic studies only   
11  DUO:0000017     Class                                data us

In [None]:
# NOTE(REVIEW): INCOMPLETE EXPRESSION - PRESUMABLY A HALF-TYPED to_* EXPORT CALL (E.G. to_csv).
# TODO: COMPLETE THE INTENDED CALL OR DELETE THIS CELL; CONFIRM WITH THE AUTHOR.
df_results.to

In [4]:
# SHOW ONLY THE LABEL AND DEFINITION COLUMNS, SELECTED BY NAME INSTEAD OF BY POSITION
df_results[['LABEL', 'DEFINITION']]

Unnamed: 0,LABEL,DEFINITION
0,data use permission,A data item that is used to indicate consent p...
1,no restriction,This data use permission indicates there is no...
2,obsolete general research use and clinical care,This data use limitation indicates that use is...
3,health or medical or biomedical research,This data use permission indicates that use is...
4,disease specific research,This data use permission indicates that use is...
5,is restricted to,A is restricted to B iff A is a consent code a...
6,population origins or ancestry research only,This data use permission indicates that use of...
7,research specific restrictions,This data use modifier indicates that use is l...
8,obsolete research use only,This data use limitation indicates that use is...
9,no general methods research,This data use modifier indicates that use does...


In [15]:
# [LABEL, DEFINITION] PAIRS FOR EVERY TERM WHOSE DEFINITION CONTAINS "modifier".
# na=False TREATS A MISSING DEFINITION AS "NO MATCH" INSTEAD OF RAISING.
is_modifier = df_results['DEFINITION'].str.contains('modifier', regex=False, na=False)
df_results.loc[is_modifier, ['LABEL', 'DEFINITION']].values.tolist()

[['research specific restrictions',
  'This data use modifier indicates that use is limited to studies of a certain research type.'],
 ['no general methods research',
  'This data use modifier indicates that use does not allow methods development research (e.g., development of software or algorithms).'],
 ['genetic studies only',
  'This data use modifier indicates that use is limited to genetic studies only (i.e., studies that include genotype research alone or both genotype and phenotype research, but not phenotype research exclusively)'],
 ['data use modifier',
  'Data use modifiers indicate additional conditions for use.'],
 ['not for profit, non commercial use only',
  'This data use modifier indicates that use of the data is limited to not-for-profit organizations and not-for-profit use, non-commercial use.'],
 ['publication required',
  'This data use modifier indicates that requestor agrees to make results of studies using the data available to the larger scientific community.'