In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

In [2]:
def parse_attributes(attributes):
    """Collect the direct ``Attribute`` children of ``attributes`` into a dict.

    Args:
        attributes: An element whose ``Attribute`` children are read.

    Returns:
        dict: Maps each child's ``attribute_name`` XML attribute to that
        child's text. If several children share a name, the last one wins.
    """
    return {
        node.get('attribute_name'): node.text
        for node in attributes.findall('Attribute')
    }

In [3]:
def _find_text(element, path):
    """Return the text of the first match for ``path`` under ``element``, or None if absent."""
    if element is None:
        return None
    node = element.find(path)
    return node.text if node is not None else None


def _find_attr(element, path, attr_name):
    """Return XML attribute ``attr_name`` of the first match for ``path``, or None if absent."""
    if element is None:
        return None
    node = element.find(path)
    return node.get(attr_name) if node is not None else None


# define function for child node
def parse_biosample(biosample):
    """Flatten one ``BioSample`` element into a dict of column name -> value.

    Every lookup is None-safe: an element that is missing from the record
    yields ``None`` for its key instead of raising ``AttributeError``, the same
    way a missing ``Status`` element is treated.

    Args:
        biosample: The ``BioSample`` element to read.

    Returns:
        dict: The fixed keys ``ID``, ``Title``, ``OrganismName``,
        ``OwnerName``, ``OwnerEmail``, ``Model``, ``Package`` and ``Status``
        (in that order), followed by one key per entry returned by
        ``parse_attributes`` when an ``Attributes`` child exists.
    """
    # Owner fields are read from the first Owner element only.
    owner = biosample.find('Owner')

    biosample_data = {
        'ID': biosample.get('accession'),
        'Title': _find_text(biosample, 'Description/Title'),
        'OrganismName': _find_text(biosample, 'Description/Organism/OrganismName'),
        'OwnerName': _find_text(owner, 'Name'),
        'OwnerEmail': _find_attr(owner, 'Contacts/Contact', 'email'),
        'Model': _find_text(biosample, 'Models/Model'),
        'Package': _find_attr(biosample, 'Package', 'display_name'),
        'Status': _find_attr(biosample, 'Status', 'status'),
    }

    attributes = biosample.find('Attributes')
    if attributes is not None:
        # Merged last: an attribute whose name equals one of the fixed keys
        # above replaces that key's value.
        biosample_data.update(parse_attributes(attributes))
    return biosample_data

In [4]:


def xml_to_dataframe(xml_file):
    """Parse an XML file into a DataFrame with one row per ``BioSample``.

    Args:
        xml_file: Passed straight to ``ET.parse``.

    Returns:
        pandas.DataFrame: Built from the dicts that ``parse_biosample``
        returns for each ``BioSample`` element directly under the root.
    """
    document_root = ET.parse(xml_file).getroot()

    # Only direct children of the root are considered.
    records = [
        parse_biosample(sample)
        for sample in document_root.findall('BioSample')
    ]
    return pd.DataFrame(records)

In [5]:
# Location of the XML file to convert.
# NOTE(review): absolute, user-specific path — this cell only runs on a machine
# where that exact file exists; consider a configurable data directory.
xml_file_path = '/home/supakorn/biosample_result.xml'
# Build the DataFrame from the file via xml_to_dataframe.
df = xml_to_dataframe(xml_file_path)
# Plain-text dump of the frame to stdout.
print(df)

               ID                        Title  OrganismName        OwnerName  \
0    SAMN19715322   pPDsHSrSNxi3322d200429PosH  Homo sapiens  Broad Institute   
1    SAMN19715320   pPDsHSrSNxi2544d200429PosA  Homo sapiens  Broad Institute   
2    SAMN19715318   pPDsHSrSNxi2544d200429PosC  Homo sapiens  Broad Institute   
3    SAMN19715316  pPDsHSrSNxi3322d200429DAPIC  Homo sapiens  Broad Institute   
4    SAMN19715315   pPDsHSrSNxi2569d200429PosA  Homo sapiens  Broad Institute   
..            ...                          ...           ...              ...   
96   SAMN19715165  pPDsHSrSNxi3482d200429DAPIB  Homo sapiens  Broad Institute   
97   SAMN19715164   pPDsHSrSNxi4956d200429PosA  Homo sapiens  Broad Institute   
98   SAMN19715163  pPDsHSrSNxi3482d200429DAPIA  Homo sapiens  Broad Institute   
99   SAMN19715162   pPDsHSrSNxi3322d200429PosF  Homo sapiens  Broad Institute   
100  SAMN19715161  pPDsHSrSNxi6173d200429DAPIA  Homo sapiens  Broad Institute   

                      Owner

In [6]:
# Bare expression: the notebook shows the DataFrame as this cell's output.
df

Unnamed: 0,ID,Title,OrganismName,OwnerName,OwnerEmail,Model,Package,Status,source_name,tissue,Sex,age,pmi,disease,status,Cause of death,facs classification,10x version,genome build
0,SAMN19715322,pPDsHSrSNxi3322d200429PosH,Homo sapiens,Broad Institute,emacosko@broadinstitute.org,Generic,Generic,live,Human Substantia nigra pars compacta,Substantia nigra pars compacta,female,90,22,Ctrl,Ctrl,Colon cancer,Positive,V3,hg19
1,SAMN19715320,pPDsHSrSNxi2544d200429PosA,Homo sapiens,Broad Institute,emacosko@broadinstitute.org,Generic,Generic,live,Human Substantia nigra pars compacta,Substantia nigra pars compacta,female,89,10,Disease,LBD,Lewy body dementia,Positive,V3,hg19
2,SAMN19715318,pPDsHSrSNxi2544d200429PosC,Homo sapiens,Broad Institute,emacosko@broadinstitute.org,Generic,Generic,live,Human Substantia nigra pars compacta,Substantia nigra pars compacta,female,89,10,Disease,LBD,Lewy body dementia,Positive,V3,hg19
3,SAMN19715316,pPDsHSrSNxi3322d200429DAPIC,Homo sapiens,Broad Institute,emacosko@broadinstitute.org,Generic,Generic,live,Human Substantia nigra pars compacta,Substantia nigra pars compacta,female,90,22,Ctrl,Ctrl,Colon cancer,Negative,V3,hg19
4,SAMN19715315,pPDsHSrSNxi2569d200429PosA,Homo sapiens,Broad Institute,emacosko@broadinstitute.org,Generic,Generic,live,Human Substantia nigra pars compacta,Substantia nigra pars compacta,male,74,13,Disease,LBD,Lewy body dementia,Positive,V3,hg19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,SAMN19715165,pPDsHSrSNxi3482d200429DAPIB,Homo sapiens,Broad Institute,emacosko@broadinstitute.org,Generic,Generic,live,Human Substantia nigra pars compacta,Substantia nigra pars compacta,female,79,14,Ctrl,Ctrl,Coronary artery disease,Negative,V3,hg19
97,SAMN19715164,pPDsHSrSNxi4956d200429PosA,Homo sapiens,Broad Institute,emacosko@broadinstitute.org,Generic,Generic,live,Human Substantia nigra pars compacta,Substantia nigra pars compacta,female,92,23.3,Ctrl,Ctrl,Prostate cancer,Positive,V3,hg19
98,SAMN19715163,pPDsHSrSNxi3482d200429DAPIA,Homo sapiens,Broad Institute,emacosko@broadinstitute.org,Generic,Generic,live,Human Substantia nigra pars compacta,Substantia nigra pars compacta,female,79,14,Ctrl,Ctrl,Coronary artery disease,Negative,V3,hg19
99,SAMN19715162,pPDsHSrSNxi3322d200429PosF,Homo sapiens,Broad Institute,emacosko@broadinstitute.org,Generic,Generic,live,Human Substantia nigra pars compacta,Substantia nigra pars compacta,female,90,22,Ctrl,Ctrl,Colon cancer,Positive,V3,hg19
