In [85]:
import os
import glob
import pandas as pd
import xml.etree.ElementTree as ET

# Empty premise table: declares the output columns before any rows are added
df_premises = pd.DataFrame(
    columns=[
        'document',
        'file_path',
        'split',
        'type',
        'logos',
        'pathos',
        'ethos',
    ],
)

def parse_xml(xml_path):
    """Collect the text and ``type`` attribute of every ``premise`` element.

    Args:
        xml_path: File name (or file object) handed straight to ``ET.parse``.

    Returns:
        A list of ``(text, type)`` tuples, one per ``premise`` element, in
        document order. ``text`` is ``None`` for an element without text.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        KeyError: If a ``premise`` element has no ``type`` attribute.
    """
    premises = []
    # iter('premise') walks the whole tree and yields only matching tags.
    for node in ET.parse(xml_path).iter('premise'):
        premises.append((node.text, node.attrib['type']))
    return premises

# Retrieve premises from all XML files.
# Rows are gathered in a plain list and turned into a DataFrame once at the
# end; calling pd.concat inside the loop re-copies the whole frame for every
# premise.
premise_rows = []
unparsed_files = []
for split in ['positive', 'negative']:
    for path in glob.glob(os.path.join(f'v2.0/{split}', '*.xml')):
        try:
            premises = parse_xml(path)
        except ET.ParseError as err:
            # Best-effort: a malformed file is skipped, but it is recorded so
            # the omission is reported below instead of passing silently.
            unparsed_files.append((path, err))
            continue

        # Take the document id from the file name via os.path instead of
        # splitting the path on '/', which fails when glob returns '\\'
        # separators (Windows).
        document = os.path.basename(path).split('.')[0]
        # Only the premise type is recorded; the premise text is not stored.
        for _premise_text, premise_type in premises:
            premise_rows.append({
                'document': document,
                'file_path': path,
                'split': split,
                'type': premise_type,
                # The type label combines appeal names (e.g. 'logos_pathos'),
                # so a substring test yields one flag per appeal.
                'logos': 'logos' in premise_type,
                'pathos': 'pathos' in premise_type,
                'ethos': 'ethos' in premise_type,
            })

for path, err in unparsed_files:
    print(f'Skipped malformed XML file {path}: {err}')

# Reuse the column list declared on the empty frame above so that the result
# keeps the same columns even when no premise was found.
df_premises = pd.DataFrame(premise_rows, columns=df_premises.columns)

# Make sure the output directory exists before writing the pickle.
os.makedirs('./v2.0-processed', exist_ok=True)
df_premises.to_pickle('./v2.0-processed/df_premises.pickle')

In [86]:
# Display the assembled premise table as this cell's output
df_premises

Unnamed: 0,document,file_path,split,type,logos,pathos,ethos
0,88,v2.0/positive/88.xml,positive,logos,True,False,False
1,88,v2.0/positive/88.xml,positive,logos_pathos,True,True,False
2,88,v2.0/positive/88.xml,positive,logos_pathos,True,True,False
3,88,v2.0/positive/88.xml,positive,logos_pathos,True,True,False
4,88,v2.0/positive/88.xml,positive,logos_pathos,True,True,False
...,...,...,...,...,...,...,...
1953,85,v2.0/negative/85.xml,negative,logos_pathos,True,True,False
1954,85,v2.0/negative/85.xml,negative,logos_pathos,True,True,False
1955,85,v2.0/negative/85.xml,negative,logos,True,False,False
1956,85,v2.0/negative/85.xml,negative,logos,True,False,False
