In [1]:
import os
import glob
import pandas as pd
import xml.etree.ElementTree as ET

# Toggle: when True, the assembled premise table is pickled to disk further down.
save = True

# Empty premise table. Declaring the columns up front fixes the schema (and the
# column order) before any rows are added.
df_premises = pd.DataFrame(
    columns=[
        'document',   # XML file name without its extension
        'file_path',  # path of the XML file the premise was read from
        'split',      # sub-folder the file lives in ('positive' or 'negative')
        'premise',    # text of the <premise> element
        'type',       # value of the element's 'type' attribute
        'logos',      # True when 'logos' occurs in the type string
        'pathos',     # True when 'pathos' occurs in the type string
        'ethos',      # True when 'ethos' occurs in the type string
    ],
)

def parse_xml(xml_path):
    """Collect all ``<premise>`` elements of one XML document.

    Parameters
    ----------
    xml_path : str or path-like
        Location of the XML file to read.

    Returns
    -------
    list of tuple
        One ``(text, type)`` pair per ``<premise>`` element, in document order.
        ``text`` is the element's ``.text`` (``None`` if the element has no
        leading text) and ``type`` is the value of its ``type`` attribute.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the file is not well-formed XML.
    KeyError
        If a ``<premise>`` element has no ``type`` attribute.
    """
    document = ET.parse(xml_path)
    premises = []
    # iter('premise') walks the whole tree and yields only elements with that tag.
    for node in document.iter('premise'):
        premises.append((node.text, node.attrib['type']))
    return premises

# Retrieve premises from all XML files.
# Rows are gathered in a plain list and converted to a DataFrame once at the end;
# calling pd.concat for every premise would re-copy the whole frame each time.
records = []
for split in ['positive', 'negative']:
    # NOTE(review): glob.glob returns matches in filesystem order, so the row order
    # may differ between machines; wrap in sorted() if a stable order is required.
    for path in glob.glob(os.path.join(f'v2.0/{split}', '*.xml')):
        try:
            premises = parse_xml(path)
        except ET.ParseError as e:
            # Malformed file: report it and carry on with the remaining files.
            print(f'Could not parse: {path}')
            print(f'{e}\n')
            continue

        # File name up to its first '.', e.g. 'v2.0/positive/68.xml' -> '68'.
        # os.path.basename is used instead of splitting on '/' because os.path.join
        # may have inserted a different separator (e.g. '\\' on Windows).
        document = os.path.basename(path).split('.')[0]

        for premise, premise_type in premises:
            records.append({
                'document': document,
                'file_path': path,
                # The loop variable is the folder name, so no path parsing is needed.
                'split': split,
                'premise': premise,
                'type': premise_type,
                # The type string may combine several modes (e.g. 'logos_pathos'),
                # hence substring checks rather than equality.
                'logos': 'logos' in premise_type,
                'pathos': 'pathos' in premise_type,
                'ethos': 'ethos' in premise_type,
            })

# Reuse the column layout declared for df_premises so the result keeps the same
# schema even when no premise was found at all.
df_premises = pd.DataFrame(records, columns=list(df_premises.columns))

if save:
    output_path = './v2.0-processed/df_premises_prot4.pickle'
    # Make sure the target folder exists; to_pickle does not create it.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df_premises.to_pickle(output_path, protocol=4)

df_premises

Unnamed: 0,document,file_path,split,premise,type,logos,pathos,ethos
0,68,v2.0/positive/68.xml,positive,"Like, most people don't memorize all of Romeo ...",logos_pathos,True,True,False
1,68,v2.0/positive/68.xml,positive,"But most people, I think, if they really wante...",logos_pathos,True,True,False
2,68,v2.0/positive/68.xml,positive,I'm 100% on board with trashing the US constit...,logos_pathos,True,True,False
3,68,v2.0/positive/68.xml,positive,I think the American legal system has gotten o...,pathos,False,True,False
4,68,v2.0/positive/68.xml,positive,"By categories, I mean things like: criminal la...",pathos,False,True,False
...,...,...,...,...,...,...,...,...
2067,334,v2.0/negative/334.xml,negative,Perhaps laissez faire capitalism (or of the c...,logos,True,False,False
2068,334,v2.0/negative/334.xml,negative,"In fact, Adam Smith (origin of laissez faire)...",ethos_logos,True,False,True
2069,334,v2.0/negative/334.xml,negative,saying that they did very little to improve t...,ethos_logos,True,False,True
2070,334,v2.0/negative/334.xml,negative,just sit on their land charging whatever damn...,pathos,False,True,False


## Fixed XML files

- positive/1.xml
- positive/70.xml
- positive/74.xml
- positive/84.xml
- positive/85.xml
- negative/289.xml

- Number of premises before the fixes: 1958
- Number of premises after the fixes: 2072 (+114)

Fixes include:
- invalid (unescaped) characters (e.g. a raw `&` instead of `&amp;`)
- multiple root tags
- tags not closed properly