In [104]:
import os
import glob
import pandas as pd
import xml.etree.ElementTree as ET

save = True

# Construct empty DataFrame
df_premises = pd.DataFrame(columns=[
    'document', 'file_path', 'split', 'premise', 'type', 'logos', 'pathos', 'ethos'
])

def parse_xml(xml_path):
    tree = ET.parse(xml_path)
    return [(elem.text, elem.attrib['type']) for elem in tree.iter() if elem.tag == 'premise']

# Retrieve premises from all XML files
for split in ['positive', 'negative']:
    for path in glob.glob(os.path.join(f'v2.0/{split}', '*.xml')):
        try:
            for premise, premise_type in parse_xml(path):
                df_premises = pd.concat([df_premises, pd.DataFrame({
                    'document': [path.split('/')[2].split('.')[0]],
                    'file_path': [path],
                    'split': [path.split('/')[1]],
                    'premise': premise,
                    'type': [premise_type],
                    'logos': ['logos' in premise_type],
                    'pathos': ['pathos' in premise_type],
                    'ethos': ['ethos' in premise_type],
                })], axis=0, ignore_index=True)
        except ET.ParseError as e:
            print(f'Could not parse: {path}')
            print(f'{e}\n')
if save:
    df_premises.to_pickle('./v2.0-processed/df_premises.pickle')

df_premises

Unnamed: 0,document,file_path,split,premise,type,logos,pathos,ethos
0,88,v2.0/positive/88.xml,positive,[47% of all jobs are at risk of being automate...,logos,True,False,False
1,88,v2.0/positive/88.xml,positive,This number will grow grow until the vast majo...,logos_pathos,True,True,False
2,88,v2.0/positive/88.xml,positive,Since some of this automation will inevitably ...,logos_pathos,True,True,False
3,88,v2.0/positive/88.xml,positive,By fortifying themselves in their gated commun...,logos_pathos,True,True,False
4,88,v2.0/positive/88.xml,positive,"Once everyone is dead, they can simply be wipe...",logos_pathos,True,True,False
...,...,...,...,...,...,...,...,...
2067,85,v2.0/negative/85.xml,negative,the some few people that still know how to hun...,logos_pathos,True,True,False
2068,85,v2.0/negative/85.xml,negative,Machines don't have money either so they won't...,logos_pathos,True,True,False
2069,85,v2.0/negative/85.xml,negative,Cryptocurrency may help a little because it wo...,logos,True,False,False
2070,85,v2.0/negative/85.xml,negative,once they can administrate their own systems w...,logos,True,False,False


## Fixed XML files

- positive/1.xml
- positive/70.xml
- positive/74.xml
- positive/84.xml
- positive/85.xml
- negative/289.xml

Amount of premises before: 1958
Amount of premises after: 2072 (+ 114)

Fixes include:
- invalid characters (e.g. '&' instead of '&amp;')
- multiple root tags
- tag not closed properly