In [82]:
import os
import pickle
import glob
import pandas as pd
import xml.etree.ElementTree as ET

# Construct empty DataFrame
df_sentences = pd.DataFrame(columns=[
    'document', 'file_path', 'split', 'text', 'type'
])

def parse_xml(xml_path):
    tree = ET.parse(xml_path)
    root = tree.getroot()

    documents = []

    for elem in root.iter():
        if elem.tag == "OP" or elem.tag == "reply":
            all_descendants = [b for b in elem.iter() if b is not elem]
            for desc in all_descendants:
                documents.append((desc.text, desc.tag))
    return documents

# Retrieve premises from all XML files
for split in ['positive', 'negative']:
    for path in glob.glob(os.path.join(f'v2.0/{split}', '*.xml')):
        try:
            for premise, argument in parse_xml(path):
                df_sentences = pd.concat([df_sentences, pd.DataFrame({
                    'document': [path.split('/')[2].split('.')[0]],
                    'file_path': [path],
                    'split': [path.split('/')[1]],
                    'text': [premise],
                    'type': [argument],
                })], axis=0, ignore_index=True)
        except ET.ParseError:
            pass

df_sentences.to_pickle('./v2.0-processed/df_sentences.pickle')

In [83]:
df_sentences

Unnamed: 0,document,file_path,split,text,type
0,88,v2.0/positive/88.xml,positive,[47% of all jobs are at risk of being automate...,premise
1,88,v2.0/positive/88.xml,positive,This number will grow grow until the vast majo...,premise
2,88,v2.0/positive/88.xml,positive,Since some of this automation will inevitably ...,premise
3,88,v2.0/positive/88.xml,positive,By fortifying themselves in their gated commun...,premise
4,88,v2.0/positive/88.xml,positive,"Once everyone is dead, they can simply be wipe...",premise
...,...,...,...,...,...
3642,85,v2.0/negative/85.xml,negative,If the urban folk get rid of the rural folk th...,claim
3643,85,v2.0/negative/85.xml,negative,They couldn't just survive together,claim
3644,85,v2.0/negative/85.xml,negative,the ecosystem will not be able to sustain the ...,premise
3645,85,v2.0/negative/85.xml,negative,The rural folk can survive without the urban folk,claim
