In [13]:
import os
import pickle
import glob
import pandas as pd
import xml.etree.ElementTree as ET

# Construct empty DataFrame
df_sentences = pd.DataFrame(columns=[
    'document', 'file_path', 'split', 'text', 'type'
])

def parse_xml(xml_path):
    tree = ET.parse(xml_path)
    root = tree.getroot()

    documents = []

    for elem in root.iter():
        if elem.tag == "OP" or elem.tag == "reply":
            all_descendants = [b for b in elem.iter() if b is not elem]
            for desc in all_descendants:
                documents.append((desc.text, desc.tag))
    return documents

# Retrieve premises from all XML files
for split in ['positive', 'negative']:
    for path in glob.glob(os.path.join(f'v2.0/{split}', '*.xml')):
        try:
            for premise, argument in parse_xml(path):
                df_sentences = pd.concat([df_sentences, pd.DataFrame({
                    'document': [path],
                    'file_path': [path],
                    'split': [path.split("/")[1]],
                    'text': [premise],
                    'type': [argument],
                })], axis=0, ignore_index=True)
        except ET.ParseError:
            pass

df_sentences.to_pickle('./v2.0-processed/df_sentences.pickle')

In [14]:
df_sentences

Unnamed: 0,document,file_path,split,text,type
0,v2.0/positive\0.xml,v2.0/positive\0.xml,positive\0.xml,I point out that many Christians follow the bi...,claim
1,v2.0/positive\0.xml,v2.0/positive\0.xml,positive\0.xml,"For example, Congregationalists and Universali...",premise
2,v2.0/positive\0.xml,v2.0/positive\0.xml,positive\0.xml,my belief is that while religion can inform th...,claim
3,v2.0/positive\0.xml,v2.0/positive\0.xml,positive\0.xml,I think that most Jewish people don't want to ...,claim
4,v2.0/positive\0.xml,v2.0/positive\0.xml,positive\0.xml,Your stance relies on the assumption that rel...,claim
...,...,...,...,...,...
3642,v2.0/negative\99.xml,v2.0/negative\99.xml,negative\99.xml,"had to take his keys from him, spent 20 minute...",premise
3643,v2.0/negative\99.xml,v2.0/negative\99.xml,negative\99.xml,He seemed ok appart from the shock and high le...,premise
3644,v2.0/negative\99.xml,v2.0/negative\99.xml,negative\99.xml,I work in a surgery unit,premise
3645,v2.0/negative\99.xml,v2.0/negative\99.xml,negative\99.xml,the kind of gore I witness there doesn't shock...,premise
