In [1]:
from xml.dom import minidom
import csv
import pandas as pd
import xlsxwriter


# Structure: 
`
tp-example
   topicid
   sourcefile
   title
   ilexample
       sod_ex_index
       sod_judgment
       sod_bookmarks
           sod_bookmark
       sod_generalList
           innerExample
               [sod_ex_index]  (subnumbering, e.g. 1a)
               wordgroup
                   sod_wg_w
                       sod_judgment
                       sod_ex_index
                       sod_categorialfeature
               exampleComment 
`
## Alternative structure

`
tp-example
   topicid
   sourcefile
   title
   ilexample
       sod_ex_index
       sod_judgment
       sod_bookmarks
           sod_bookmark
       sod_generalList
           innerExample
               [sod_ex_index]  (subnumbering, e.g. 1a)
               wordgroup
                   lexterm
                       word
`

In [2]:
filename = "tp_publ_examples_nl_syn_feb22.xml" #"test.xml" #"tp_publ_examples_nl_syn_feb22.xml"

In [3]:
class Sentence:
    def __init__(self, sentence, judgement, sourcefile, title, ex_number):
        self.sentence = sentence
        self.judgement = judgement
        self.sourcefile = sourcefile
        self.title = title
        self.ex_number = ex_number
    
    def has_questionmark_judgement(self):
        return '?' in self.judgement
    
    def __repr__(self):
        return "%s %s, which is nr. %s from file %s with title %s" % (self.judgement, self.sentence, self.ex_number, self.sourcefile, self.title)

In [4]:
def write_sentences_to_csv(filename, sentences):
    with open(filename, 'w', encoding='UTF8', newline='') as f:
        writer = csv.writer(f)
        
        # write the header
        writer.writerow(['judgment','sentence', 'examplenumber', 'title', 'sourcefile'])
        
        for sentence in sentences:
            writer.writerow([sentence.judgement, sentence.sentence, sentence.ex_number, sentence.title, sentence.sourcefile])

def write_sentences_to_xlsx(filename, sentences):
    with open(filename, 'w', encoding='UTF8', newline='') as f:
        workbook = xlsxwriter.Workbook(filename)
        worksheet = workbook.add_worksheet()

               
        # write the header
        worksheet.write(0, 0, 'judgment')
        worksheet.write(0, 1, 'sentence')
        worksheet.write(0, 2, 'examplenumber')
        worksheet.write(0, 3, 'title')
        worksheet.write(0, 4, 'sourcefile')
        
        for i, sentence in enumerate(sentences):
            worksheet.write(i+1, 0, sentence.judgement)
            worksheet.write(i+1, 1, sentence.sentence)
            worksheet.write(i+1, 2, sentence.ex_number)
            worksheet.write(i+1, 3, sentence.title)
            worksheet.write(i+1, 4, sentence.sourcefile)
        
        workbook.close()

In [12]:
def add_word_to_string(string, word):
    whitespace_like = [' ', '/', '*', '\n', '\t', '?']
    if len(string) == 0:
        string += word
    elif string[-1] in whitespace_like:
        string += word
    else:
        string += ' ' + word
    return string

def clean_sentence(sentence):
    sentence = sentence.replace('\n', '')
    sentence = sentence.replace('\t', ' ')
    sentence = sentence.replace('  ', ' ')
    sentence = sentence.replace('                     ', ' ')
    sentence = sentence.replace('    ', ' ')
    return sentence


def append_word(word, sentence, judgement):
    if word.nodeType == word.TEXT_NODE:
        sentence = add_word_to_string(sentence, word.data.strip())
    else:
        if word.nodeName == 'sod_judgment':
            judgement = add_word_to_string(judgement, word.firstChild.data.strip())
            sentence = add_word_to_string(sentence, word.firstChild.data.strip())
        else:
            sentence = add_word_to_string(sentence,  word.toprettyxml())
        
        '''
        elif word.nodeName == 'sod_emphasisitalics':
            ### TODO TODO TODO add something 
            if word.firstChild.nodeType == word.TEXT_NODE:
                sentence = add_word_to_string(sentence, '<i>' + word.firstChild.data.strip() + '</i>')
            else:
                for word2 in word.firstChild.data:
                    sentence = add_word_to_string(sentence, '<w>' + word2.firstChild.data.strip() + '</w>')
        elif word.nodeName == 'u':
            sentence = add_word_to_string(sentence, '<u>' + word.firstChild.data.strip() + '</u>')
        '''
    return sentence, judgement


def parse_xml_file(filename):
    f = minidom.parse(filename)
    ex = f.getElementsByTagName('examples')[0]
    examples = ex.getElementsByTagName('tp-example')
    

    problems = 0
    sentences = []
    
    for example in examples:

        try: 
            sourcefile = example.getElementsByTagName('sourcefile')[0].firstChild.data
            title = example.getElementsByTagName('title')[0].firstChild.toprettyxml()
            ilexamples = example.getElementsByTagName('ilexample')
            for ilexample in ilexamples:
                
                '''
                judgement = ""
                judgements = ilexample.getElementsByTagName('sod_judgment')

                if len(judgements)>0:
                    judgement = add_word_to_string(judgement, judgements[0].firstChild.data)
                '''
                try: 
                    il_ex_nr = ilexample.getElementsByTagName('sod_ex_index')[0].firstChild.data
                except:
                    il_ex_nr = ""
                sod_generalists = ilexample.getElementsByTagName('sod_generalList')
                for sod_generalist in sod_generalists:
                    inner_examples = sod_generalist.getElementsByTagName('innerExample')

                    for inner_ex in inner_examples:
                        try: 
                            in_ex_nr = inner_ex.getElementsByTagName('sod_ex_index')[0].firstChild.data
                        except:
                            in_ex_nr = ""
                        
                        #print(inner_ex)
                        wordgroups = inner_ex.getElementsByTagName('wordgroup')
                        for wordgroup in wordgroups:
                            sod_wg_w = wordgroup.getElementsByTagName('sod_wg_w')
                            sentence = ""
                            judgement = ""
                            for words in sod_wg_w:
                                for word in words.childNodes:
                                    #print(word.toprettyxml())
                                    sentence, judgement = append_word(word, sentence, judgement)
                
                            lexterms = wordgroup.getElementsByTagName('lexterm')
                            for lexterm in lexterms:
                                words = wordgroup.getElementsByTagName('word')
                                for word in words:
                                    sentence = add_word_to_string(sentence, word.toprettyxml())
                            sentences.append(Sentence(clean_sentence(sentence), judgement, sourcefile, title, il_ex_nr + in_ex_nr))
        except Exception as e:
            print(example.toprettyxml())
            print(str(e))
            problems += 1
            #print(example.toprettyxml())
        #print(wordgroup.toprettyxml())
    if problems>0:
        print("Encountered: " + str(problems) + " problem sentences out of " + str(len(examples)) + " sentences")
    return sentences



In [15]:
sentences = parse_xml_file(filename)
print(sentences)
write_sentences_to_xlsx('sentences.xlsx', sentences)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [16]:
def filter_sentences(sentences):
    filtered_sentences = []
    for sentence in sentences:
        if sentence.has_questionmark_judgement():
            filtered_sentences.append(sentence)
    return filtered_sentences

filtered_sentences = filter_sentences(sentences)
write_sentences_to_xlsx('sentences_filtered.xlsx', filtered_sentences)

`<innerExample id="vp.3.3.3.0073">
<sod_ex_index>c.</sod_ex_index>
<wordgroup><sod_wg_w id="w.502.c.0">Het<sub>i</sub> </sod_wg_w><sod_wg_w id="w.502.c.1">krioelt </sod_wg_w><sod_wg_w id="w.502.c.2">[in de tuin]<sod_ex_index>i</sod_ex_index> </sod_wg_w><sod_wg_w id="w.502.c.3">van de mieren.</sod_wg_w></wordgroup>
<gloss><sod_gl_w id="g.502.c.0">it </sod_gl_w><sod_gl_w id="g.502.c.1">crawls </sod_gl_w><sod_gl_w id="g.502.c.2">in the garden </sod_gl_w><sod_gl_w id="g.502.c.3">of the ants</sod_gl_w></gloss>
</innerExample>`

sod_ex_index in the most inner structure simply to indicate subscript i and not used for actual example numbering.