Tesina de bachelor: filologia hispánica

# Collect filenames
This part is concerned with collecting all the filenames of the relevant files

In [3]:
import os
import fnmatch

In [4]:
def find_files(root_folder='../BVC'):
    """
    Code from stack overflow: https://stackoverflow.com/questions/2186525/how-to-use-glob-to-find-files-recursively
    """
    
    matches = []
    for root, dirnames, filenames in os.walk(root_folder):
        for filename in fnmatch.filter(filenames, '*.xml'):
            matches.append(os.path.join(root, filename))
    return matches

In [8]:
print(find_files())
files = find_files()

['../BVC/8784.xml', '../BVC/449.xml', '../BVC/461.xml', '../BVC/164.xml', '../BVC/458.xml', '../BVC/8700.xml', '../BVC/23.xml', '../BVC/8781.xml', '../BVC/test.xml', '../BVC/138.xml', '../BVC/457.xml', '../BVC/8778.xml', '../BVC/8648.xml', '../BVC/8677.xml', '../BVC/5412.xml', '../BVC/1047.xml', '../BVC/8787.xml', '../BVC/90.xml', '../BVC/8709.xml', '../BVC/8703.xml', '../BVC/103.xml', '../BVC/8786.xml', '../BVC/8675.xml', '../BVC/8455.xml', '../BVC/1313.xml', '../BVC/8108.xml', '../BVC/455.xml', '../BVC/8699.xml', '../BVC/10107.xml', '../BVC/8695.xml', '../BVC/8788.xml', '../BVC/8697.xml', '../BVC/8696.xml', '../BVC/8650.xml', '../BVC/8652.xml', '../BVC/6837.xml', '../BVC/8676.xml', '../BVC/8651.xml', '../BVC/8643.xml', '../BVC/8776.xml', '../BVC/1708.xml', '../BVC/8681.xml', '../BVC/8698.xml', '../BVC/8789.xml', '../BVC/8701.xml', '../BVC/453.xml', '../BVC/456.xml', '../BVC/184.xml', '../BVC/463.xml', '../BVC/462.xml', '../BVC/8678.xml', '../BVC/8692.xml', '../BVC/1011.xml', '../BVC/

# Parse xml file
This part is concerned with extracting the relevant information from the xml file

In [9]:
class Sentence:
    def __init__(self, year, title, author, original_text, modern_text, nlp):
        '''
        year: the year in which the sentence was written
        text_title: title of the text
        original_text: the complete original text
        modern_text: the modern text with fixed spelling
        tagged_text: the nlp tagged text
        '''
        self.year = year
        self.text_title = title
        self.author = author
        self.original_text = original_text
        self.modern_text = modern_text
        self.tagged_text = nlp(modern_text)
        
        self.set_has_object()
        self.set_has_object_with_a()
        
    
    def __cmp__(self, other):
        return cmp(self.year, other.year)
    
    def __repr__(self):
        return "%s (%s) [%s, %s, %s]" % (self.original_text, self.modern_text, self.text_title, self.author, self.year)
    
    def display_tagged_text(self):
        displacy.render(self.tagged_text, style='dep', jupyter = True, options = {'distance': 120})
    
    # ---------------------------------------------
    # Setters
    
    def set_has_object(self):
        for w in self.tagged_text:
            if w.dep_ == 'obj':
                self.has_object = True
                return
        self.has_object = False
    
    def set_has_object_with_a(self):
        for w in self.tagged_text:
            if w.dep_ == 'obj':
                for v in self.tagged_text:
                    #print("Text ", v.text)
                    if str(v.head) == w.text:
                        self.has_object_with_a = (v.text == 'a' or v.text == 'al')
                        return
        self.has_object_with_a = False
    
    def set_info(self):
        if not self.has_object:
            self.verb = None
            self.determinacy = None
            self.animacy = None
            self.position_object = None
        else:
            return # See stappenplan below
    
    
    
    # --------------------------------------------
    # Getters
    
    def get_has_object():
        return self.has_object()

In [7]:

class MyTest(unittest.TestCase):
    def test_has_object_with_a(self):
        '''
        Tests the has_object_with_a function with different examples
        Also used to see if something breaks after adjusting
        '''
    
        #one syllable word, with bridge
        self.assertEqual([True], build_syllable_representation('ˈbu̠t͡s'))
        

NameError: name 'unittest' is not defined

In [10]:
from xml.dom import minidom

import lxml.etree as etree
from spacy import displacy



In [11]:
import es_core_news_sm
nlp = es_core_news_sm.load()

In [12]:
def fix_sentence(sentence):
    '''
    Parser reads things like "v [ uest ] ros"
    We try to fix that to help with the POSing later
    '''
    s = sentence.replace(' [ ', '')
    s = s.replace(' ] ', '')
    return s
    

def parse_file(filename, nlp):
    f = minidom.parse(filename)
    
    sentence_ending_chars = ["."]
    
    root = f.getElementsByTagName('TEI')[0]
    bibl = root.getElementsByTagName('teiHeader')[0].getElementsByTagName('fileDesc')[0].getElementsByTagName('sourceDesc')[0].getElementsByTagName('bibl')[0]
    year = bibl.getElementsByTagName('date')[0].firstChild.data
    title = bibl.getElementsByTagName('title')[0].firstChild.data
    author = bibl.getElementsByTagName('author')[0].firstChild.data
    
    session = root.getElementsByTagName('text')[0].getElementsByTagName('body')[0].getElementsByTagName('div')[0].getElementsByTagName('ab')[0]
    sentence_original = ""
    sentence_modern = ""
    sentences = []
    #print(session.toprettyxml())
    for word in session.getElementsByTagName('*'):
        if word.tagName in ['pc', 'w', 'c']:
            #print(word.toprettyxml())
            if len(word.getElementsByTagName('choice'))<=0:
                orig = word.firstChild.data
                reg = orig
            else: 
                choice = word.getElementsByTagName('choice')[0]
                orig = choice.getElementsByTagName('orig')[0].firstChild.data
                reg = choice.getElementsByTagName('reg')[0].firstChild.data
            if word.tagName == "pc" and orig != ",": # in sentence_ending_chars:
                sentence_original += orig
                sentence_modern += reg
                sentence_modern = fix_sentence(sentence_modern)
                sentences.append(Sentence(year, title, author, sentence_original, sentence_modern, nlp))
                sentence_original = ""
                sentence_modern = ""
            elif not reg.isupper() and reg != 'A': #we do not do upper case ones because those are names in plays
                sentence_original += " " + orig
                sentence_modern += " " + reg
    return sentences

#print(parse_file("../BVC/10107.xml", nlp))

In [13]:
test_file = parse_file("../BVC/10107.xml", nlp)

In [16]:
print(test_file[34])
test_file[34].display_tagged_text()
print(test_file[34].has_object)

 Con dos mugeres intenta ir a caza de leones. ( con dos mujeres intenta ir a caza de leones.) [Comedia del Príncipe Ynocente, Lope de Vega, 1590]


False


In [17]:
def collect_object_sentences(sentences):
    '''
    sentences: list of sentences (so list of Sentence type objects)
    
    returns: list of sentences that contain an object
    '''
    object_sentences = []
    object_sentences_a = []
    for sentence in sentences:
        if sentence.has_object:
            object_sentences.append(sentence)
            
        if sentence.has_object_with_a:
            object_sentences_a.append(sentence)
    return object_sentences, object_sentences_a

In [18]:
# Interesting practice sentence: Muélanle a palos al secretario

obj_sen, obj_sen_a = collect_object_sentences(test_file)
print(len(obj_sen), len(obj_sen_a))
print(obj_sen_a[34])
obj_sen_a[34].display_tagged_text()

880 70
 por qué quise , y se le di tan libremente a mi hermana? ( por qué quise , y se le di tan libremente a mi hermana?) [Comedia del Príncipe Ynocente, Lope de Vega, 1590]


In [19]:
for s in obj_sen_a:
    print(s)

 [ ] Triunfe el suecio arrogante que me ha quitado mi reyno , que en mi mismo valor reyno al que perdí semejante; (] Triunfe el suecio arrogante que me ha quitado mi reino , que en mi mismo valor reino al que perdí semejante;) [Comedia del Príncipe Ynocente, Lope de Vega, 1590]
 PAJE Pues remítome a la prueva. ( paje pues remítome a la prueva.) [Comedia del Príncipe Ynocente, Lope de Vega, 1590]
 [ ] Vós , con mucha razón , fuérades digno de aquese pensamiento , pareciendo en forma humana a Júpiter divino; (] Vós , con mucha razón , fuérais digno de aquese pensamiento , pareciendo en forma humana a Júpiter divino;) [Comedia del Príncipe Ynocente, Lope de Vega, 1590]
 Hijas , humildes son v [ uest ] ros estados para tan grande Príncipe , que apenas pueden aposentar a sus criados; ( hijas , humildes son vuestros estados para tan grande príncipe , que apenas pueden aposentar a sus criados;) [Comedia del Príncipe Ynocente, Lope de Vega, 1590]
 [ ] Darán posada a la gente del Príncipe , y a

# Things that would be nice to add:
- source year besides just year in which text was written
- further data cleaning
- match xml annotation with pos tagger annotation

Stappenplan
- Googlen of er code is die alleen substukjes van zinnen pakt
- Als niet zelf recursieve functie maken die alleen ww (wat obj heeft) en alles wat eronder hangt pakt. Mss alleen totdat er een nieuw werkwoord bijkomt? 
- Daarmee dan bepalen waar ww staat en waar obj staat (links of rechts van ww)
- Zelfde manier code maken die alleen object en alles wat daaronder hangt pakt
- Daarmee bepalen wat de determinancy is (en evt animacy)