In [1]:
#from __future__ import unicode_literals
import xml.etree.ElementTree as ET
import spacy
import random
from spacy.util import minibatch, compounding
from pathlib import Path

In [2]:
def parseSuc30ToSpacySimplePos(file):
    """Convert a SUC-style XML corpus into spaCy tagger training tuples.

    Every ``<sentence>`` element found anywhere under the document root
    becomes one training example. Within a sentence, each ``<w>`` element
    contributes its text as a word and its ``msd`` attribute (with ``.``
    separators replaced by ``|``) as the tag.

    Args:
        file: Path (or open file object) of the XML corpus to parse. It is
            passed straight to ``xml.etree.ElementTree.parse``.

    Returns:
        A list of ``(text, {'words': [...], 'tags': [...]})`` tuples, where
        ``text`` is the sentence's words each followed by a single space
        (so it ends with a trailing space).

    Raises:
        OSError: If ``file`` cannot be opened.
        xml.etree.ElementTree.ParseError: If the XML is malformed.
        KeyError: If a ``<w>`` element with text has no ``msd`` attribute.
    """
    # Use the caller-supplied path; previously the argument was ignored in
    # favour of a hard-coded location.
    root = ET.parse(file).getroot()

    TRAIN_DATA = []
    for sentence in root.iter('sentence'):
        words = []
        tags = []
        for word in sentence.iter('w'):
            # An empty <w/> has text None; skip it instead of crashing on
            # string concatenation.
            if word.text is None:
                continue
            words.append(word.text)
            tags.append(word.attrib['msd'].replace('.', '|'))

        # Keep the original "word + space" layout, trailing space included.
        text = ''.join(token + ' ' for token in words)
        TRAIN_DATA.append((text, {'words': words, 'tags': tags}))

    return TRAIN_DATA
        

In [3]:
def trainTagger(TRAIN_DATA, model=None, outputDir=None, nIter=25):
    """Train a spaCy tagger pipe on the given examples and optionally save it.

    Args:
        TRAIN_DATA: Iterable of ``(text, annotations)`` pairs as consumed by
            ``nlp.update``. It is copied before shuffling, so the caller's
            sequence is left in its original order.
        model: Name or path handed to ``spacy.load``. When ``None`` a blank
            ``"sv"`` pipeline is created instead.
        outputDir: Directory (``str`` or ``Path``) to write the trained
            pipeline to. When ``None`` nothing is saved.
        nIter: Number of passes over the training data (default 25).

    Returns:
        None. Per-epoch losses are printed to stdout.
    """
    if model is not None:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("sv")  # create blank Language class
        print("Created blank 'sv' model")

    # Make sure the pipeline has a tagger; put it first when we add one.
    if "tagger" not in nlp.pipe_names:
        nlp.add_pipe(nlp.create_pipe("tagger"), first=True)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "tagger"]

    # Shuffle a private copy: random.shuffle works in place and would
    # otherwise reorder the caller's list (and fail on a tuple).
    examples = list(TRAIN_DATA)

    with nlp.disable_pipes(*other_pipes):  # only train tagger
        # NOTE(review): begin_training() is called even when `model` was
        # loaded; confirm whether an already-trained tagger should be
        # resumed rather than re-initialised.
        optimizer = nlp.begin_training()
        for _ in range(nIter):
            random.shuffle(examples)
            losses = {}
            # Batch up the examples using spaCy's minibatch helper, with the
            # batch-size schedule supplied by compounding(4.0, 32.0, 1.001).
            batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # save model to output directory
    if outputDir is not None:
        # Work with the Path object: a plain string has no exists()/mkdir().
        output_dir = Path(outputDir)
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to ", output_dir)
    

Losses {'tagger': 14623.98728149502}
Losses {'tagger': 12667.47328738841}
Losses {'tagger': 11421.159311390486}
Losses {'tagger': 10471.539729238986}
Losses {'tagger': 9621.710414102236}
Losses {'tagger': 8775.044490039223}
Losses {'tagger': 8224.528262796626}
Losses {'tagger': 7450.294569667707}
Losses {'tagger': 7142.808351896532}
Losses {'tagger': 6656.530497621032}
Losses {'tagger': 6199.323727387997}
Losses {'tagger': 5946.865974176289}
Losses {'tagger': 5660.67521431242}
Losses {'tagger': 5353.00083246531}
Losses {'tagger': 5106.83083895285}
Losses {'tagger': 4899.589639359908}
Losses {'tagger': 4694.74191339925}
Losses {'tagger': 4604.03942746342}
Losses {'tagger': 4285.264890220218}
Losses {'tagger': 4116.974608598131}
Losses {'tagger': 4044.138870714709}
Losses {'tagger': 3768.2594963123256}
Losses {'tagger': 3688.1159035463065}
Losses {'tagger': 3682.376812942355}
Losses {'tagger': 3440.0097550839073}


In [None]:
# Build the training examples, then train the tagger and save it.
# The three path strings are placeholders to be replaced with real locations.
TRAIN_DATA = parseSuc30ToSpacySimplePos(file='pathToSuc30')
trainTagger(
    TRAIN_DATA,
    model='pathToModel',
    outputDir='pathToOutputDir',
)
