In [1]:
from __future__ import unicode_literals
import xml.etree.ElementTree as ET
import spacy
import random
from spacy.util import minibatch, compounding
from pathlib import Path
import re

In [2]:
def parseSuc30ToSpacySimplePos(file, tokenizer=None):
    """Convert a SUC 3.0 XML corpus into spaCy tagger training examples.

    Every ``<sentence>`` element is turned into one ``(text, {'tags': [...]})``
    tuple. ``text`` is the kept words, each followed by a single space; the
    tag list holds each kept word's ``msd`` attribute with ``.`` rewritten
    to ``|`` and ``MAD`` rewritten to ``.``.

    Words are left out of a sentence when their text contains ``:``, ``/``
    or ``'``, when their ``msd`` contains ``+``, or when the ``<w>`` element
    has no text at all. A sentence is dropped entirely when nothing is left
    of it, or when the tokenizer splits ``text`` into a different number of
    tokens than there are tags.

    Args:
        file: Path or file object accepted by ``xml.etree.ElementTree.parse``.
        tokenizer: Callable taking the sentence text and returning a sized
            sequence of tokens. Defaults to the module-level ``nlp`` object,
            which must then exist when this function is called.

    Returns:
        list of ``(text, {'tags': list_of_str})`` tuples.

    Raises:
        KeyError: if a kept ``<w>`` element has no ``msd`` attribute.
    """
    if tokenizer is None:
        # Resolved at call time so the notebook-level pipeline is picked up
        # even when it is created in a later cell than this definition.
        tokenizer = nlp

    TRAIN_DATA = []
    root = ET.parse(file).getroot()

    for sentence in root.iter('sentence'):
        kept_words = []
        tags = []

        for word in sentence.iter('w'):
            token_text = word.text
            # An empty <w/> element has text None; the substring checks
            # below would raise TypeError on it, so skip it up front.
            if token_text is None:
                continue
            if ':' in token_text or '/' in token_text or "'" in token_text:
                continue
            msd = word.attrib['msd']
            if '+' in msd:
                continue
            tags.append(msd.replace('.', '|').replace('MAD', '.'))
            kept_words.append(token_text)

        # Without this guard an empty sentence would pass the length check
        # below (0 == 0) and produce an empty training example.
        if not tags:
            continue

        text = ''.join(kept + ' ' for kept in kept_words)
        doc = tokenizer(text)
        # Keep only sentences whose tokenisation lines up one-to-one with
        # the corpus tags.
        if len(doc) == len(tags):
            TRAIN_DATA.append((text, {'tags': tags}))

    return TRAIN_DATA
        

In [8]:
def trainTagger(TRAIN_DATA, model=None, outputDir=None, n_iter=25, drop=0.2):
    """Train the tagger of the module-level ``nlp`` pipeline, optionally saving it.

    The pipeline is read from the module-level name ``nlp`` and is updated
    in place. A ``tagger`` component is created and added first in the
    pipeline if it is missing; all other components are disabled while
    training runs. The accumulated losses are printed after every epoch.

    Args:
        TRAIN_DATA: Sequence of ``(text, annotations)`` pairs; each batch is
            unzipped and handed to ``nlp.update``. The sequence itself is
            not reordered.
        model: Accepted for backward compatibility but not used; the
            pipeline always comes from the module-level ``nlp``.
        outputDir: Directory (``str`` or ``Path``) to save the pipeline to.
            It is created, including missing parents, when it does not
            exist. Nothing is saved when ``None``.
        n_iter: Number of passes over the training data.
        drop: Dropout rate forwarded to ``nlp.update``.
    """
    if "tagger" not in nlp.pipe_names:
        nlp.add_pipe(nlp.create_pipe("tagger"), first=True)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "tagger"]

    # Shuffle a private copy so the caller's list keeps its original order.
    examples = list(TRAIN_DATA)

    with nlp.disable_pipes(*other_pipes):  # only train tagger
        # NOTE(review): if the loaded pipeline already contains a trained
        # tagger, begin_training() may reset its weights; confirm whether
        # resume_training() is what is wanted here.
        optimizer = nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(examples)
            losses = {}
            # Batch sizes come from spaCy's compounding schedule.
            batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=drop, losses=losses)
            print("Losses", losses)

    # save model to output directory
    if outputDir is not None:
        # Always go through the Path object: outputDir itself may be a plain
        # string, which has neither .exists() nor .mkdir().
        output_dir = Path(outputDir)
        if not output_dir.exists():
            output_dir.mkdir(parents=True)
        nlp.to_disk(output_dir)
        print("Saved model to ", outputDir)
    

In [5]:
# Load the Swedish spaCy pipeline once; the functions defined in this
# notebook read it through the module-level name `nlp`.
SWEDISH_MODEL_DIR = '../../Models/SpacySwedishModel'
nlp = spacy.load(SWEDISH_MODEL_DIR)


In [9]:
# Build tagger training examples from the SUC 3.0 training split, then run
# tagger training with the model directory passed as both `model` and
# `outputDir`.
suc_train_xml = '../../Data/Datasets/xml/suc3train.xml'
swedish_model_dir = '../../Models/SpacySwedishModel'

TRAIN_DATA = parseSuc30ToSpacySimplePos(suc_train_xml)

trainTagger(TRAIN_DATA, model=swedish_model_dir, outputDir=swedish_model_dir)

Losses {'tagger': 79739.24086070061}
Losses {'tagger': 54305.30455170572}
Losses {'tagger': 47417.51377936825}
Losses {'tagger': 43079.36940412782}
Losses {'tagger': 40059.01021840051}
Losses {'tagger': 37687.91819554521}
Losses {'tagger': 35364.03194290213}
Losses {'tagger': 33815.38570255117}
Losses {'tagger': 32340.20174273057}
Losses {'tagger': 31284.260467929416}
Losses {'tagger': 30081.50088546425}
Losses {'tagger': 29151.41229553765}
Losses {'tagger': 28157.150802778757}
Losses {'tagger': 27401.383220454532}
Losses {'tagger': 26503.349095831276}
Losses {'tagger': 25860.21192849087}
Losses {'tagger': 25472.729831750738}
Losses {'tagger': 24968.859341716423}
Losses {'tagger': 24333.463596730144}
Losses {'tagger': 23833.918665286506}
Losses {'tagger': 23159.60243100807}
Losses {'tagger': 23090.96203229057}
Losses {'tagger': 22486.90328554903}
Losses {'tagger': 22033.832355019804}
Losses {'tagger': 21736.637365559407}


In [11]:
# Smoke test: run the pipeline on a short Swedish text and print the tag
# assigned to every token, one per line.
sample_text = 'Hej jag heter Erik. Jag har just tränat hårt.'
doc = nlp(sample_text)
for token in doc:
    print(token.tag_)


IN
PN|UTR|SIN|DEF|SUB
VB|PRS|AKT
PM|NOM
MID
PN|UTR|SIN|DEF|SUB
VB|PRS|AKT
AB
VB|SUP|AKT
AB|POS
MID
