In [1]:
import spacy
import random
from spacy.util import minibatch, compounding

In [2]:
def iobToSpacySimple(iobDataset):
    
    TRAIN_DATA = []
    
    with open(iobDataset, 'r') as fp:
        for line in fp:    
            wordTagPairsList = []
            
            wordTagStringList = line.split(' ')[:-1]
            
            for wordTagString in wordTagStringList:
                wordTagPairList = wordTagString.split('|')
                wordTagPairsList.append(wordTagPairList)
            
            entityStartIndex = 0
            entityDict = {}
            entityList = []
            sentText = ''
            currentIndex = 0
            
            for index, pair in enumerate(wordTagPairsList):
                
                if pair[1].startswith('B'):
                    entityEndIndex = currentIndex + len(pair[0])
                    entityStartIndex = currentIndex
                    
                    for pairInner in wordTagPairsList[index + 1:]:
                        if pairInner[1].startswith('O'):
                            break
                        
                        entityEndIndex += (len(pairInner[0]) + 1)
                    
                    entityTuple = (entityStartIndex, entityEndIndex, pair[1][2:])
                    entityList.append(entityTuple) 
                
                sentText += pair[0] + ' '             
                currentIndex += len(pair[0]) + 1
                
            entityDict['entities'] = entityList
            trainTuple = (sentText, entityDict)
            TRAIN_DATA.append(trainTuple)
    
    #Remove list elemens in TRAIN_DATA with overlapping entities
    overlappingIndexes = []
    for index, tup in enumerate(TRAIN_DATA):
        endIndex = -1
        _, annotation = tup
        for ent in annotation.get('entities'):
            if (ent[0] < endIndex):
                overlappingIndexes.append(index)
            endIndex = ent[1]
    indexAdjustment = 0        
    for ind in overlappingIndexes:
        TRAIN_DATA.pop(ind - indexAdjustment)
        indexAdjustment += 1
    return TRAIN_DATA

In [3]:
def trainNer(TRAIN_DATA, pathToModel, iterations):
    nlp = spacy.load(pathToModel)
    
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    
    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            ner.add_label(ent[2])
    
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        nlp.begin_training()
        
        for itn in range(iterations):
            print("Statring iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            
            print(losses)
            
    return nlp
        

In [4]:
TRAIN_DATA = iobToSpacySimple('../Data/Datasets/IOB2/SUC30IOB2SelTags')
nlp = trainNer(TRAIN_DATA, '../SpacyModels/SpacySwedishFastText', 20)

Statring iteration 0
{'ner': 40644.568929531335}
Statring iteration 1
{'ner': 29286.383331312856}
Statring iteration 2
{'ner': 25552.08485580275}
Statring iteration 3
{'ner': 23370.905696283728}
Statring iteration 4
{'ner': 21776.90197192377}
Statring iteration 5
{'ner': 20485.846212104258}
Statring iteration 6
{'ner': 19686.237747219202}
Statring iteration 7
{'ner': 18708.36847462311}
Statring iteration 8
{'ner': 17941.87900652521}
Statring iteration 9
{'ner': 17344.94272549692}
Statring iteration 10
{'ner': 16826.886470155318}
Statring iteration 11
{'ner': 16509.54876476673}
Statring iteration 12
{'ner': 15995.811827717605}
Statring iteration 13
{'ner': 15668.082881319258}
Statring iteration 14
{'ner': 15345.825546561313}
Statring iteration 15
{'ner': 14964.826534402462}
Statring iteration 16
{'ner': 14931.168863285906}
Statring iteration 17
{'ner': 14357.246868252203}
Statring iteration 18
{'ner': 14341.793864674208}
Statring iteration 19
{'ner': 14098.764572210024}


In [5]:
nlp.to_disk('../SpacyModels/SpacySwedishFastText')

In [6]:
doc = nlp('Våla härad omfattade socknarna Östervåla, Nora, Harbo och Huddunge.Häradsrätten upphörde vid utgången av år 1887 och uppgick i Västmanlands östra domsagas häradsrätt.')
for ent in doc.ents:
    print(ent)

Östervåla
Nora
Harbo
Huddunge
år 1887
Västmanlands
