In [1]:
import spacy
import pickle
import neuralcoref.train.document as dc
from xml.etree import ElementTree
import pandas as pd
from transformers import pipeline
from Coref.KBBert.SweBertNer import SweBertNer


Loads the Swedish coref spans

In [2]:
# Load the pickled coref spans. Later cells iterate the result as
# document part -> cluster -> mention, and treat a mention whose first
# element is -1 as one to skip.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only use this with a file from a trusted source.
with open('pickleObjects/corefSpansObject', 'rb') as f:
    corefSpansDocument = pickle.load(f)

Read the Swedish documents into parts of 20 lines each

In [3]:
def readIntoParts(textFile, linesPerPart=20, keepRemainder=False, encoding='utf-8'):
    """Split a text file into consecutive parts of ``linesPerPart`` lines.

    Args:
        textFile: Path of the text file to read.
        linesPerPart: Number of lines in every part (default 20).
        keepRemainder: When True, trailing lines that do not fill a whole
            part are returned as a final, shorter part. The default (False)
            drops them, which is what this function has always done.
        encoding: Text encoding used to read the file. It is set explicitly
            so that non-ASCII characters do not depend on the platform's
            default locale.

    Returns:
        A list of strings; each string is the concatenation of the lines
        (line endings included) that make up one part.

    Raises:
        ValueError: If ``linesPerPart`` is smaller than 1.
    """
    if linesPerPart < 1:
        raise ValueError('linesPerPart must be at least 1')

    sweParts = []
    pendingLines = []
    with open(textFile, encoding=encoding) as f:
        for line in f:
            pendingLines.append(line)
            if len(pendingLines) == linesPerPart:
                # Join once per part instead of growing a string line by line.
                sweParts.append(''.join(pendingLines))
                pendingLines = []

    if keepRemainder and pendingLines:
        sweParts.append(''.join(pendingLines))
    return sweParts

Loads the Swedish spaCy model and replaces its NER component with the transformer-based Swedish BERT NER

In [4]:
def loadModel(model, nerModel='KB/bert-base-swedish-cased-neriob'):
    """Load a spaCy model and swap its NER step for the transformer-based one.

    Args:
        model: Name or path passed to ``spacy.load``.
        nerModel: Identifier used for both the model and the tokenizer of the
            transformers ``ner`` pipeline.

    Returns:
        The loaded spaCy pipeline with a ``SweBertNer`` component added.
    """
    nlp = spacy.load(model)
    nlpTrans = pipeline('ner', model=nerModel, tokenizer=nerModel)

    # remove_pipe raises if the component is absent, so only remove the
    # built-in NER when the loaded model actually ships one.
    if 'ner' in nlp.pipe_names:
        nlp.remove_pipe('ner')

    trans = SweBertNer(nlpTrans, nlp)
    nlp.add_pipe(trans)
    return nlp

Process the Swedish parts

In [5]:
def processDocs(nlp, sweParts):
    """Apply ``nlp`` to every part, in order, and return the results as a list."""
    return [nlp(partText) for partText in sweParts]

Translates the Swedish coref spans to valid coref annotations in the Swedish
documents by extracting the max mention from the spans.

In [6]:
def _lineBounds(doc, mentionStart, mentionEnd):
    """Return (start, end) token indices of the line(s) enclosing a mention.

    ``start`` is the first token after the preceding newline token (or 0 when
    the mention sits on the first line); ``end`` is the index of the next
    newline token at or after ``mentionEnd`` (or ``len(doc)`` when there is
    none), so ``range(start, end)`` covers the line without its newline.
    """
    start = mentionStart
    while start > 0 and doc[start].text != '\n':
        start -= 1
    # Step past the newline only if we actually stopped on one; when the walk
    # reached the start of the document, token 0 belongs to the line.
    if doc[start].text == '\n':
        start += 1

    end = mentionEnd
    docLength = len(doc)
    while end < docLength and doc[end].text != '\n':
        end += 1
    return start, end


def translateCorefs(nlp, pdocs):
    """Turn projected coref spans into mention spans found in the Swedish docs.

    Reads the module-level ``corefSpansDocument`` (one entry per document
    part, each a list of clusters of mentions) and pairs it with ``pdocs``.
    For every mention whose first element is not -1, the enclosing line is
    re-parsed with ``nlp``, candidate mentions are extracted with
    ``dc.extract_mentions_spans``, and the longest candidate lying inside the
    projected span is kept.

    Args:
        nlp: Callable pipeline used to parse the text of a line.
        pdocs: Processed documents, one per part, indexable by token position.

    Returns:
        One list per part; each holds the clusters that still have more than
        one distinct mention, and each cluster is a sorted list of
        ``(start, end)`` tuples in part-level token indices.
    """
    corefSpansSweDocument = []

    for corefSpans, doc in zip(corefSpansDocument, pdocs):
        # Several mentions usually share a line; parse each line only once.
        mentionsByLine = {}
        corefSpansSwe = []

        for clusterSub in corefSpans:
            corefClusterSwe = []

            for ment in clusterSub:
                if ment[0] == -1:
                    continue

                startOfLine, endOfLine = _lineBounds(doc, ment[0], ment[-1])
                lineKey = (startOfLine, endOfLine)
                if lineKey not in mentionsByLine:
                    # Assumes that re-tokenizing the space-joined tokens gives
                    # the same token boundaries as the part-level document, so
                    # line offsets can be shifted by startOfLine — TODO confirm.
                    lineText = ' '.join(
                        doc[tok].text for tok in range(startOfLine, endOfLine)
                    ).rstrip()
                    lineDoc = nlp(lineText)
                    mentionsByLine[lineKey] = list(dc.extract_mentions_spans(lineDoc, []))

                mentionsInSpan = [
                    span for span in mentionsByLine[lineKey]
                    if span.start + startOfLine >= ment[0]
                    and span.end + startOfLine <= ment[-1]
                ]
                if not mentionsInSpan:
                    continue

                # max() returns the first of equally long candidates.
                maxSpan = max(mentionsInSpan, key=lambda span: span.end - span.start)
                corefClusterSwe.append(
                    [maxSpan.start + startOfLine, maxSpan.end + startOfLine]
                )

            # De-duplicate before the size check so that a cluster made up of
            # identical spans is not kept as a singleton.
            uniqueMentions = sorted({tuple(sorted(sub)) for sub in corefClusterSwe})
            if len(uniqueMentions) > 1:
                corefSpansSwe.append(uniqueMentions)

        corefSpansSweDocument.append(corefSpansSwe)
    return corefSpansSweDocument

Extract speakers for each token in the documents

In [13]:
def extractSpeakers(tokenizer, sweParts, sweDocXml, linksXml, linesPerPart=18):
    """Map every token position of every part to the speaker of its line.

    Args:
        tokenizer: Callable that takes a line of text and returns a sized
            sequence of tokens.
        sweParts: List of part strings; lines are separated by ``'\\n'``.
        sweDocXml: Path of the XML file whose ``SPEAKER`` elements (attribute
            ``NAME``) contain ``s`` elements (attribute ``id``).
        linksXml: Path of the XML file whose ``link`` elements carry an
            ``xtargets`` attribute of the form ``"<source ids>;<target ids>"``.
            The n-th link is treated as line n of the text.
        linesPerPart: Stride used to turn (part index, line index) into a
            global line number. NOTE(review): the default of 18 is the value
            this function has always used, but ``readIntoParts`` builds parts
            of 20 lines — confirm which one is intended and pass it explicitly.

    Returns:
        One dict per part, mapping token index within the part to the speaker
        name (spaces replaced by underscores) or ``None`` when the line has no
        known speaker.
    """
    # Sentence id -> speaker name.
    dictSpeakerSents = {}
    for speaker in ElementTree.parse(sweDocXml).getroot().iter('SPEAKER'):
        for sent in speaker.iter('s'):
            dictSpeakerSents[sent.attrib['id']] = speaker.attrib['NAME'].replace(' ', '_')

    # Global line number -> speaker name. A line gets a speaker only when
    # every linked sentence has one; the last linked sentence decides.
    dictSpeakerLines = {}
    links = ElementTree.parse(linksXml).getroot().iter('link')
    for lineNumber, link in enumerate(links):
        sweLinks = link.attrib['xtargets'].split(';')[1].split()
        if sweLinks and all(sentId in dictSpeakerSents for sentId in sweLinks):
            dictSpeakerLines[lineNumber] = dictSpeakerSents[sweLinks[-1]]

    dictSpeakerTokensDocument = []
    for partIndex, swePart in enumerate(sweParts):
        dictSpeakerTokens = {}
        tokenIndex = 0

        # split('\n') leaves one trailing piece after the last newline; it is
        # dropped, and the newline is restored on the remaining lines.
        for lineIndex, line in enumerate(swePart.split('\n')[:-1]):
            lineLength = len(tokenizer(line + '\n'))
            lineSpeaker = dictSpeakerLines.get(partIndex * linesPerPart + lineIndex)
            for offset in range(lineLength):
                dictSpeakerTokens[tokenIndex + offset] = lineSpeaker
            tokenIndex += lineLength

        dictSpeakerTokensDocument.append(dictSpeakerTokens)

    return dictSpeakerTokensDocument

Creates a coref annotation dict to be stored in the dataframes

In [8]:
def createCorefAnnotDict(corefSpansSweDoc):
    """Build, per document part, a dict of coref bracket tags keyed by token index.

    Clusters are numbered from 1 within each part. A one-token mention
    contributes ``(id)`` at its token; a longer mention contributes ``(id`` at
    its first token and ``id)`` at its last token. Tags that land on the same
    token are joined with ``|``. Keys are token indices as strings.
    """
    def addTag(tags, tokenKey, tag):
        # Append to an existing annotation, or start a new one.
        existing = tags.get(tokenKey)
        tags[tokenKey] = tag if existing is None else existing + '|' + tag

    corefDictsDocument = []
    for partClusters in corefSpansSweDoc:
        tags = {}
        for clusterNumber, cluster in enumerate(partClusters, start=1):
            label = str(clusterNumber)
            for mention in cluster:
                firstToken = mention[0]
                endToken = mention[1]
                if endToken - firstToken == 1:
                    addTag(tags, str(firstToken), '(' + label + ')')
                    continue
                addTag(tags, str(firstToken), '(' + label)
                addTag(tags, str(endToken - 1), label + ')')
        corefDictsDocument.append(tags)
    return corefDictsDocument

Creates dataframes with coref annotations, speakers, token IDs and part IDs
for each of the document parts

In [9]:
def createDataframes(pdocs, corefAnnotDicts, dictSpeakerDocs, docName='ep-07-05-23-006'):
    """Build one dataframe per document part with one row per non-newline token.

    Args:
        pdocs: Processed documents, indexable by part number; each must
            support ``len()``, iteration over tokens (``.i``, ``.text``,
            ``.tag_``) and ``.sents``.
        corefAnnotDicts: Per part, a dict from token index (as string) to the
            coref annotation string.
        dictSpeakerDocs: Per part, a dict from token index (int) to a speaker
            name or ``None``.
        docName: Value written to the ``DocName`` column of every row.

    Returns:
        A list of dataframes with the columns ``DocName``, ``Part``,
        ``TokenID``, ``Text``, ``Tag``, ``Speaker`` and ``Corefs`` (in that
        order). ``TokenID`` restarts at 0 in every sentence, ``Speaker`` is
        ``''`` and ``Corefs`` is ``'-'`` where nothing is known. Rows whose
        text is a newline are removed and the index is renumbered.
    """
    dataFramesDocument = []

    for partIndex, (corefAnnot, speakersTok) in enumerate(zip(corefAnnotDicts, dictSpeakerDocs)):
        doc = pdocs[partIndex]
        numTokens = len(doc)

        tokenIdByIndex = {}
        speakers = [''] * numTokens
        tokIndex = 0
        for sent in doc.sents:
            # Every token of a sentence takes the speaker recorded for the
            # position at which the sentence starts.
            sentSpeaker = speakersTok.get(tokIndex)
            idTok = 0
            for token in sent:
                tokenIdByIndex[token.i] = idTok
                # Newline tokens do not consume a token id; they are dropped
                # from the dataframe below.
                if token.text != '\n':
                    idTok += 1
                if sentSpeaker is not None:
                    speakers[tokIndex] = sentSpeaker
                tokIndex += 1

        corefs = []
        for j in range(numTokens):
            annotation = corefAnnot.get(str(j))
            corefs.append('-' if annotation is None else annotation)

        # Build every column once instead of writing cell by cell.
        partFrame = pd.DataFrame({
            'DocName': docName,
            'Part': pd.Series(partIndex, index=range(numTokens), dtype='int32'),
            'TokenID': [tokenIdByIndex[j] for j in range(numTokens)],
            'Text': [token.text for token in doc],
            'Tag': [token.tag_ for token in doc],
            'Speaker': speakers,
            'Corefs': corefs,
        })

        partFrame = partFrame[partFrame.Text != '\n'].reset_index(drop=True)
        dataFramesDocument.append(partFrame)
    return dataFramesDocument

In [10]:
# Run the whole pipeline for document ep-07-05-23-006.
# The statements depend on each other and must run in this order.

# Split the Swedish text into parts.
sweParts = readIntoParts('../Data/Datasets/Europarl/Documents/ep-07-05-23-006/text/ep-07-05-23-006-sv.txt')

# `tokenizer` loads the same model with tagger, parser and ner disabled; it is
# only used by extractSpeakers to count the tokens of each line.
# `nlp` is the full pipeline with the transformer-based NER.
# NOTE(review): per-token speaker lookup assumes both produce the same
# tokenization — confirm.
tokenizer = spacy.load('../Models/SwedishModel', disable=['tagger', 'parser', 'ner'])
nlp = loadModel('../Models/SwedishModel')

# Parse every part with the full pipeline.
docs = processDocs(nlp, sweParts)

# Map the loaded coref spans (module-level corefSpansDocument) onto mentions
# found in the parsed Swedish parts.
corefSpansSweDocument = translateCorefs(nlp, docs)

# Per part: token index -> speaker, from the Swedish XML and the links XML.
dictSpeakerTokensDoc = extractSpeakers(tokenizer, sweParts, '../Data/Datasets/Europarl/Documents/ep-07-05-23-006/xml/ep-07-05-23-006-sv.xml', '../Data/Datasets/Europarl/Documents/ep-07-05-23-006/xml/ep-07-05-23-006-links.xml')

# Per part: token index -> coref bracket annotation.
corefAnnotDict = createCorefAnnotDict(corefSpansSweDocument)

# One dataframe per part, combining tokens, tags, speakers and corefs.
dataframes = createDataframes(docs, corefAnnotDict, dictSpeakerTokensDoc)

Writes the dataframes to a single v4_gold_conll file

In [11]:
# Write every part dataframe to one CoNLL-style file: a "#begin document"
# header per part, one line per token, a blank line before each token whose
# TokenID is 0 (sentence start), and a "#end document" footer.
# The encoding is explicit so that Swedish characters do not depend on the
# platform's default locale.
with open('conll/ep-07-05-23-006-Allen.v4_gold_conll', 'w', encoding='utf-8') as conllFile:
    for df in dataframes:
        # The part number is read from the first row; a part without rows has
        # none, so it is skipped instead of raising IndexError.
        if df.empty:
            continue

        conllFile.write('#begin document (ep-07-05-23-006); part ' + str(df.iat[0, 1]) + '\n')
        for i in range(len(df)):
            if df.loc[i, 'TokenID'] == 0:
                conllFile.write('\n')
            conllFile.write(df.loc[i, 'DocName'] + '\t\t\t')
            conllFile.write(str(df.loc[i, 'Part']) + '\t\t\t')
            conllFile.write(str(df.loc[i, 'TokenID']) + '\t\t\t')
            conllFile.write('{:40s}'.format(df.loc[i, 'Text']))
            conllFile.write('{:40s}'.format(df.loc[i, 'Tag']))
            # Four columns that are not populated here.
            conllFile.write('-' + '\t\t\t' + '-' + '\t\t\t' + '-' + '\t\t\t' + '-' + '\t\t\t')
            conllFile.write('{:40s}'.format(df.loc[i, 'Speaker']))
            conllFile.write('-' + '\t\t\t')
            conllFile.write(df.loc[i, 'Corefs'])
            conllFile.write('\n')

        conllFile.write('\n' + '#end document' + '\n')



Statistics for how many coref-annotations were translated from the original
English document.

In [12]:
# Number of mentions kept in the translated (Swedish) clusters.
numOfCorefSwe = sum(
    1
    for dok in corefSpansSweDocument
    for cluster in dok
    for coref in cluster
)

# Number of mentions in the loaded coref spans.
numOfCorefEng = sum(
    1
    for dok in corefSpansDocument
    for cluster in dok
    for coref in cluster
)

# Mentions whose first element is -1. Inside a two-mention cluster each such
# mention counts as 2, otherwise as 1.
numberOfNoAli = sum(
    2 if len(cluster) == 2 else 1
    for dok in corefSpansDocument
    for cluster in dok
    for coref in cluster
    if coref[0] == -1
)

# Both divisions raise ZeroDivisionError when their denominator is 0.
transCorefs = numOfCorefSwe / numOfCorefEng
# NOTE(review): this divides by the number of non-aligned mentions; a ratio
# over the aligned ones would presumably use
# (numOfCorefEng - numberOfNoAli) — confirm the intended denominator.
transCorefInclAliErr = numOfCorefSwe / numberOfNoAli