In [1]:
import spacy
import pickle
import neuralcoref.train.document as dc
from xml.etree import ElementTree
import pandas as pd

In [2]:
# Load the pre-computed coreference spans (one entry per document part; each
# entry is later iterated as clusters of mentions).
# NOTE: pickle can execute arbitrary code while loading -- only open files
# that were produced by a trusted run of this project.
corefSpansPath = 'pickleObjects/corefSpansObject'
with open(corefSpansPath, 'rb') as pickleFile:
    corefSpansDocument = pickle.load(pickleFile)

In [3]:
# Split the Swedish text file into consecutive parts of LINES_PER_PART lines.
# Each entry of sweParts is the concatenation of the raw lines (newlines kept).
LINES_PER_PART = 18

sweParts = []
partLines = []
# Decode explicitly as UTF-8: without it the platform default encoding is
# used, which can corrupt or reject Swedish characters on non-UTF-8 locales.
with open('../Data/Datasets/Europarl/Documents/ep-11-06-23-004/text/ep-11-06-23-004-sv.txt',
          encoding='utf-8') as f:
    for line in f:
        partLines.append(line)
        if len(partLines) == LINES_PER_PART:
            sweParts.append(''.join(partLines))
            partLines = []

# Lines left over after the last complete part are NOT added to sweParts
# (same as before); they are only kept here for inspection.
# NOTE(review): confirm that dropping a trailing partial part is intended.
sweCorp = ''.join(partLines)

In [4]:
# Load the Swedish pipeline and parse every text part into its own Doc,
# keeping the same order as sweParts.
nlp = spacy.load('../Models/SwedishModel')
docs = [nlp(part) for part in sweParts]

In [5]:
# Project every coreference mention onto a mention span detected in the
# Swedish text.
#
# corefSpansDocument is paired part-by-part with docs. Each mention `ment` is
# read as token offsets into `doc` (ment[0] = first token, ment[-1] = end
# bound); ment[0] == -1 marks a mention without a position and is skipped.
# For every other mention the text line around it is re-parsed, candidate
# mentions are extracted with neuralcoref's extract_mentions_spans, and the
# longest candidate lying inside [ment[0], ment[-1]] is kept as a
# (start, end) pair of doc-level token offsets.
#
# Result: corefSpansSweDocument[part] is a list of clusters, each cluster a
# list of unique (start, end) tuples with at least two members.
corefSpansSweDocument = []

for corefSpans, doc in zip(corefSpansDocument, docs):
    numOfSingCoref = 0  # clusters dropped because fewer than two spans survived
    corefSpansSwe = []
    docLength = len(doc)
    # (startOfLine, endOfLine) -> candidate mention spans of that line, so a
    # line shared by several mentions is parsed only once.
    lineMentionsCache = {}

    for clusterSub in corefSpans:
        corefClusterSwe = []

        for ment in clusterSub:
            if ment[0] == -1:
                continue

            # Walk back to the newline token that precedes the mention.
            startOfLine = ment[0]
            while startOfLine > 0 and doc[startOfLine].text != '\n':
                startOfLine -= 1
            # Step over the newline itself. When the scan stopped at token 0
            # without finding one, token 0 belongs to the line and is kept
            # (unconditionally adding 1 would drop the first token of the part).
            if doc[startOfLine].text == '\n':
                startOfLine += 1

            # Walk forward to the newline that closes the line; stop at the
            # end of the doc if the last line has no trailing newline.
            endOfLine = ment[-1]
            while endOfLine < docLength and doc[endOfLine].text != '\n':
                endOfLine += 1

            lineKey = (startOfLine, endOfLine)
            if lineKey not in lineMentionsCache:
                lineText = ' '.join(doc[tok].text for tok in range(startOfLine, endOfLine))
                lineDoc = nlp(lineText)
                lineMentionsCache[lineKey] = list(dc.extract_mentions_spans(lineDoc, []))
            mentionsInLine = lineMentionsCache[lineKey]

            # Candidate offsets are relative to the line; shift them by
            # startOfLine to compare against the doc-level mention bounds.
            mentionsInSpan = [
                span for span in mentionsInLine
                if (span.start + startOfLine) >= ment[0] and (span.end + startOfLine) <= ment[-1]
            ]
            if not mentionsInSpan:
                continue

            # Longest candidate wins; max() keeps the first one on ties.
            maxSpan = max(mentionsInSpan, key=lambda span: span.end - span.start)
            corefClusterSwe.append([maxSpan.start + startOfLine, maxSpan.end + startOfLine])

        # De-duplicate BEFORE the size check so that a cluster whose mentions
        # all collapsed onto the same span is not emitted with one member.
        # Sorting makes the order of spans within a cluster deterministic.
        res = sorted(set(tuple(sorted(sub)) for sub in corefClusterSwe))
        if len(res) > 1:
            corefSpansSwe.append(res)
        else:
            numOfSingCoref += 1

    corefSpansSweDocument.append(corefSpansSwe)


In [6]:
# Map every sentence id in the Swedish XML to the NAME of the SPEAKER element
# that contains it (spaces in the name replaced by underscores).
svTree = ElementTree.parse('../Data/Datasets/Europarl/Documents/ep-11-06-23-004/xml/ep-11-06-23-004-sv.xml')
dictSpeakerSents = {
    sent.attrib['id']: speaker.attrib['NAME'].replace(' ', '_')
    for speaker in svTree.getroot().iter('SPEAKER')
    for sent in speaker.iter('s')
}

# Map alignment links to speakers. The ordinal of each <link> element is used
# as the line number key.
dictSpeakerLines = {}
linksTree = ElementTree.parse('../Data/Datasets/Europarl/Documents/ep-11-06-23-004/xml/ep-11-06-23-004-links.xml')

for lineNumber, link in enumerate(linksTree.getroot().iter('link')):
    # xtargets holds two ';'-separated lists of sentence ids; the second list
    # is the one looked up in dictSpeakerSents (presumably the Swedish side).
    sweLinks = link.attrib['xtargets'].split(';')[1].split()

    # A line gets a speaker only if every linked sentence has one; when there
    # are several sentences, the speaker of the last one is stored.
    if sweLinks and all(sentId in dictSpeakerSents for sentId in sweLinks):
        dictSpeakerLines[lineNumber] = dictSpeakerSents[sweLinks[-1]]

# For every part, map each token offset to the speaker of the line the token
# is on (None when the line has no entry in dictSpeakerLines).
# NOTE(review): token offsets are obtained by running nlp on each line
# separately and summing the lengths; this assumes the per-line tokenisation
# adds up to the tokenisation of the whole part -- confirm.
dictSpeakerTokensDocument = []
for partIndex, swePart in enumerate(sweParts):
    # Re-attach the newline to every line and discard the final piece that
    # split('\n') yields after the last newline.
    sweLines = [piece + '\n' for piece in swePart.split('\n')][:-1]

    dictSpeakerTokens = {}
    tokenIndex = 0
    for lineIndex, line in enumerate(sweLines):
        # 18 is the number of lines per part; partIndex * 18 + lineIndex is
        # the line's position in the whole file.
        lineSpeaker = dictSpeakerLines.get(partIndex * 18 + lineIndex)
        tokensInLine = len(nlp(line))
        for offset in range(tokensInLine):
            dictSpeakerTokens[tokenIndex + offset] = lineSpeaker
        tokenIndex += tokensInLine

    dictSpeakerTokensDocument.append(dictSpeakerTokens)

In [7]:
def _appendCorefTag(annotations, key, tag):
    """Store tag under key, joining it with '|' to a tag that is already there."""
    previous = annotations.get(key)
    annotations[key] = tag if previous is None else previous + '|' + tag


# Turn the (start, end) spans of every cluster into per-token coreference
# tags, keyed by the token offset as a string:
#   one-token span   -> '(id)' on that token
#   longer span      -> '(id' on the first token and 'id)' on token end - 1
# Cluster ids start at 1 and restart for every part.
corefDictsDocument = []

for corefSpans in corefSpansSweDocument:
    corefDict = {}
    for corefClusterId, cluster in enumerate(corefSpans, start=1):
        label = str(corefClusterId)
        for span in cluster:
            first, stop = span[0], span[1]
            if stop - first == 1:
                _appendCorefTag(corefDict, str(first), '(' + label + ')')
            else:
                _appendCorefTag(corefDict, str(first), '(' + label)
                _appendCorefTag(corefDict, str(stop - 1), label + ')')
    corefDictsDocument.append(corefDict)


In [8]:
# Build one DataFrame per part with one row per token (newline tokens are
# removed at the end). Columns, in order:
#   DocName, Part, TokenID, Text, Tag, Speaker, Corefs
# The columns are collected as plain lists and the frame is created once,
# instead of filling a pre-allocated frame cell by cell with .at.
dataFramesDocument = []

for i, (corefAnnot, speakersTok) in enumerate(zip(corefDictsDocument, dictSpeakerTokensDocument)):
    partDoc = docs[i]
    numTokens = len(partDoc)

    tokenIds = [0] * numTokens
    speakers = [''] * numTokens
    for sent in partDoc.sents:
        # Every token of a sentence gets the speaker recorded for the
        # sentence's first token ('' when there is none).
        sentSpeaker = speakersTok.get(sent.start)
        idTok = 0
        for token in sent:
            tokenIds[token.i] = idTok
            if sentSpeaker is not None:
                speakers[token.i] = sentSpeaker
            # Newline tokens do not consume an id; they are dropped below.
            if token.text != '\n':
                idTok += 1

    corefs = []
    for j in range(numTokens):
        tag = corefAnnot.get(str(j))  # corefAnnot is keyed by str(token offset)
        corefs.append('-' if tag is None else tag)

    partFrame = pd.DataFrame({
        'DocName': ['ep-11-06-23-004'] * numTokens,
        'Part': pd.Series(i, index=range(numTokens), dtype='int32'),
        'TokenID': tokenIds,
        'Text': [token.text for token in partDoc],
        'Tag': [token.tag_ for token in partDoc],
        'Speaker': speakers,
        'Corefs': corefs,
    })

    partFrame = partFrame[partFrame['Text'] != '\n'].reset_index(drop=True)

    dataFramesDocument.append(partFrame)

In [9]:
# Write all parts to a single CoNLL-style file. Each part is wrapped in
# '#begin document ...' / '#end document' lines, a blank line precedes every
# token whose TokenID is 0, and each token is written as one row.
with open('conll/ep-11-06-23-004.v4_gold_conll', 'w') as file:
    gap = '\t\t\t'
    for df in dataFramesDocument:
        # Column 1 is 'Part'; its first value labels the whole part.
        file.write('#begin document (ep-11-06-23-004); part ' + str(df.iat[0, 1]) + '\n')

        for row in df.itertuples(index=False):
            if row.TokenID == 0:
                file.write('\n')
            fields = [
                row.DocName + gap,
                str(row.Part) + gap,
                str(row.TokenID) + gap,
                '{:40s}'.format(row.Text),
                '{:40s}'.format(row.Tag),
                ('-' + gap) * 4,  # four placeholder columns
                '{:40s}'.format(row.Speaker),
                '-' + gap,
                row.Corefs,
                '\n',
            ]
            file.write(''.join(fields))

        file.write('\n' + '#end document' + '\n')
