In [None]:
!pip install spacy[transformers]
import spacy_transformers

# Data preprocessing : CoNLL to SpaCy

In [None]:
import pandas as pd
import re
def load_data(path):
    tokens = []
    pos = []
    tags = []
    
    with open(path, 'r', encoding = 'utf-8') as f:
        lines = [line for line in f]
    
    for i in lines:
        if i == '\n':
            tokens.append(' ')
            pos.append(' ')
            tags.append(' ')
        else:
            temp = re.split('\t',i)
            tokens.append(temp[0])
            pos.append(temp[-2])
            tags.append(re.sub('\n','',temp[-1]))

    test = pd.DataFrame({'tokens':tokens,
                         'pos':pos,
                        'ner_tags':tags})
    
    return test

def token_to_sen(test):
    sen = []
    tag = []

    temp_sen = []
    temp_tag = []


    for i in range(len(test)):

        token = test['tokens'][i]
        pos = test['pos'][i]
        tags = test['ner_tags'][i]

        if pos != '':
          if token != '':
            temp_sen.append(token)
            temp_tag.append(tags)

        else:
          # if len(temp_sen) > 1:
            sen.append(temp_sen)
            tag.append(temp_tag)

            temp_sen = []
            temp_tag = []

    tokenized = pd.DataFrame({'tokens':sen,
                              'ner_tags':tag})
    
    return tokenized


test = load_data('deu_testb.txt')
tokenized = token_to_sen(test)
tokenized

In [None]:
def to_spacy_preprocess(tokenized):
    spacy_format = []

    for i in range(len(tokenized)):

        #get position of all tokens in the sentences

        position=[]
        count = 0

        for token in tokenized['tokens'][i]:

            position.append((token,count,count+len(token)))
            count = count+len(token)+1



        entities = []
        entity_candidate = []
        tag_type = 'none'

        for j in range(len(tokenized['ner_tags'][i])):
            tag = tokenized['ner_tags'][i][j]


            if len(entity_candidate) == 0:
                if tag != 'O':
                    tag_type = tag[2:]
                    start = position[j][1]
                    end = position[j][2]

                    entity_candidate.append([start,end,tag_type])



                    if j < len(tokenized['ner_tags'][i])-1:

                        next_tag = tokenized['ner_tags'][i][j+1]

                        if tag_type not in next_tag:
                            entities.append((entity_candidate[-1][0],entity_candidate[-1][1],entity_candidate[-1][2]))
                            entity_candidate = []
                            tag_type = 'none'

                    else:
                        entities.append((entity_candidate[-1][0],entity_candidate[-1][1],entity_candidate[-1][2]))
                        entity_candidate = []
                        tag_type = 'none'                    

            else:
                
                if tag_type in tag:
                    entity_candidate[-1][1] = position[j][2]                

                else:
                    entities.append((entity_candidate[-1][0],entity_candidate[-1][1],entity_candidate[-1][2]))
                    entity_candidate = []
                    tag_type = 'none'


        full_text = ' '.join(tokenized['tokens'][i])
        spacy_format.append([full_text,{'entities':entities}])

    return spacy_format

docs = to_spacy_preprocess(tokenized)

In [43]:
docs[1:5]

[['Parlament ohne Wiederrede', {'entities': []}],
 ['RODGAU .', {'entities': [(0, 6, 'LOC')]}],
 ['Die Sitzung der Stadtverordnetenversammlung von Freitag , 21. August 1992 , im Rathaus zu Jügesheim wird in die Geschichte der 15 Jahre jungen Stadt Rodgau als die erste ohne jede Widerrede eingehen .',
  {'entities': [(16, 43, 'ORG'), (79, 86, 'LOC'), (143, 155, 'LOC')]}],
 ['Die sechs Abgeordneten der Grünen , die seit Bildung einer Großen Koalition von Christ- und Sozialdemokraten die Opposition darstellen , hatten sich beim Vorsteher Rainer Bergert für ihr Fehlen entschuldigen lassen .',
  {'entities': [(27, 33, 'ORG'),
    (80, 87, 'ORG'),
    (92, 108, 'ORG'),
    (164, 178, 'PER')]}]]

# Functions for evaluation
ref: https://github.com/wjbmattingly/spacy_tutorials_3x/blob/main/02_02_formal_test.ipynb

In [None]:
import spacy
from spacy.training import offsets_to_biluo_tags

nlp = spacy.load("output/model-best")

In [71]:
def get_cleaned_label(label: str):
    if "-" in label:
        return label.split("-")[1]
    else:
        return label
    
def create_total_target_vector(docs):
    target_vector = []
    for doc in docs:
        new = nlp.make_doc(doc[0])
        entities = doc[1]["entities"]
        bilou_entities = offsets_to_biluo_tags(new, entities)
        final = []
        for item in bilou_entities:
            final.append(get_cleaned_label(item))
        target_vector.extend(final)
    return target_vector

In [32]:
def create_prediction_vector(text):
    return [get_cleaned_label(prediction) for prediction in get_all_ner_predictions(text)]

def create_total_prediction_vector(docs: list):
    prediction_vector = []
    for doc in docs:
        prediction_vector.extend(create_prediction_vector(doc[0]))
    return prediction_vector

def get_all_ner_predictions(text):
    doc = nlp(text)
    entities = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
    bilou_entities = offsets_to_biluo_tags(doc, entities)
    return bilou_entities

In [51]:
def get_model_labels():
    return sorted(['O', 'ORG'])
def get_dataset_labels():
    return sorted(set(create_total_target_vector(docs)))

In [None]:
def get_dataset_labels():
    return sorted(set(create_total_target_vector(docs)))

In [78]:
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

def generate_confusion_matrix(docs): 
    # classes = sorted(set(create_total_target_vector(docs)))

    # print('-----------True Classes-------------')
    y_true = create_total_target_vector(docs)
    # print('\n\n\n-----------Predicted Classes-------------')
    y_pred = create_total_prediction_vector(docs)
    # print (y_true)
    # print (y_pred)
    labels = ['O', 'PER', 'LOC', "ORG", "MISC"]
    cm = confusion_matrix(y_true, y_pred, labels = labels)
    df = pd.DataFrame(cm, index = labels, columns = labels)
    print(classification_report(y_true, y_pred, target_names=labels))
    return df

# generate_confusion_matrix(docs[:50])   

# Result

In [79]:
generate_confusion_matrix(docs)  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           O       0.00      0.00      0.00      1275
         PER       0.00      0.00      0.00       777
         LOC       0.90      1.00      0.95     45907
         ORG       0.54      0.02      0.04      1259
        MISC       0.00      0.00      0.00      1784

    accuracy                           0.90     51002
   macro avg       0.29      0.20      0.20     51002
weighted avg       0.82      0.90      0.85     51002



  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,O,PER,LOC,ORG,MISC
O,45900,0,0,7,0
PER,1784,0,0,0,0
LOC,1259,0,0,16,0
ORG,1230,0,0,29,0
MISC,775,0,0,2,0


In [2]:
!python -m spacy evaluate /content/drive/MyDrive/output_roberta/model-last conll_testb_spacy.spacy --gpu-id 0

[38;5;4mℹ Using GPU: 0[0m
tcmalloc: large alloc 1134411776 bytes == 0x78ab0000 @  0x7f119332d1e7 0x4d3280 0x5de162 0x60fd9f 0x5a9e9b 0x46eb22 0x616b6b 0x4f7ada 0x49ca7c 0x55e858 0x5d7cf1 0x49ec69 0x5d7c18 0x49caa1 0x4fe993 0x49ced5 0x55e571 0x5d7cf1 0x49ec69 0x5d7c18 0x49caa1 0x4fe993 0x49ced5 0x55e858 0x5d7cf1 0x5d77c6 0x561051 0x55e858 0x5d7cf1 0x49caa1 0x55e858
[1m

TOK     100.00
NER P   57.45 
NER R   0.74  
NER F   1.47  
SPEED   2768  

[1m

           P      R      F
LOC     0.00   0.00   0.00
ORG    57.45   3.50   6.59
PER     0.00   0.00   0.00
MISC    0.00   0.00   0.00

