In [1]:
import pickle
import requests
import math
import sklearn.metrics
import pandas as pd

In [2]:
# from https://cloud.gate.ac.uk/shopfront/displayItem/tagger-pos-pl-maxent1 - "A POS tagger for pl / Polish using the Universal 
# Dependencies POS tagset.
# This tagger is based on a simple maximum entropy model trained on the corpus from the universal dependencies collection using 
# the GATE Learning Framework plugin.
# The model is trained on all available corpora, except the test corpus. Evaluation on the test set gives 0.9456 accuracy. 
# Accuracy on out-of-vocabulary words (words not seen in the trainin set) is 0.9122 (case-sensitive) / 0.9174 (not 
# case-sensitive)."

URL = 'https://cloud-api.gate.ac.uk/process/tagger-pos-pl-maxent1'
PARAMS = {
    'Content-Type': 'text/plain',
    'Accept': 'application/json'
}
file = 'memoirs_annotated_3k.txt'

In [3]:
def retrieve_text(filename):
    with open(filename) as f:
        lines = f.readlines()
        
        all_lines = []
        for line in lines:
            all_lines.append(line.strip())
        return all_lines

In [4]:
text = retrieve_text(file)

In [5]:
class OriginalAnnotations:
    def __init__(self, data):
        self.tokens = []
        self.gold_standard = []
        
        for sentence in text:
            sentence.strip()
            sentence = sentence.split(" ")
            for annotated_token in sentence:
                if annotated_token[0] == '[':
                    annotated_token = annotated_token[1:]
                if annotated_token[-1] == ']':
                    annotated_token = annotated_token[:-1]
                split_token = annotated_token.split('_')
                self.tokens.append(split_token[0])
                self.gold_standard.append(split_token[1])

In [6]:
original_text = OriginalAnnotations(text)

In [7]:
class TaggerAnnotations:
    def __init__(self, url, headers, data):
        
        self.processed_annotations = []
        self.only_annotations = []
        
        if len(data) > 1000:
            prev_i = 0
            for i in range(1, math.floor(len(data)/1000)):
                subset = data[prev_i*1000:i*1000]
                prev_i = i
                subset = ' '.join(subset)
                r = requests.post(url=url, data=subset.encode('utf-8'), headers=headers)
                annotations = r.json()
                for entry in annotations['entities']['Token']:
                    self.__retrieve_anns(entry)
                    
            subset = data[prev_i*1000:]
            subset = ' '.join(subset)
            r = requests.post(url=url, data=subset.encode('utf-8'), headers=headers)
            annotations = r.json()
            for entry in annotations['entities']['Token']:
                self.__retrieve_anns(entry)        
            
        
        else:
            data = ' '.join(data)

            r = requests.post(url=url, data=data.encode('utf-8'), headers=headers) 
            self.annotations = r.json()

            for entry in self.annotations['entities']['Token']:
                self.__retrieve_anns(entry)

    def print_annotations(self):
        for entry in self.processed_annotations:
            print(entry[0] + ' | ' + entry[1] + ' | ' + str(entry[2]))
            
    def __retrieve_anns(self, entry):
        word = entry['string']
        pos = entry['upos']
        confidence = entry['LF_confidence']
        target_list = entry['LF_target_list']
        confidence_list = entry['LF_confidence_list']

        annotation = [word, pos, confidence, target_list, confidence_list]
        self.processed_annotations.append(annotation)
        self.only_annotations.append(pos)
        

In [8]:
tagger_anns = TaggerAnnotations(URL, PARAMS, original_text.tokens)

In [9]:
for i in range(0,len(tagger_anns.processed_annotations)):
    if tagger_anns.processed_annotations[i][0] != original_text.tokens[i]:
        print(f'mismatch at {i}: {tagger_anns.processed_annotations[i][0]} and {original_text.tokens[i]}')
        print()

In [10]:
def get_measures(gold_standard, predictions):
    labels = list(set(gold_standard))
    print('MEASURES:')
    print(f'Accuracy: {sklearn.metrics.accuracy_score(gold_standard, predictions)}')
    print(f'Matthew\'s Correlation Coefficient: {sklearn.metrics.matthews_corrcoef(gold_standard, predictions)}')
    print()
    print('MEASURES PER CLASS:')
    precision = sklearn.metrics.precision_score(gold_standard, predictions, average=None, labels=labels)
    print('Precision:')
    for i in range(0,len(labels)):
        print(f'\t{labels[i]}: {precision[i]}')
    recall = sklearn.metrics.recall_score(gold_standard, predictions, average=None, labels=labels)
    print('Recall:')
    for i in range(0,len(labels)):
        print(f'\t{labels[i]}: {recall[i]}')
    print()
    

In [11]:
get_measures(original_text.gold_standard, tagger_anns.only_annotations)

MEASURES:
Accuracy: 0.8360856269113149
Matthew's Correlation Coefficient: 0.8139661161016065

MEASURES PER CLASS:
Precision:
	ADV: 0.6987951807228916
	NUM: 0.7368421052631579
	AUX: 0.7692307692307693
	SCONJ: 0.782608695652174
	PUNCT: 1.0
	NOUN: 0.8493506493506493
	PROPN: 0.659016393442623
	AD: 0.0
	VEBR: 0.0
	PART: 0.8222222222222222
	DET: 0.9538461538461539
	CCONJ: 0.9751552795031055
	PRON: 0.8289473684210527
	ADP: 0.9507246376811594
	VERB: 0.8416075650118203
	X: 0.6451612903225806
	ADJ: 0.6052631578947368
Recall:
	ADV: 0.7073170731707317
	NUM: 0.6511627906976745
	AUX: 0.746268656716418
	SCONJ: 0.5294117647058824
	PUNCT: 1.0
	NOUN: 0.7936893203883495
	PROPN: 0.9305555555555556
	AD: 0.0
	VEBR: 0.0
	PART: 0.5967741935483871
	DET: 0.40789473684210525
	CCONJ: 0.9515151515151515
	PRON: 0.875
	ADP: 0.9704142011834319
	VERB: 0.9035532994923858
	X: 0.5263157894736842
	ADJ: 0.773109243697479



  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
def get_problematic_anns(gold_standard, predictions, tokens):
    problematic = []
    for i in range(0, len(tokens)):
        
        if gold_standard[i] != predictions[i]:
            if i != 0:
                preceding = tokens[i-1]
            else:
                preceding = ''
                
            if i != len(tokens)-1:
                succeeding = tokens[i+1]
            else:
                succeeding = ''
                
            problematic.append((tokens[i], ' '.join([preceding, tokens[i], succeeding]), gold_standard[i], predictions[i]))
            
    problematic_frame = pd.DataFrame(problematic, columns=['Token', 'Context', 'Gold Standard', 'Prediction'])
    
    return problematic_frame

In [13]:
frame = get_problematic_anns(original_text.gold_standard, tagger_anns.only_annotations, original_text.tokens)

In [14]:
pd.set_option('display.max_rows', None)
frame

Unnamed: 0,Token,Context,Gold Standard,Prediction
0,syn,Rypnin syn komornika,NOUN,ADP
1,Komornika,Jana Komornika ziemi,NOUN,PROPN
2,ziemi,Komornika ziemi Dobrrzyńskiej,NOUN,VERB
3,właścicieli,Pinińskich właścicieli Dóbr,NOUN,VERB
4,Dóbr,właścicieli Dóbr Strużewo,NOUN,PROPN
5,śp,że śp Dziad,X,NOUN
6,Dziad,śp Dziad mój,NOUN,PROPN
7,mój,Dziad mój był,DET,VERB
8,najmłodszy,był najmłodszy i,ADJ,NOUN
9,piąty,i piąty z,ADJ,NOUN
