# Evaluating a Modern Polish POS-Tagger on Historical Data
### by Maria Irena Szawerna

### IMPORTS, VARIABLES, AND PARAMETERS

In [1]:
import pickle
import requests
import math
import sklearn.metrics
import pandas as pd
from tqdm import tqdm

pd.set_option('display.max_rows', None)

In [2]:
# from https://cloud.gate.ac.uk/shopfront/displayItem/tagger-pos-pl-maxent1 - "A POS tagger for pl / Polish using the Universal 
# Dependencies POS tagset.
# instructions on how to use the API from https://cloud.gate.ac.uk/info/help/online-api.html

URL = 'https://cloud-api.gate.ac.uk/process/tagger-pos-pl-maxent1'
PARAMS = {
    'Content-Type': 'text/plain',
    'Accept': 'application/json'
}
file = 'memoirs_annotated_3k.txt'

### FUNCTIONS AND CLASSES

In [3]:
def retrieve_text(filename: str):
    '''A function intended for opening and reading a text file, with lines being stripped of leading and trailing whitespace.
    
    Args:
        filename (str): The name of the file or path to the file that is to be opened.
        
    Returns:
        A list of strings representing the lines in the file, stripped of whitespace on the edges.
    '''
    with open(filename) as f:
        lines = f.readlines()
        
        all_lines = []
        for line in lines:
            all_lines.append(line.strip())
        return all_lines

In [4]:
class OriginalAnnotations:
    '''A class intended to process and store the tokens and their respective annotations from the original text.
    
    Attributes:
        tokens (list[str]): A list of all the tokens (without annotation) in the original data. Every element of the list is a string.
        gold_standard (list[str]): A list of all the original annotations (without tokens). Every element is a string.
    '''
    def __init__(self, data: list, lowercase: bool = False):
        '''The __init__ method of the class.
        Constructs the token and gold_standard lists.
        
        Args:
            data (list[str]): A list of annotated sentences. Every element of the list is a string representing the whole sentence.
            lowercase (bool): Determines whether the tokens should be lowercased or if original capitalization should be retained.
        '''
        self.tokens = []
        self.gold_standard = []
        
        for sentence in text:
            sentence.strip()
            sentence = sentence.split(" ")
            for annotated_token in sentence:
                if annotated_token[0] == '[':
                    annotated_token = annotated_token[1:]
                if annotated_token[-1] == ']':
                    annotated_token = annotated_token[:-1]
                split_token = annotated_token.split('_')
                if not lowercase:
                    self.tokens.append(split_token[0])
                else:
                    self.tokens.append(split_token[0].lower())
                self.gold_standard.append(split_token[1])
                
    def __len__(self):
        '''The __len__ magic method of the class.
            
        Returns:
            The length of self.tokens, which should be identical to the length of self.gold_standard.
        '''
        return len(self.tokens)
        
    def __getitem__(self, index: int):
        '''The __getitem__ magic method of the class.
            
        Args:
            index (int): The index signifying the desired element.
            
        Returns:
            A string representing the combination of the original token and annotation.
        '''
        token = self.tokens[index]
        annotation = self.gold_standard[index]
        item = '_'.join([token, annotation])
            
        return item

In [5]:
class TaggerAnnotations:
    '''A class intended for retrieving, processing, and storing the annotations from an online tagger.
    
    Attributes:
        processed_annotations (list[list]): A list of lists, every element of which represents a number of values returned by the
            tagger. The values include the token, the POS tag, the confidence thereof, a list of possible targets and a list
            of those confidences.
        only_annotations (list[str]): A list containing only the POS tags obtained from the tagger.    
    '''
    def __init__(self, url: str, headers: dict, data: list):
        '''The __init__ method of the class.
        Retrieves the tagger annotations (in batches, if needed), constructs processed_annotations and only_annotations using
        the __retrieve_anns method.
        
        Args:
            url (str): The URL to which the query should be made.
            headers (dict): The parameters that the query requires.
            data (list[str]): The original, unannotated data represented as a list of individual tokens (not sentences).        
        '''
        self.processed_annotations = []
        self.only_annotations = []
        
        if len(data) > 1000:  # this is required due to the tagger bottleneck
            prev_i = 0
            for i in range(1, math.floor(len(data)/1000)):
                subset = data[prev_i*1000:i*1000]
                prev_i = i
                subset = ' '.join(subset)
                r = requests.post(url=url, data=subset.encode('utf-8'), headers=headers)
                annotations = r.json()
                for entry in tqdm(annotations['entities']['Token'], desc='Loading annotations...'):
                    self.__retrieve_anns(entry)
                    
            subset = data[prev_i*1000:]
            subset = ' '.join(subset)
            r = requests.post(url=url, data=subset.encode('utf-8'), headers=headers)
            annotations = r.json()
            for entry in tqdm(annotations['entities']['Token'], desc='Loading annotations...'):
                self.__retrieve_anns(entry)        
            
        
        else:
            data = ' '.join(data)

            r = requests.post(url=url, data=data.encode('utf-8'), headers=headers) 
            self.annotations = r.json()

            for entry in tqdm(annotations['entities']['Token'], desc='Loading annotations...'):
                self.__retrieve_anns(entry)

    def print_annotations(self):
        '''A method of the class which prints out all words with their annotation and the confidence thereof.
        '''
        for entry in self.processed_annotations:
            print(entry[0] + ' | ' + entry[1] + ' | ' + str(entry[2]))
            
    def __retrieve_anns(self, entry: dict):
        '''A method of the class which reads the data returned by the tagger and stores the relevant elements in appropriate
        lists.
        
        Args:
            entry (dict): An entry returned by the tagger.
        '''
        word = entry['string']
        pos = entry['upos']
        confidence = entry['LF_confidence']
        target_list = entry['LF_target_list']
        confidence_list = entry['LF_confidence_list']

        annotation = [word, pos, confidence, target_list, confidence_list]
        self.processed_annotations.append(annotation)
        self.only_annotations.append(pos)
        
    def __len__(self):
        '''The __len__ magic method of the class.
            
        Returns:
            The length of self.only_annotations, which should be identical to the length of self.processed_annotations.
        '''
        return len(self.only_annotations)
        
    def __getitem__(self, index: int):
        '''The __getitem__ magic method of the class.
            
        Args:
            index (int): The index signifying the desired element.
            
        Returns:
            A string representing the combination of the original token and the tagger annotation.
        '''
        full_ann = self.processed_annotations[index]
        token = full_ann[0]
        annotation = full_ann[1]
        item = '_'.join([token, annotation])
            
        return item
        

In [6]:
def get_measures(gold_standard: list, predictions: list):
    '''A function intended for retrieving a selection of evaluation measures for comparing the gold standard and the tagger
    annotations. The measures are printed out and include accuracy, Matthew's Correlation Coefficient, per-class precision 
    and recall. These measures are calculated using functions from sklearn.
    
    Args:
        gold_standard (list[str]): A list of gold standard labels.
        predictions (list[str]): A list of predicted labels.
    '''
    labels = sorted(list(set(gold_standard)))
    print('MEASURES:')
    print(f'Accuracy: {sklearn.metrics.accuracy_score(gold_standard, predictions)}')
    print(f'Matthew\'s Correlation Coefficient: {sklearn.metrics.matthews_corrcoef(gold_standard, predictions)}')
    print()
    print('MEASURES PER CLASS:')
    precision = sklearn.metrics.precision_score(gold_standard, predictions, average=None, labels=labels)
    print('Precision:')
    for i in range(0,len(labels)):
        print(f'\t{labels[i]}: {precision[i]}')
    recall = sklearn.metrics.recall_score(gold_standard, predictions, average=None, labels=labels)
    print('Recall:')
    for i in range(0,len(labels)):
        print(f'\t{labels[i]}: {recall[i]}')
    print()
    

In [7]:
def get_problematic_anns(
    gold_standard: list, 
    predictions: list, 
    processed_annotations: list, 
    tokens: list
):
    '''A function intended for collecting the wrong annotations from the tagger and storing them in a DataFrame with the 
    original token, context, gold standard, prediction, and the confidence of that prediction.
    
    Args:
        gold_standard (list[str]): A list of gold standard tags represented as strings.
        predictions (list[str]): A list of predicted tags represented as strings.
        processed_annotations (list[list]): A list of lists where every element represents information about the annotation as
            obtained from the tagger.
        tokens (list[str]): A list of the original tokens.
        
    Returns:
        A DataFrame containing the original token, context, gold standard, prediction, and the confidence of that prediction 
        for every mismatched prediction and gold standard.
    '''
    problematic = []
    for i in range(0, len(tokens)):
        
        if gold_standard[i] != predictions[i]:
            if i != 0:
                preceding = tokens[i-1]
            else:
                preceding = ''
                
            if i != len(tokens)-1:
                succeeding = tokens[i+1]
            else:
                succeeding = ''
                
            problematic.append(
                (tokens[i], 
                 ' '.join([preceding, tokens[i], succeeding]), 
                 gold_standard[i], 
                 predictions[i], 
                 processed_annotations[i][2])
            )
            
    problematic_frame = pd.DataFrame(problematic, columns=['Token', 'Context', 'Gold Standard', 'Prediction', 'Confidence'])
    
    return problematic_frame

### RESULTS (non-lowercase)

In [8]:
# retrieving the lines from the annotated file
text = retrieve_text(file)

In [9]:
# creating an OriginalAnnotations object to store the tokens and original annotations
original_text = OriginalAnnotations(text)

In [10]:
# creating a TaggerAnnotations object to retrieve and store tagger annotations
tagger_anns = TaggerAnnotations(URL, PARAMS, original_text.tokens)

Loading annotations...: 100%|███████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 573619.26it/s]
Loading annotations...: 100%|███████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 488505.01it/s]
Loading annotations...: 100%|███████████████████████████████████████████████████| 1270/1270 [00:00<00:00, 507552.75it/s]


In [11]:
# sanity check
for i in range(0,len(tagger_anns.processed_annotations)):
    if tagger_anns.processed_annotations[i][0] != original_text.tokens[i]:
        print(f'mismatch at {i}: {tagger_anns.processed_annotations[i][0]} and {original_text.tokens[i]}')
        print()

In [12]:
# evaluating the tagger annotations
get_measures(original_text.gold_standard, tagger_anns.only_annotations)

MEASURES:
Accuracy: 0.8360856269113149
Matthew's Correlation Coefficient: 0.8139543271209075

MEASURES PER CLASS:
Precision:
	ADJ: 0.6052631578947368
	ADP: 0.9507246376811594
	ADV: 0.6987951807228916
	AUX: 0.7692307692307693
	CCONJ: 0.9751552795031055
	DET: 0.9538461538461539
	NOUN: 0.8493506493506493
	NUM: 0.7368421052631579
	PART: 0.8222222222222222
	PRON: 0.8289473684210527
	PROPN: 0.659016393442623
	PUNCT: 1.0
	SCONJ: 0.782608695652174
	VEBR: 0.0
	VERB: 0.8416075650118203
	X: 0.6451612903225806
Recall:
	ADJ: 0.7698744769874477
	ADP: 0.9704142011834319
	ADV: 0.7073170731707317
	AUX: 0.746268656716418
	CCONJ: 0.9515151515151515
	DET: 0.40789473684210525
	NOUN: 0.7936893203883495
	NUM: 0.6511627906976745
	PART: 0.5967741935483871
	PRON: 0.875
	PROPN: 0.9305555555555556
	PUNCT: 1.0
	SCONJ: 0.5294117647058824
	VEBR: 0.0
	VERB: 0.9035532994923858
	X: 0.5263157894736842



  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# constructing a frame containing problematic annotations
frame = get_problematic_anns(
    original_text.gold_standard, 
    tagger_anns.only_annotations, 
    tagger_anns.processed_annotations, 
    original_text.tokens
)

In [14]:
# printing out the frame
frame

Unnamed: 0,Token,Context,Gold Standard,Prediction,Confidence
0,syn,Rypnin syn komornika,NOUN,ADP,0.446107
1,Komornika,Jana Komornika ziemi,NOUN,PROPN,0.967253
2,ziemi,Komornika ziemi Dobrrzyńskiej,NOUN,VERB,0.494722
3,właścicieli,Pinińskich właścicieli Dóbr,NOUN,VERB,0.440838
4,Dóbr,właścicieli Dóbr Strużewo,NOUN,PROPN,0.997932
5,śp,że śp Dziad,X,NOUN,0.792097
6,Dziad,śp Dziad mój,NOUN,PROPN,0.780149
7,mój,Dziad mój był,DET,VERB,0.513002
8,najmłodszy,był najmłodszy i,ADJ,NOUN,0.540951
9,piąty,i piąty z,ADJ,NOUN,0.875564


In [15]:
# printing out the frame sorted by gold standard values
frame.sort_values('Gold Standard')

Unnamed: 0,Token,Context,Gold Standard,Prediction,Confidence
535,cały,już cały swój,ADJ,VERB,0.635845
470,zacny,– zacny tylko,ADJ,VERB,0.387036
472,klęczącym,służba klęczącym ze,ADJ,NOUN,0.691085
211,mało,także mało zostawało,ADJ,PRON,0.202186
112,Starsi,. Starsi bracia,ADJ,NUM,0.284293
328,Domnikalnego,reprezentanta Domnikalnego czyli,ADJ,PROPN,0.796519
218,sam,", sam kontuzjowany",ADJ,SCONJ,0.408355
220,nieżywy,jak nieżywy –,ADJ,NOUN,0.532033
221,cały,– cały pułk,ADJ,VERB,0.527686
325,urzędowey,manipulacyi urzędowey –,ADJ,NOUN,0.806289


### RESULTS (lowercase)

In [16]:
# creating an OriginalAnnotations object to store the tokens and original annotations, in lowercase
original_lowercase = OriginalAnnotations(text, lowercase=True)

In [17]:
# creating a TaggerAnnotations object to retrieve and store tagger annotations for lowercase tokens
tagger_lowercase = TaggerAnnotations(URL, PARAMS, original_lowercase.tokens)

Loading annotations...: 100%|███████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 434552.84it/s]
Loading annotations...: 100%|███████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 528382.97it/s]
Loading annotations...: 100%|███████████████████████████████████████████████████| 1270/1270 [00:00<00:00, 460354.86it/s]


In [18]:
# sanity check
for i in range(0,len(tagger_lowercase.processed_annotations)):
    if tagger_lowercase.processed_annotations[i][0] != original_lowercase.tokens[i]:
        print(f'mismatch at {i}: {tagger_lowercase.processed_annotations[i][0]} and {original_lowercase.tokens[i]}')
        print()

In [19]:
# evaluating the tagger annotations on lowercase tokens
get_measures(original_lowercase.gold_standard, tagger_lowercase.only_annotations)

MEASURES:
Accuracy: 0.8015290519877676
Matthew's Correlation Coefficient: 0.7742747490318258

MEASURES PER CLASS:
Precision:
	ADJ: 0.5170454545454546
	ADP: 0.9536231884057971
	ADV: 0.6781609195402298
	AUX: 0.7391304347826086
	CCONJ: 0.975
	DET: 0.96
	NOUN: 0.7333333333333333
	NUM: 0.6944444444444444
	PART: 0.8260869565217391
	PRON: 0.83125
	PROPN: 0.0
	PUNCT: 1.0
	SCONJ: 0.75
	VEBR: 0.0
	VERB: 0.8221709006928406
	X: 0.64
Recall:
	ADJ: 0.7615062761506276
	ADP: 0.9733727810650887
	ADV: 0.7195121951219512
	AUX: 0.7611940298507462
	CCONJ: 0.9454545454545454
	DET: 0.47368421052631576
	NOUN: 0.8944174757281553
	NUM: 0.5813953488372093
	PART: 0.6129032258064516
	PRON: 0.9236111111111112
	PROPN: 0.0
	PUNCT: 1.0
	SCONJ: 0.4411764705882353
	VEBR: 0.0
	VERB: 0.9035532994923858
	X: 0.42105263157894735



  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
# constructing a frame containing problematic annotations for lowercase tokens
frame_lower = get_problematic_anns(
    original_lowercase.gold_standard, 
    tagger_lowercase.only_annotations, 
    tagger_lowercase.processed_annotations, 
    original_lowercase.tokens
)

In [21]:
# printing out the frame
frame_lower

Unnamed: 0,Token,Context,Gold Standard,Prediction,Confidence
0,melchior,mój melchior urodzony,PROPN,NOUN,0.94252
1,godziszewo,wsi godziszewo parafii,PROPN,NOUN,0.760044
2,rypnin,parafii rypnin syn,PROPN,ADJ,0.428394
3,jana,ziemskiego jana komornika,PROPN,NOUN,0.96411
4,dobrrzyńskiej,ziemi dobrrzyńskiej (,PROPN,ADJ,0.96334
5,antoniego,nie antoniego –,PROPN,ADJ,0.959326
6,antoni,– antoni był,PROPN,NOUN,0.447708
7,melchiora,. melchiora –,PROPN,NOUN,0.935199
8,ewy,matki ewy z,PROPN,VERB,0.682664
9,pinińskich,z pinińskich właścicieli,PROPN,ADJ,0.81835


In [22]:
# printing out the frame sorted by gold standard values
frame_lower.sort_values('Gold Standard')

Unnamed: 0,Token,Context,Gold Standard,Prediction,Confidence
552,gołey,na gołey podłodze,ADJ,NOUN,0.732511
118,około,być około roku,ADJ,ADP,0.697646
257,drugi,– drugi jakiś,ADJ,NOUN,0.485233
350,oycowskiey,. oycowskiey .,ADJ,X,0.399155
420,nieźle,się nieźle i,ADJ,ADV,0.585674
513,dobrze,nie dobrze poszła,ADJ,ADV,0.927003
605,młodą,piękną młodą żoneczką,ADJ,NOUN,0.947394
286,nieżywy,jak nieżywy –,ADJ,NOUN,0.532033
421,lepiey,nieraz lepiey jak,ADJ,VERB,0.532286
370,urzędowa,pieczęć urzędowa jurysdykcyi,ADJ,NOUN,0.769623
