# GATE CLOUD UD TAGGER EVALUATION

### IMPORTS, VARIABLES

In [1]:
import requests
import math
import sklearn.metrics
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import time

pd.set_option('display.max_rows', None)

In [2]:
# from https://cloud.gate.ac.uk/shopfront/displayItem/tagger-pos-pl-maxent1 - "A POS tagger for pl / Polish using the Universal 
# Dependencies POS tagset.
# instructions on how to use the API from https://cloud.gate.ac.uk/info/help/online-api.html

URL = 'https://cloud-api.gate.ac.uk/process/tagger-pos-pl-maxent1'
PARAMS = {
    'Content-Type': 'text/plain',
    'Accept': 'application/json'
}

# because I am reusing code from an earlier project, it will be easier for me to import the data from
file = '../data/memoirs_10k_corrected.conllu'
test_file = '../data/ud-treebanks/UD_Polish-PDB/pl_pdb-ud-test.conllu'

In [3]:
from functions import *
from preproc_bert import remove_ranges

### FUNCTIONS AND CLASSES

In [4]:
class TaggerAnnotations:
    '''A class intended for retrieving, processing, and storing the annotations from an online tagger.
    
    Attributes:
        processed_annotations (list[list]): A list of lists, every element of which represents a number of values returned by the
            tagger. The values include the token, the POS tag, the confidence thereof, a list of possible targets and a list
            of those confidences.
        only_annotations (list[str]): A list containing only the POS tags obtained from the tagger.  
        data (list[str]): The original input data.
    '''
    def __init__(self, url: str, headers: dict, data: list):
        '''The __init__ method of the class.
        Retrieves the tagger annotations (in batches, if needed), constructs processed_annotations and only_annotations using
        the __retrieve_anns method.
        
        Args:
            url (str): The URL to which the query should be made.
            headers (dict): The parameters that the query requires.
            data (list[list[str]]): The original, unannotated data represented as a list of lists of strings.        
        '''
        self.processed_annotations = []
        self.only_annotations = []
        self.data = data      
    
        for i, sent in enumerate(tqdm(data, desc='Loading sentences...')):
            sent = ' '.join(sent)

            r = requests.post(url=url, data=sent.encode('utf-8'), headers=headers) 
            annotations = r.json()

            for entry in annotations['entities']['Token']:
                self.__retrieve_anns(entry)
                
        self.__sanity_check()      

    def print_annotations(self):
        '''A method of the class which prints out all words with their annotation and the confidence thereof.
        '''
        for entry in self.processed_annotations:
            print(entry[0] + ' | ' + entry[1] + ' | ' + str(entry[2]))
            
    def __retrieve_anns(self, entry: dict):
        '''A method of the class which reads the data returned by the tagger and stores the relevant elements in appropriate
        lists.
        
        Args:
            entry (dict): An entry returned by the tagger.
        '''
        word = entry['string']
        pos = entry['upos']
        confidence = entry['LF_confidence']
        target_list = entry['LF_target_list']
        confidence_list = entry['LF_confidence_list']

        annotation = [word, pos, confidence, target_list, confidence_list]
        self.processed_annotations.append(annotation)
        self.only_annotations.append(pos)
        
    def __sanity_check(self):
        '''A method of the class that allows for the fixing of length mismatched between the input and the output due
        to the tagger tokenizing the text differently than the annotator. The annotation lists only preserve the annotation
        corresponding to the last part of the mistokenized item, as that one most commonly defines the word class. This works 
        up to mistokenizations of the length of 3.
        '''
        for i, token in enumerate([x for sentence in self.data for x in sentence]):
            if self.processed_annotations[i][0] != token:
                if self.processed_annotations[i][0] + self.processed_annotations[i+1][0] == token:
                    del self.processed_annotations[i]
                    del self.only_annotations[i]
                elif self.processed_annotations[i][0] + self.processed_annotations[i+1][0] + self.processed_annotations[i+2][0] == token:
                    del self.processed_annotations[i]
                    del self.only_annotations[i]
                    del self.processed_annotations[i+1]
                    del self.only_annotations[i+1]
        
    def __len__(self):
        '''The __len__ magic method of the class.
            
        Returns:
            The length of self.only_annotations, which should be identical to the length of self.processed_annotations.
        '''
        return len(self.only_annotations)
        
    def __getitem__(self, index: int):
        '''The __getitem__ magic method of the class.
            
        Args:
            index (int): The index signifying the desired element.
            
        Returns:
            A string representing the combination of the original token and the tagger annotation.
        '''
        full_ann = self.processed_annotations[index]
        token = full_ann[0]
        annotation = full_ann[1]
        item = '_'.join([token, annotation])
            
        return item

### EXECUTION - STANDARD

In [5]:
#test_tokens_features = extract_conllu_data(test_file, 'upos', sentences=False, combined=True, fulltext=False)
#test_tokens, test_features = make_tagger_friendly(test_tokens_features)

In [6]:
#print(len(test_tokens))
# due to limitations of the tagger, giving it more than 30k tokens is impossible within one notebook - even across separate
# instances of the class object; therefore, only a chunk of the test data can be fed to the tagger 
#test_tokens = test_tokens[:20000]
#test_features = test_features[:20000]

In [7]:
#test_tagger_anns = TaggerAnnotations(URL, PARAMS, test_tokens)

In [8]:
#get_measures([x for sentence in test_features for x in sentence], tagger_anns.only_annotations, details=True)

### EXECUTION - PREDICTIONS

In [9]:
tokens_features = extract_conllu_data(file, 'upos', sentences=True, combined=True, fulltext=False)
tokens, features = make_tagger_friendly(tokens_features)

In [10]:
tagger_anns = TaggerAnnotations(URL, PARAMS, tokens)

Loading sentences...:   8%|████▊                                                       | 29/360 [00:13<02:30,  2.20it/s]


KeyError: 'entities'

In [None]:
get_measures([x for sentence in features for x in sentence], tagger_anns.only_annotations, details=True)

In [None]:
comparison = get_comparison(
    [x for sentence in features for x in sentence], 
    tagger_anns.only_annotations, 
    [x for sentence in tokens for x in sentence], 
    confidence=[x[2] for x in tagger_anns.processed_annotations])
    
comparison.to_excel('../data/mistakes/UD_UPOS_mistakes.xlsx')

In [None]:
comparison