In [1]:
import pickle
import requests
import math

In [2]:
# from https://cloud.gate.ac.uk/shopfront/displayItem/tagger-pos-pl-maxent1 - "A POS tagger for pl / Polish using the Universal 
# Dependencies POS tagset.
# This tagger is based on a simple maximum entropy model trained on the corpus from the universal dependencies collection using 
# the GATE Learning Framework plugin.
# The model is trained on all available corpora, except the test corpus. Evaluation on the test set gives 0.9456 accuracy. 
# Accuracy on out-of-vocabulary words (words not seen in the training set) is 0.9122 (case-sensitive) / 0.9174 (not 
# case-sensitive)."

# Path of the annotated corpus file that the notebook loads with retrieve_text(file).
file = 'memoirs_annotated_3k.txt'

# Endpoint of the tagging service that receives the POST requests.
URL = 'https://cloud-api.gate.ac.uk/process/tagger-pos-pl-maxent1'

# Despite the name these are HTTP headers (they are passed as the `headers`
# argument of TaggerAnnotations): plain-text request body, JSON response.
PARAMS = {
    'Content-Type': 'text/plain',
    'Accept': 'application/json',
}

In [3]:
def retrieve_text(filename, encoding='utf-8'):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII (e.g. Polish) characters are decoded the same way on
            every platform instead of depending on the locale default. Pass a
            different codec if the corpus was saved in another encoding.

    Returns:
        A list with one string per line of the file, each stripped of leading
        and trailing whitespace. Blank lines are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid in ``encoding``.
    """
    with open(filename, encoding=encoding) as f:
        # Iterating the file object yields the same lines as readlines().
        return [line.strip() for line in f]

In [15]:
# Load the corpus as a list of whitespace-stripped lines.
# NOTE(review): this cell's execution counter (15) is out of sequence with the
# cells around it; `text` must exist before the cells below can run on a fresh kernel.
text = retrieve_text(file)

In [5]:
class OriginalAnnotations:
    """Tokens and gold-standard tags parsed from ``word_TAG`` annotated lines.

    Each input line is split on single spaces. A leading ``[`` and a trailing
    ``]`` are removed from every piece, and the remainder is split on ``_``:
    the first part is stored as the token and the second part as its tag.

    Attributes:
        tokens: List of token strings, in input order.
        gold_standard: List of tag strings, index-aligned with ``tokens``.
    """

    def __init__(self, data):
        """Parse the annotated lines in ``data``.

        Args:
            data: Iterable of strings, each holding space-separated
                ``word_TAG`` items.

        Raises:
            IndexError: If a non-empty item contains no ``_`` separator.
        """
        self.tokens = []
        self.gold_standard = []

        # Iterate over the argument itself, not a notebook-level global, so the
        # class works on whatever the caller passes in.
        for sentence in data:
            # str.strip() returns a new string; its result has to be used.
            for annotated_token in sentence.strip().split(" "):
                if annotated_token.startswith('['):
                    annotated_token = annotated_token[1:]
                if annotated_token.endswith(']'):
                    annotated_token = annotated_token[:-1]
                # Blank lines, doubled spaces and lone brackets leave an empty
                # piece behind; there is nothing to record for those.
                if not annotated_token:
                    continue
                split_token = annotated_token.split('_')
                # Look up both parts before appending so the two lists can
                # never get out of step.
                word, tag = split_token[0], split_token[1]
                self.tokens.append(word)
                self.gold_standard.append(tag)

In [6]:
# Parse the loaded lines into parallel lists: original_text.tokens and
# original_text.gold_standard.
original_text = OriginalAnnotations(text)

In [12]:
class TaggerAnnotations:
    """Annotations obtained by POSTing tokens to a tagging web service.

    The tokens are joined with single spaces and sent as UTF-8 text, at most
    ``chunk_size`` tokens per request. Every entry under
    ``['entities']['Token']`` of each JSON response is collected.

    Attributes:
        processed_annotations: One list per returned token entry:
            ``[string, upos, LF_confidence, LF_target_list, LF_confidence_list]``.
        only_annotations: The ``upos`` value of every returned entry, in order.
        annotations: Parsed JSON of the most recent response, or ``None`` when
            no request was made (empty ``data``).
    """

    def __init__(self, url, headers, data, chunk_size=1000):
        """Send ``data`` to the service and collect the returned annotations.

        Args:
            url: Endpoint that receives the POST requests.
            headers: HTTP headers sent with every request.
            data: Sized, sliceable sequence of token strings.
            chunk_size: Maximum number of tokens sent in one request.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
            requests.HTTPError: If the service answers with an error status.
            KeyError: If a response lacks ``['entities']['Token']`` or an entry
                lacks one of the expected fields.
        """
        if chunk_size < 1:
            raise ValueError('chunk_size must be a positive integer')

        self.processed_annotations = []
        self.only_annotations = []
        self.annotations = None

        # Stepping by chunk_size guarantees that no request, including the
        # last one, carries more than chunk_size tokens, and that no request
        # is sent for an empty slice.
        for start in range(0, len(data), chunk_size):
            subset = ' '.join(data[start:start + chunk_size])
            r = requests.post(url=url, data=subset.encode('utf-8'), headers=headers)
            # Fail with the HTTP error itself rather than a later, less
            # informative JSON or KeyError.
            r.raise_for_status()
            self.annotations = r.json()
            for entry in self.annotations['entities']['Token']:
                self.__retrieve_anns(entry)

    def print_annotations(self):
        """Print ``string | upos | LF_confidence`` for every collected entry."""
        for entry in self.processed_annotations:
            print(f'{entry[0]} | {entry[1]} | {entry[2]}')

    def __retrieve_anns(self, entry):
        """Store the fields of one response token entry.

        Args:
            entry: Mapping with the keys ``string``, ``upos``,
                ``LF_confidence``, ``LF_target_list`` and ``LF_confidence_list``
                (presumably the tagger's candidate tags and their confidences;
                verify against the service documentation).
        """
        word = entry['string']
        pos = entry['upos']
        confidence = entry['LF_confidence']
        target_list = entry['LF_target_list']
        confidence_list = entry['LF_confidence_list']

        annotation = [word, pos, confidence, target_list, confidence_list]
        self.processed_annotations.append(annotation)
        self.only_annotations.append(pos)
        

In [13]:
# Send the original tokens to the tagging service (performs network requests)
# and collect the annotations it returns.
tagger_anns_1 = TaggerAnnotations(URL, PARAMS, original_text.tokens)

In [14]:
# Sanity check: the token strings returned by the tagger should line up with
# the original tokens position by position.
tagger_words = [annotation[0] for annotation in tagger_anns_1.processed_annotations]

# zip() stops at the shorter sequence, so a tagger result that is longer than
# the original token list cannot raise IndexError here.
for i, (tagger_word, original_word) in enumerate(zip(tagger_words, original_text.tokens)):
    if tagger_word != original_word:
        print(f'mismatch at {i}: {tagger_word} and {original_word}')
        print()

# A difference in length is otherwise invisible (or was a crash), so report it.
if len(tagger_words) != len(original_text.tokens):
    print(f'length mismatch: {len(tagger_words)} tagger tokens and {len(original_text.tokens)} original tokens')