In [40]:
import pickle
import requests

In [41]:
# from https://cloud.gate.ac.uk/shopfront/displayItem/tagger-pos-pl-maxent1 - "A POS tagger for pl / Polish using the Universal 
# Dependencies POS tagset.
# This tagger is based on a simple maximum entropy model trained on the corpus from the universal dependencies collection using 
# the GATE Learning Framework plugin.
# The model is trained on all available corpora, except the test corpus. Evaluation on the test set gives 0.9456 accuracy. 
# Accuracy on out-of-vocabulary words (words not seen in the training set) is 0.9122 (case-sensitive) / 0.9174 (not 
# case-sensitive)."

# Endpoint that every tagging request in this notebook is posted to.
URL = 'https://cloud-api.gate.ac.uk/process/tagger-pos-pl-maxent1'

# Despite the name, this dict is handed to TaggerAnnotations as its `headers`
# argument: plain text goes in, JSON is requested back.
PARAMS = {'Content-Type': 'text/plain', 'Accept': 'application/json'}

# Annotated corpus file that is loaded with retrieve_text().
file = 'memoirs_annotated_3k.txt'

In [58]:
def retrieve_text(filename, split_lines=True, encoding='utf-8'):
    """Read a text file and return it either line by line or as one string.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    split_lines : bool, optional
        If true (the default), return a list with one entry per line, each
        stripped of leading and trailing whitespace. Otherwise return a single
        string in which the lines are joined by one space; in that mode only
        newline characters are stripped from each line.
    encoding : str, optional
        Encoding used to decode the file. Defaults to UTF-8 so that non-ASCII
        text is decoded the same way regardless of the platform's locale.

    Returns
    -------
    list of str or str
        The lines of the file, or the joined text, depending on `split_lines`.
    """
    # Without an explicit encoding, open() falls back to the locale's
    # preferred encoding, which is not UTF-8 on every platform.
    with open(filename, encoding=encoding) as f:
        lines = f.readlines()

    if split_lines:
        return [line.strip() for line in lines]
    return " ".join(line.strip('\n') for line in lines)

In [64]:
# Read the annotated corpus as a list of stripped lines and dump it for a quick visual check.
text = retrieve_text(filename=file, split_lines=True)
print(text)

['Dziad_NOUN mój_DET Melchior_PROPN urodzony_ADJ roku_NOUN 1741_ADJ we_ADP wsi_NOUN Godziszewo_PROPN parafii_NOUN Rypnin_PROPN syn_NOUN komornika_NOUN ziemskiego_ADJ Jana_PROPN Komornika_NOUN ziemi_NOUN Dobrrzyńskiej_PROPN (_PUNCT a_CCONJ nie_PART Antoniego_PROPN –_PUNCT Antoni_PROPN był_AUX starszym_ADJ bratem_NOUN [ś_X ._PUNCT p_X ._PUNCT] Melchiora_PROPN –_PUNCT i_CCONJ nie_PART rozumiem_VERB skąd_ADV się_PRON wzięła_VERB ta_DET myłka_NOUN ,_PUNCT która_DET później_ADV wdokumentach_NOUN się_PRON powtarza_VERB )_PUNCT i_CCONJ matki_NOUN Ewy_PROPN z_ADP Pinińskich_PROPN właścicieli_NOUN Dóbr_NOUN Strużewo_PROPN z_ADP adlinencjami_NOUN Puszczanki_PROPN ,_PUNCT Dąbrowy_PROPN części_NOUN ,_PUNCT Ronantowizna_PROPN ,_PUNCT Żółtowizna_PROPN ,_PUNCT Będowszczyazna_PROPN ,_PUNCT Kmińszczyzna_PROPN ,_PUNCT Bęklowizna_PROPN ,_PUNCT Ruszkowizna_PROPN ._PUNCT', 'A_CCONJ że_SCONJ śp_X Dziad_NOUN mój_DET był_AUX najmłodszy_ADJ i_CCONJ piąty_ADJ z_ADP pomiędzy_ADP Rodzeństwa_NOUN –_PUNCT odstąpił_V

In [75]:
class OriginalAnnotations:
    """Tokens and gold-standard tags parsed from ``token_TAG`` annotated text.

    The constructor fills two parallel lists:

    * ``tokens`` - the token strings, in reading order;
    * ``gold_standard`` - the tag attached to each token.
    """

    def __init__(self, data):
        """Parse annotated sentences into tokens and their tags.

        Parameters
        ----------
        data : list of str or str
            Sentences made of space-separated ``token_TAG`` items. A single
            string is treated as one sentence. One leading ``[`` and one
            trailing ``]`` are removed from an item before it is split.

        Raises
        ------
        ValueError
            If an item contains no ``_`` separating the token from its tag.
        """
        self.tokens = []
        self.gold_standard = []

        # A bare string would otherwise be iterated character by character.
        if isinstance(data, str):
            data = [data]

        # Iterate the argument, not a notebook-level global.
        for sentence in data:
            for annotated_token in sentence.strip().split(" "):
                # Consecutive spaces or blank lines produce empty items.
                if not annotated_token:
                    continue
                if annotated_token.startswith('['):
                    annotated_token = annotated_token[1:]
                if annotated_token.endswith(']'):
                    annotated_token = annotated_token[:-1]

                # The tag follows the LAST underscore, so tokens that contain
                # underscores themselves stay intact.
                token, separator, tag = annotated_token.rpartition('_')
                if not separator:
                    raise ValueError(
                        "Item {!r} is not in 'token_TAG' form".format(annotated_token)
                    )
                self.tokens.append(token)
                self.gold_standard.append(tag)

In [76]:
# Split the annotated corpus into parallel token and gold-tag lists.
original_text = OriginalAnnotations(data=text)

In [87]:
class TaggerAnnotations:
    """Annotations obtained by posting text to a tagging web service.

    After construction the instance exposes:

    * ``annotations`` - the decoded JSON response;
    * ``processed_annotations`` - one ``[word, pos, confidence, target_list,
      confidence_list]`` list per entry of ``entities -> Token``;
    * ``only_annotations`` - just the ``upos`` value of each entry.
    """

    def __init__(self, url, headers, data, timeout=60):
        """Send `data` to the service and collect the returned token entries.

        Parameters
        ----------
        url : str
            Address the text is POSTed to.
        headers : dict
            HTTP headers sent with the request.
        data : list of str or str
            Text to tag. A list is joined with single spaces first.
        timeout : float or tuple, optional
            Timeout in seconds handed to ``requests.post`` so that a stalled
            connection cannot block forever. ``None`` disables it.

        Raises
        ------
        requests.HTTPError
            If the service answers with a 4xx/5xx status.
        KeyError
            If the response lacks ``entities -> Token`` or an entry lacks one
            of the expected fields.
        """
        if isinstance(data, list):
            data = ' '.join(data)

        r = requests.post(
            url=url,
            data=data.encode('utf-8'),
            headers=headers,
            timeout=timeout,
        )
        # Fail on HTTP errors here instead of with an opaque KeyError or JSON
        # decoding error further down.
        r.raise_for_status()
        self.annotations = r.json()

        self.processed_annotations = []
        self.only_annotations = []
        for entry in self.annotations['entities']['Token']:
            pos = entry['upos']
            # NOTE(review): the LF_* fields presumably hold the model's
            # confidence and its candidate tags - confirm against the service docs.
            annotation = [
                entry['string'],
                pos,
                entry['LF_confidence'],
                entry['LF_target_list'],
                entry['LF_confidence_list'],
            ]
            self.processed_annotations.append(annotation)
            self.only_annotations.append(pos)

    def print_annotations(self):
        """Print one ``word | pos | confidence`` line per processed entry."""
        for entry in self.processed_annotations:
            print(' | '.join([entry[0], entry[1], str(entry[2])]))
        

In [91]:
# Tag the first 1000 tokens. NOTE(review): the corpus is sent in slices, presumably to keep each request small - confirm.
tagger_anns_1 = TaggerAnnotations(url=URL, headers=PARAMS, data=original_text.tokens[:1000])

In [92]:
# Tag tokens 1000 up to (not including) 2000.
tagger_anns_2 = TaggerAnnotations(url=URL, headers=PARAMS, data=original_text.tokens[1000:2000])

In [93]:
# Tag everything from token 2000 to the end of the corpus.
tagger_anns_3 = TaggerAnnotations(url=URL, headers=PARAMS, data=original_text.tokens[2000:])

Dziad | NOUN | 0.7880786605621269
mój | DET | 0.2910872818352366
Melchior | PROPN | 0.9882690037441527
urodzony | ADJ | 0.9838059834469768
roku | NOUN | 0.9893036802508473
1741 | ADJ | 0.8396864437394121
we | ADP | 0.8037721315042852
wsi | NOUN | 0.9555202276615747
Godziszewo | PROPN | 0.8356249362998734
parafii | NOUN | 0.8281642944816713
Rypnin | PROPN | 0.9761848889223718
syn | ADP | 0.44610724347153724
komornika | NOUN | 0.9921092663869862
ziemskiego | ADJ | 0.8841559648575619
Jana | PROPN | 0.9862540896222082
Komornika | PROPN | 0.9672533951057602
ziemi | VERB | 0.49472188150793944
Dobrrzyńskiej | PROPN | 0.6015370726961496
( | PUNCT | 0.9918210601889236
a | CCONJ | 0.9402611173855633
nie | PART | 0.9244890288853184
Antoniego | PROPN | 0.9069279540341488
– | PUNCT | 0.9999364511646827
Antoni | PROPN | 0.991452151960567
był | AUX | 0.9803619371237606
starszym | ADJ | 0.9942088789940817
bratem | NOUN | 0.9645196816687935
ś | X | 0.41795764186248896
. | PUNCT | 0.9999997391795165
p |