# MORFEUSZ & CONCRAFT - PREANNOTATION

### IMPORTS, VARIABLES

In [1]:
# https://github.com/kawu/concraft-pl
# https://github.com/kawu/concraft-pl/tree/master/bindings/python

In [2]:
import sys
import pandas as pd
import os
sys.path.append('../concraft-pl/bindings/python/')

In [82]:
from morfeusz2 import Morfeusz
from concraft_pl2 import Concraft, Server

server = Server(model_path='../concraft-pl/model-SGJP.gz')
file = '../data/memoirs_annotated_10k.txt'
original_file = '../data/memoirs_10k.txt'

In [77]:
morfeusz = Morfeusz(expand_tags=True)
concraft = Concraft()

### FUNCTIONS AND CLASSES

In [5]:
def best_interpretation(dag_disamb: list):
    '''A function that allows for the selection of only the best possible morphosyntactic interpretation of a sentence as
    returned by Morfeusz2 + Concraft.
    
    Args:
        dag_disamb (list): A list of possible interpretations returned by Concraft based on Morfeusz2's analysis.
        
    Returns:
        A list containing only the highest probability interpretations for every token.    
    '''
    
    while("" in dag_disamb):
        dag_disamb.remove("")
    
    best_inter = []
    
    for item in dag_disamb:
        if item[0] == len(best_inter):
            best_inter.append(item)
        else:
            if item[3] > best_inter[-1][3]:
                best_inter[-1] = item
        
            
    return best_inter

In [192]:
class OriginalAnnotations:
    '''A class intended to process and store the tokens and their respective annotations from the original text.
    
    Attributes:
        tokens (list[str]): A list of all the tokens (without annotation) in the original data. Every element of the list is a string.
        gold_standard (list[str]): A list of all the original annotations (without tokens). Every element is a string.
        sentences (list[str]): A list of the original sentences (either truly original, if available, or reconstructed).
        sentences_tokenized (list[list[str]]): A list of lists representing tokenized, unannotated sentences.
        gold_standard_tokenized (list[list[str]]): A list of lists representing the gold standard annotations per sentence.
        
    '''
    def __init__(self, filename: str, lowercase: bool = False, original: str = '', nested: bool = True):
        '''The __init__ method of the class.
        Constructs the token and tag lists.
        
        Args:
            filename (str): The name of the file that the text and the annotations are to be obtained from.
            lowercase (bool): Determines whether the tokens should be lowercased or if original capitalization should be retained.
            original (str): The name of the file containing the original unannotated sentences. If left empty or invalid,
                reconstructed sentences will be used.
            nested (bool): Determines whether the output list contains nested sublists.
        '''
        # opening the annotated text
        text = []
        with open(filename) as f:
            lines = f.readlines()
            for line in lines:
                text.append(line.strip())     
        
        # setting up attributes
        self.tokens = []
        self.gold_standard = []
        
        self.sentences = []
        
        self.sentences_tokenized = []
        self.gold_standard_tokenized = []
        
        # reading in unannotated sentences, if applicable
        if os.path.exists(original):
            with open(original) as f:
                lines = f.readlines()
                for line in lines:
                    self.sentences.append(line.strip())
        else:
            original = ''
        
        # splitting tokens and annotation
        for sentence in text:
            # setting up temporary lists to store the values per sentence
            temp_sent = []
            temp_anns = []
            temp_conc = []
            temp_conc_ann = []
            # setting up a string for potential concatenation of split elements
            full_sent = ''
            # setting up variables controlling the extent of concatenation / grouping together of elements
            concatenate = False
            concatenate_next = False
            
            sentence = sentence.strip()
            sentence = sentence.split(" ")
            
            for annotated_token in sentence:
                
                if concatenate_next == True:
                    concatenate = False
                    concatenate_next = False
                
                split_token = annotated_token.split('_')
                
                if split_token[0][0] == '[':
                    split_token[0] = split_token[0][1:]
                    concatenate = True
                
                if split_token[-1].endswith('AUX]'):
                    split_token[-1] = split_token[-1][:-1]
                    concatenate = False
                
                if split_token[-1][-1] == ']':
                    split_token[-1] = split_token[-1][:-1]
                    concatenate_next = True
                    
                if concatenate:
                    full_sent += split_token[0]
                    temp_conc.append(split_token[0])
                    temp_conc_ann.append(split_token[1])
                else:
                    full_sent += (split_token[0] + ' ')
                    if len(temp_conc) != 0:
                        temp_sent.append(temp_conc)
                        temp_anns.append(temp_conc_ann)
                        temp_conc = []
                        temp_conc_ann = []
                
                if not lowercase:
                    self.tokens.append(split_token[0])
                    if not concatenate:
                        temp_sent.append(split_token[0])
                        temp_anns.append(split_token[1])
                else:
                    self.tokens.append(split_token[0].lower())
                    if not concatenate:
                        temp_sent.append(split_token[0].lower())
                        temp_anns.append(split_token[1])
                        
                self.gold_standard.append(split_token[1])
            
            if not lowercase:
                temp_sent.append(split_token[0])
                temp_anns.append(split_token[1])
            else:
                temp_sent.append(split_token[0].lower())
                temp_anns.append(split_token[1])
            self.sentences_tokenized.append(temp_sent)
            self.gold_standard_tokenized.append(temp_anns)
            
            # reconstructing sentences if necessary
            if original == '':
                # removing unnecessary whitespace
                full_sent = full_sent.replace(' .', '.')
                full_sent = full_sent.replace(' ,', ',')
                full_sent = full_sent.replace(' !', '!')
                full_sent = full_sent.replace(' ?', '?')
                full_sent = full_sent.replace(' :', ':')
                full_sent = full_sent.replace('„ ', '„')
                full_sent = full_sent.replace(' ”', '”')
                full_sent = full_sent.replace('( ', '(')
                full_sent = full_sent.replace(' )', ')')
                full_sent = full_sent.strip()
                self.sentences.append(full_sent)
                
        # removing the "nested" lists from tokenized sentences and their annotations that represent
        # elements that should go together
        if not nested:
            for i, sent in enumerate(self.sentences_tokenized):
                temp_sent = []
                for token in sent:
                    if isinstance(token, str):
                        temp_sent.append(token)
                    else:
                        for element in token:
                            temp_sent.append(element)
                self.sentences_tokenized[i] = temp_sent
                
            for i, sent in enumerate(self.gold_standard_tokenized):
                temp_ann = []
                for token in sent:
                    if isinstance(token, str):
                        temp_ann.append(token)
                    else:
                        for element in token:
                            temp_ann.append(element)
                self.gold_standard_tokenized[i] = temp_ann
                
    def __len__(self):
        '''The __len__ magic method of the class.
            
        Returns:
            The length of self.tokens, which should be identical to the length of self.gold_standard.
        '''
        return len(self.tokens)
        
    def __getitem__(self, index: int):
        '''The __getitem__ magic method of the class.
            
        Args:
            index (int): The index signifying the desired element.
            
        Returns:
            A string representing the combination of the original token and annotation.
        '''
        token = self.tokens[index]
        annotation = self.gold_standard[index]
        item = '_'.join([token, annotation])
            
        return item
    
    def frequencies(self):
        '''A method of the class indended for displaying raw and relative frequencies of word classes in the annotation.
        
        Returns:
            A dataframe representing the POS tag, raw frequency, relative frequency.
        '''
        freqs = []
        for item in list(set(self.gold_standard)):
            raw = self.gold_standard.count(item)
            relative = raw / len(self.gold_standard)
            
            freqs.append([item, raw, relative])
            
        freq_pd = pd.DataFrame(freqs, columns=['POS', 'raw', 'relative']).sort_values('relative', ascending=False).set_index('POS')
            
        return freq_pd

In [193]:
# conllu format
# blank lines for sent boundaries, comment lines with hash
# ID - word index, can be a range for multiword tokens, starts at 1
# FORM - word form
# LEMMA - word lemma
# UPOS - UPOS tag
# XPOS - XPOS tag
# FEATS - morphological features
# HEAD 
# DEPREL
# DEPS
# MISC
# use _ if empty
# fields are tab separated

In [194]:
class ConlluWord():
    '''A class intended to represent a line in a conll file (1 word, respectively)
    
    Attributes:
        ID (str): The index number of the word.
        FORM (str): The word as it appears in the sentence.
        LEMMA (str): The lemma of the word.
        UPOS (str): Universal Dependencies' universal part of speach tag.
        XPOS (str): Language-specific part of speech tag.
        FEATS (str): Features. Empty by default.
        HEAD (str): Signifies which word is the head of this one. Empty by default.
        DEPREL (str): Represents the dependency relations. Empty by default.
        DEPS (str): The dependencies of this word. Empty by default.
        MISC (str): Miscellaneous information. Empty by default.
        
    '''
    def __init__(
        self, ID: str, FORM: str, LEMMA: str, UPOS: str, XPOS: str, FEATS: str = '_', 
        HEAD: str = '_', DEPREL: str = '_', DEPS: str = '_', MISC: str = '_'
    ):
        '''The __init__ method of the class.
        Assigns the values to the ConLLu tags.
        
        Args:
            ID (str): The index number of the word.
            FORM (str): The word as it appears in the sentence.
            LEMMA (str): The lemma of the word.
            UPOS (str): Universal Dependencies' universal part of speach tag.
            XPOS (str): Language-specific part of speech tag.
            FEATS (str): Features. Empty by default.
            HEAD (str): Signifies which word is the head of this one. Empty by default.
            DEPREL (str): Represents the dependency relations. Empty by default.
            DEPS (str): The dependencies of this word. Empty by default.
            MISC (str): Miscellaneous information. Empty by default.
        '''
        self.ID = ID
        self.FORM = FORM
        self.LEMMA = LEMMA
        self.UPOS = UPOS
        self.XPOS = XPOS
        self.FEATS = FEATS
        self.HEAD = HEAD
        self.DEPREL = DEPREL
        self.DEPS = DEPS
        self.MISC = MISC
        
    def return_line(self):
        '''A method of the class that returns all the tags in the form of a tab-separated string, as per the ConLLu format.
        '''
        elements = [
            self.ID, self.FORM, self.LEMMA, self.UPOS, self.XPOS, self.FEATS, self.HEAD, 
            self.DEPREL, self.DEPS, self.MISC]
        line = "\t".join(elements)
        return line

In [261]:
class ConlluFormatter():
    '''A class intended to create a representation of the input text in ConLLu format using pre-annotated tags as well as
    annotation from Morfeusz and Concraft.
    
    Attributes:
        all_conll_sents (list[list]): A list of lists representing the sentences with their annotation stored.        
    '''
    def __init__(
        self, 
        sents: list, 
        tokenized: list, 
        anns: list, 
        morfeusz=morfeusz, 
        concraft=concraft
    ):
        '''The __init__ method of the class.
        Creates a list for every sentence in the input that contains annotations for every word using the ConlluWord class.
        
        Args:
            sents (list[str]): A list of sentences as strings.
            tokenized (list[list[str]]): A list of tokenized sentences (in the same order as in sents).
            anns (list[list[str]]): A list of annotations corresponding to the words in tokenized.
            morfeusz (Morfeusz): a Morfeusz object that will be used for morphological analysis of the sentences.
            concraft (Concraft): a Concraft object that will be used for morphological disambiguation and annotation.
        '''
        # sanity check
        if len(sents)!=len(tokenized) or len(sents)!=len(anns):
            print('Input lists not of equal length.')
            return
        
        self.all_conll_sents = []
        
        for i, sent in enumerate(sents):
            conll_sent = []
            
            dag = morfeusz.analyse(sent)
            dag_disamb = concraft.disamb(dag)
            best_inter = best_interpretation(dag_disamb)
            
            ann = anns[i]
            tokens = tokenized[i]
            
            offset = 0
            
            for j, inter in enumerate(best_inter):
                idx = str(j + 1)
                form = inter[2][0]
                if form == "ś":
                    if best_inter[j-1][2][0].endswith('ś'):
                        offset += -1
                        continue
                lemma = inter[2][1].split(':')[0]
                try:
                    if tokens[j+offset] == form:
                        upos = ann[j+offset]      
                    elif tokens[j+offset] + tokens[j+offset+1] == form:
                        upos = ann[j+offset]
                        offset += 1
                    else:    
                        upos = '_'
                except IndexError:
                    continue
       
                xpos = inter[2][2]
                
                word = ConlluWord(idx, form, lemma, upos, xpos)
                conll_sent.append(word)
            
            self.all_conll_sents.append(conll_sent)
            
    def print_conll(self):
        continue
        # need to still add sent id and sent text above
        
    def write_conll_2_file(self, filename: str):
        continue
        # will need to utilize print_conll  

### TESTING AROUND

In [262]:
original_text = OriginalAnnotations(file, lowercase=False, nested=False)
#print(original_text.sentences)

In [263]:
formatter = ConlluFormatter(original_text.sentences[80:], original_text.sentences_tokenized[80:], original_text.gold_standard_tokenized[80:])

In [264]:
formatter.all_conll_sents


[[<__main__.ConlluWord at 0x7f88e23b1c40>,
  <__main__.ConlluWord at 0x7f88e3de0850>,
  <__main__.ConlluWord at 0x7f8911d7c280>,
  <__main__.ConlluWord at 0x7f8911934040>,
  <__main__.ConlluWord at 0x7f88e1c89880>,
  <__main__.ConlluWord at 0x7f88e1c89e80>,
  <__main__.ConlluWord at 0x7f88e1c89610>,
  <__main__.ConlluWord at 0x7f88e1c89730>,
  <__main__.ConlluWord at 0x7f88e1c891f0>,
  <__main__.ConlluWord at 0x7f8911d68f40>,
  <__main__.ConlluWord at 0x7f89705864f0>,
  <__main__.ConlluWord at 0x7f8970586df0>,
  <__main__.ConlluWord at 0x7f8970516fd0>,
  <__main__.ConlluWord at 0x7f8911b22d30>,
  <__main__.ConlluWord at 0x7f8911d6b730>,
  <__main__.ConlluWord at 0x7f8911d6b130>,
  <__main__.ConlluWord at 0x7f88e2272730>,
  <__main__.ConlluWord at 0x7f88e2272880>,
  <__main__.ConlluWord at 0x7f88e2272c70>,
  <__main__.ConlluWord at 0x7f88e2272280>,
  <__main__.ConlluWord at 0x7f88e22727f0>,
  <__main__.ConlluWord at 0x7f88e2272550>,
  <__main__.ConlluWord at 0x7f88e2272640>,
  <__main__

In [260]:
            print_toggle = False
            for word in all_conll_sents[-1]:
                if word.UPOS == "_":
                    print_toggle = True
            if print_toggle:
                print()
                print(i)
                for word in all_conll_sents[-1]:
                    print(word.return_line())
                
            #if len(best_inter) != len(tokenized[i]):
                #print(i)
                print([x[2][0] for x in best_inter])
                print(tokenized[i])
                print(anns[i])
                #print(len(best_inter))
                #print(len(tokenized[i]))
                #print(len(anns[i]))
                #for word in all_conll_sents[-1]:
                    #print(word.return_line())

NameError: name 'all_conll_sents' is not defined

### EXECUTION

In [252]:
dag = morfeusz.analyse(original_text.sentences[312])
dag_disamb = concraft.disamb(dag)

In [253]:
best_interpretation(dag_disamb)

[(0,
  1,
  ('Przytoczył', 'przytoczyć', 'praet:sg:m1:perf', [], []),
  '1.0000',
  None,
  'disamb'),
 (1,
  2,
  ('em', 'być', 'aglt:sg:pri:imperf:wok', [], []),
  '1.0000',
  None,
  'disamb'),
 (2, 3, ('tu', 'tu', 'adv', [], []), '1.0000', None, 'disamb'),
 (3, 4, ('to', 'ten', 'adj:sg:acc:n:pos', [], []), '0.9695', None, 'disamb'),
 (4,
  5,
  ('zdarzenie', 'zdarzenie', 'subst:sg:acc:n:ncol', ['nazwa_pospolita'], []),
  '0.9701',
  None,
  'disamb'),
 (5, 6, (',', ',', 'interp', [], []), '1.0000', None, 'disamb'),
 (6, 7, ('żeby', 'żeby:M', 'comp', [], []), '1.0000', None, 'disamb'),
 (7, 8, ('mieć', 'mieć', 'inf:imperf', [], []), '1.0000', None, 'disamb'),
 (8,
  9,
  ('wyobrażenie',
   'wyobrażenie',
   'subst:sg:acc:n:ncol',
   ['nazwa_pospolita'],
   []),
  '0.8900',
  None,
  'disamb'),
 (9, 10, ('jak', 'jak:C', 'conj', [], []), '0.3542', None, None),
 (10,
  11,
  ('znaczny', 'znaczny', 'adj:sg:nom:m3:pos', [], []),
  '0.9452',
  None,
  'disamb'),
 (11,
  12,
  ('zasiłek', 