# MORFEUSZ & CONCRAFT - PREANNOTATION

### IMPORTS, VARIABLES

In [1]:
# https://github.com/kawu/concraft-pl
# https://github.com/kawu/concraft-pl/tree/master/bindings/python

In [2]:
import sys
import pandas as pd
import os
sys.path.append('../concraft-pl/bindings/python/')

In [3]:
from morfeusz2 import Morfeusz
from concraft_pl2 import Concraft, Server

# Build the Concraft server object from the SGJP model file.
# NOTE(review): presumably this launches the external concraft-pl process; confirm against the
# Python bindings. The captured output of this cell shows it failing with a PermissionError.
server = Server(model_path='../concraft-pl/model-SGJP.gz')
# Path of the annotated corpus; passed to OriginalAnnotations in the EXECUTION section.
file = '../data/memoirs_annotated_10k.txt'
# Path of the unannotated sentences. NOTE(review): defined here but not passed to
# OriginalAnnotations anywhere in the visible cells, so sentences are reconstructed instead.
original_file = '../data/memoirs_10k.txt'

PermissionError: [Errno 13] Permission denied: 'concraft-pl'

In [None]:
# Module-level analyser and tagger instances. ConlluFormatter.__init__ uses these two names as
# default argument values, so this cell has to run before the class definition is executed.
morfeusz = Morfeusz(expand_tags=True)
concraft = Concraft()

### FUNCTIONS AND CLASSES

In [None]:
def best_interpretation(dag_disamb: list):
    '''Select the single most probable interpretation for every token position.

    The input is the disambiguated analysis returned by Morfeusz2 + Concraft. Empty-string
    entries in it are ignored. The input list itself is left untouched (it is filtered into
    a new list rather than modified in place).

    Every remaining item is indexed as ``item[0]`` (compared against the number of slots
    filled so far, presumably the start position of the edge) and ``item[3]`` (the value
    that is maximised, i.e. the probability). An item whose ``item[0]`` equals the number
    of slots filled so far opens a new slot; any other item competes with the most recently
    opened slot and replaces it only if its probability is strictly higher, so on a tie the
    earlier item is kept.

    Args:
        dag_disamb (list): A list of possible interpretations returned by Concraft based on
            Morfeusz2's analysis, possibly interleaved with empty strings.

    Returns:
        A list containing only the highest probability interpretation for every token.

    Raises:
        IndexError: If the first non-empty item does not open slot 0 (there is no earlier
            slot to compete with).
    '''
    # filtering into a fresh list keeps the caller's data intact and avoids repeated list.remove()
    candidates = [item for item in dag_disamb if item != ""]

    best_inter = []

    for item in candidates:
        if item[0] == len(best_inter):
            # first interpretation seen for the next token position
            best_inter.append(item)
        elif item[3] > best_inter[-1][3]:
            # a more probable alternative for the current position
            best_inter[-1] = item

    return best_inter

In [None]:
class OriginalAnnotations:
    '''A class intended to process and store the tokens and their respective annotations from the original text.

    The annotated file is expected to hold one sentence per line, with space-separated
    ``token_TAG`` items. Items that were written together in the original text are wrapped
    in square brackets, e.g. ``[Zrobił_VERB em_AUX]``.

    Attributes:
        tokens (list[str]): A list of all the tokens (without annotation) in the original data. Every element of the list is a string.
        gold_standard (list[str]): A list of all the original annotations (without tokens). Every element is a string.
        sentences (list[str]): A list of the original sentences (either truly original, if available, or reconstructed).
        sentences_tokenized (list[list]): A list of lists representing tokenized, unannotated sentences. Tokens inside a
            bracketed group (except the closing one) are stored together in a nested list.
        gold_standard_tokenized (list[list]): A list of lists representing the gold standard annotations per sentence,
            nested in the same way as sentences_tokenized.
        simple_sentences_tokenized (list[list[str]]): A list of lists representing the tokenized, unannotated sentences, with no regard
            as to whether the tokens were written together or not.
        simple_gold_standard_tokenized (list[list[str]]): A list of lists representing the gold standard annotations per sentence,
            with no regard as to whether the tokens were written together or not.
    '''
    def __init__(self, filename: str, lowercase: bool = False, original: str = ''):
        '''The __init__ method of the class.
        Constructs the token and tag lists.

        Blank lines (in both files) and empty items caused by repeated spaces are skipped.
        The tag is taken to be everything after the last underscore of an item, so tokens
        that themselves contain underscores are kept whole. Both files are read as UTF-8.

        Args:
            filename (str): The name of the file that the text and the annotations are to be obtained from.
            lowercase (bool): Determines whether the tokens should be lowercased or if original capitalization should be retained.
                Reconstructed sentences always keep the original capitalization.
            original (str): The name of the file containing the original unannotated sentences. If left empty or invalid,
                reconstructed sentences will be used.

        Raises:
            IndexError: If an item contains no underscore, i.e. carries no annotation.
        '''
        # opening the annotated text
        text = self._read_nonblank_lines(filename)

        # setting up attributes
        self.tokens = []
        self.gold_standard = []

        self.sentences = []

        self.sentences_tokenized = []
        self.gold_standard_tokenized = []

        self.simple_sentences_tokenized = []
        self.simple_gold_standard_tokenized = []

        # reading in unannotated sentences, if applicable
        if os.path.exists(original):
            self.sentences.extend(self._read_nonblank_lines(original))
        else:
            original = ''

        # splitting tokens and annotation
        for sentence in text:
            # temporary per-sentence storage
            temp_sent = []
            temp_anns = []
            # storage for the bracketed group that is currently open
            temp_conc = []
            temp_conc_ann = []
            # the sentence as a string, rebuilt from its tokens
            full_sent = ''
            # True while inside a bracketed group (tokens are glued together)
            concatenate = False
            token = tag = None

            for annotated_token in sentence.split(" "):
                if not annotated_token:
                    # produced by consecutive spaces; nothing to annotate
                    continue

                # the tag never contains an underscore, the token might
                split_token = annotated_token.rsplit('_', 1)

                if split_token[0].startswith('['):
                    split_token[0] = split_token[0][1:]
                    concatenate = True

                if split_token[-1].endswith(']'):
                    split_token[-1] = split_token[-1][:-1]
                    concatenate = False

                surface, tag = split_token[0], split_token[1]
                token = surface.lower() if lowercase else surface

                if concatenate:
                    full_sent += surface
                    temp_conc.append(token)
                    temp_conc_ann.append(tag)
                else:
                    full_sent += (surface + ' ')
                    # a group has just been closed: store it before its closing token
                    if temp_conc:
                        temp_sent.append(temp_conc)
                        temp_anns.append(temp_conc_ann)
                        temp_conc = []
                        temp_conc_ann = []
                    temp_sent.append(token)
                    temp_anns.append(tag)

                self.tokens.append(token)
                self.gold_standard.append(tag)

            # NOTE(review): the last token and tag of every sentence are appended a second time here,
            # exactly as before. This looks unintended, but ConlluFormatter's look-ahead
            # (tokens[j+1], tokens[j+2]) behaves differently without the extra element, so it is
            # kept until that alignment logic is revisited.
            if token is not None:
                temp_sent.append(token)
                temp_anns.append(tag)
            self.sentences_tokenized.append(temp_sent)
            self.gold_standard_tokenized.append(temp_anns)

            # reconstructing sentences if necessary
            if original == '':
                # removing unnecessary whitespace around punctuation
                for spaced, tight in (
                    (' .', '.'), (' ,', ','), (' !', '!'), (' ?', '?'), (' :', ':'),
                    ('„ ', '„'), (' ”', '”'), ('( ', '('), (' )', ')'),
                ):
                    full_sent = full_sent.replace(spaced, tight)
                self.sentences.append(full_sent.strip())

        # removing the "nested" lists from tokenized sentences and their annotations that represent
        # elements that should go together
        self.simple_sentences_tokenized = [self._flatten(sent) for sent in self.sentences_tokenized]
        self.simple_gold_standard_tokenized = [self._flatten(sent) for sent in self.gold_standard_tokenized]

    @staticmethod
    def _read_nonblank_lines(path: str):
        '''Reads a UTF-8 text file and returns its stripped, non-empty lines.'''
        with open(path, encoding='utf-8') as f:
            stripped = (line.strip() for line in f)
            return [line for line in stripped if line]

    @staticmethod
    def _flatten(items: list):
        '''Returns a copy of items in which every nested (non-string) element is replaced by its members.'''
        flat = []
        for item in items:
            if isinstance(item, str):
                flat.append(item)
            else:
                flat.extend(item)
        return flat

    def __len__(self):
        '''The __len__ magic method of the class.

        Returns:
            The length of self.tokens, which should be identical to the length of self.gold_standard.
        '''
        return len(self.tokens)

    def __getitem__(self, index: int):
        '''The __getitem__ magic method of the class.

        Args:
            index (int): The index signifying the desired element.

        Returns:
            A string representing the combination of the original token and annotation.
        '''
        token = self.tokens[index]
        annotation = self.gold_standard[index]
        item = '_'.join([token, annotation])

        return item

    def frequencies(self):
        '''A method of the class intended for displaying raw and relative frequencies of word classes in the annotation.

        Returns:
            A dataframe indexed by POS tag with the columns 'raw' and 'relative', sorted by descending relative frequency.
        '''
        # local import: keeps the notebook's import cell untouched
        from collections import Counter

        total = len(self.gold_standard)
        # one pass over the annotations instead of one list.count() call per tag
        freqs = [[tag, raw, raw / total] for tag, raw in Counter(self.gold_standard).items()]

        freq_pd = pd.DataFrame(freqs, columns=['POS', 'raw', 'relative']).sort_values('relative', ascending=False).set_index('POS')

        return freq_pd

In [None]:
class ConlluWord():
    '''A single word line of a CoNLL-U file, held as its ten columns.

    Attributes:
        ID (str): The index number of the word.
        FORM (str): The word as it appears in the sentence.
        LEMMA (str): The lemma of the word.
        UPOS (str): Universal Dependencies' universal part of speech tag.
        XPOS (str): Language-specific part of speech tag.
        FEATS (str): Features. '_' by default.
        HEAD (str): Signifies which word is the head of this one. '_' by default.
        DEPREL (str): Represents the dependency relations. '_' by default.
        DEPS (str): The dependencies of this word. '_' by default.
        MISC (str): Miscellaneous information. '_' by default.

    '''
    def __init__(
        self, ID: str, FORM: str, LEMMA: str, UPOS: str, XPOS: str, FEATS: str = '_', 
        HEAD: str = '_', DEPREL: str = '_', DEPS: str = '_', MISC: str = '_'
    ):
        '''Stores the ten column values on the instance under their CoNLL-U names.

        Args:
            ID (str): The index number of the word.
            FORM (str): The word as it appears in the sentence.
            LEMMA (str): The lemma of the word.
            UPOS (str): Universal Dependencies' universal part of speech tag.
            XPOS (str): Language-specific part of speech tag.
            FEATS (str): Features. '_' by default.
            HEAD (str): Signifies which word is the head of this one. '_' by default.
            DEPREL (str): Represents the dependency relations. '_' by default.
            DEPS (str): The dependencies of this word. '_' by default.
            MISC (str): Miscellaneous information. '_' by default.
        '''
        # morphological columns
        self.ID, self.FORM, self.LEMMA, self.UPOS, self.XPOS = ID, FORM, LEMMA, UPOS, XPOS
        # syntactic / miscellaneous columns
        self.FEATS, self.HEAD, self.DEPREL, self.DEPS, self.MISC = FEATS, HEAD, DEPREL, DEPS, MISC

    def return_line(self):
        '''Returns the ten columns as one tab-separated string, in CoNLL-U column order.
        '''
        columns = (
            self.ID, self.FORM, self.LEMMA, self.UPOS, self.XPOS,
            self.FEATS, self.HEAD, self.DEPREL, self.DEPS, self.MISC,
        )
        return "\t".join(columns)

In [None]:
class ConlluSentence():
    '''One sentence entry of a CoNLL-U file: two header lines followed by the word lines.

    Attributes:
        sent_id (str): The ID line of the sentence.
        sent (str): The text line of the sentence.
        words (list[ConlluWord]): The ConlluWord objects representing constituent words and their annotation.
    '''
    def __init__(self, sent_id: str, sent: str, words: list):
        '''Keeps the given header lines and word objects on the instance.

        Args:
            sent_id (str): The ID line of the sentence.
            sent (str): The text line of the sentence.
            words (list[ConlluWord]): The ConlluWord objects representing constituent words and their annotation.
        ''' 
        self.sent_id, self.sent, self.words = sent_id, sent, words

    def return_sent(self):
        '''Returns the entry as one string: ID line, text line, then one line per word, newline-separated.
        '''
        entry_lines = [self.sent_id, self.sent]
        entry_lines.extend(word.return_line() for word in self.words)
        return '\n'.join(entry_lines)

In [None]:
class ConlluFormatter():
    '''A class intended to create a representation of the input text in ConLLu format using pre-annotated tags as well as
    annotation from Morfeusz and Concraft.

    Attributes:
        all_conll_sents (list[ConlluSentence]): The converted sentences, in input order.
    '''
    def __init__(
        self, 
        annotations: OriginalAnnotations, 
        morfeusz=morfeusz, 
        concraft=concraft
    ):
        '''The __init__ method of the class.
        Creates a ConlluSentence for every sentence in the input, holding a ConlluWord for every word.

        Args:
            annotations (OriginalAnnotations): The source of the sentences (``sentences``), of their flat token lists
                (``simple_sentences_tokenized``) and of the manual tags (``simple_gold_standard_tokenized``).
            morfeusz (Morfeusz): a Morfeusz object that will be used for morphological analysis of the sentences.
                Defaults to the module-level instance that exists when the class is defined.
            concraft (Concraft): a Concraft object that will be used for morphological disambiguation and annotation.
                Defaults to the module-level instance that exists when the class is defined.
        '''
        self.all_conll_sents = []

        # retrieving the data sentence by sentence
        for i, sent in enumerate(annotations.sentences):
            # getting the Morfeusz + Concraft info
            dag = morfeusz.analyse(sent)
            dag_disamb = concraft.disamb(dag)
            best_inter = best_interpretation(dag_disamb)
            # retrieving the manual annotations as well as tokens corresponding to the annotation
            ann = annotations.simple_gold_standard_tokenized[i]
            tokens = annotations.simple_sentences_tokenized[i]

            conll_sent = self._build_words(best_inter, tokens, ann)
            self._insert_multiword_tokens(conll_sent)

            # creating a ConlluSentence object, appending it to the internal list of all sentences
            full_sent = ConlluSentence('# sent_id = ' + str(i+1), '# text = ' + sent, conll_sent)
            self.all_conll_sents.append(full_sent)

    @staticmethod
    def _build_words(best_inter: list, tokens: list, ann: list):
        '''Turns the machine interpretations of one sentence into ConlluWord objects tagged with the manual UPOS.

        Args:
            best_inter (list): One interpretation per token; ``item[2]`` is indexed as (form, lemma, tag, ...).
            tokens (list[str]): The manually tokenized sentence.
            ann (list[str]): The manual tags, parallel to tokens.

        Returns:
            The list of ConlluWord objects, with IDs running consecutively from 1.
        '''
        conll_sent = []
        # difference between the machine token index and the manual token index; grows when one machine
        # token covers several manual tokens and shrinks when a machine token is discarded
        offset = 0

        for j, inter in enumerate(best_inter):
            form = inter[2][0]
            # excluding mistakenly detected ś tokens (that were not even split from the preceding word);
            # j > 0 keeps best_inter[j-1] from wrapping around to the last token of the sentence
            if form == "ś" and j > 0 and best_inter[j-1][2][0].endswith('ś'):
                offset -= 1
                continue

            # the lemma may carry a ':'-separated suffix; a one-character lemma (e.g. ':') is kept as is
            raw_lemma = inter[2][1]
            lemma = raw_lemma.split(':')[0] if len(raw_lemma) > 1 else raw_lemma

            # retrieving the UPOS tag for the word from the manual annotation, updating the offset accordingly
            pos = j + offset
            try:
                if tokens[pos] == form:
                    upos = ann[pos]
                elif tokens[pos] + tokens[pos+1] == form:
                    # two manual tokens correspond to this machine token
                    upos = ann[pos]
                    offset += 1
                elif tokens[pos] + tokens[pos+1] + tokens[pos+2] == form:
                    # three manual tokens correspond to this machine token
                    upos = ann[pos]
                    offset += 2
                else:
                    upos = '_'
            except IndexError:
                # best effort: a word that cannot be aligned because the manual tokens ran out is left out
                continue

            # retrieving the XPOS tag
            xpos = inter[2][2]

            # lowercasing the lemmas to match the UD standard
            if upos != 'PROPN':
                lemma = lemma.lower()

            # numbering by position in the output keeps the IDs consecutive even when tokens were left out
            idx = str(len(conll_sent) + 1)
            conll_sent.append(ConlluWord(idx, form, lemma, upos, xpos))

        return conll_sent

    @staticmethod
    def _insert_multiword_tokens(conll_sent: list):
        '''Inserts, in place, a range line (e.g. ``3-4``) in front of every word that is followed by 'aglt' clitics.

        Only words whose XPOS starts with 'aglt' and whose UPOS is 'AUX' are treated as clitics of the
        preceding word. A second directly following word whose XPOS starts with 'aglt' is included in the
        same range. Ranges never overlap, and a clitic without a preceding word is ignored.

        Args:
            conll_sent (list[ConlluWord]): The words of one sentence; modified in place.
        '''
        tracker = []
        # index of the last word that already belongs to a range
        covered = -1
        for j, word in enumerate(conll_sent):
            if not (word.XPOS.startswith('aglt') and word.UPOS == 'AUX'):
                continue
            # the host word must exist (j > 0) and must not be part of an earlier range
            if j - 1 <= covered:
                continue
            last = j
            if j + 1 < len(conll_sent) and conll_sent[j+1].XPOS.startswith('aglt'):
                last = j + 1
            # the range is built from the words' own IDs, the surface form from their FORMs
            id_range = conll_sent[j-1].ID + '-' + conll_sent[last].ID
            surface = ''.join(part.FORM for part in conll_sent[j-1:last+1])
            tracker.append((j-1, id_range, surface))
            covered = last

        # inserting back to front so that the positions recorded above stay valid
        for position, id_range, surface in reversed(tracker):
            conll_sent.insert(position, ConlluWord(id_range, surface, '_', '_', '_'))

    def __len__(self):
        '''A method of the class that returns the length of the internal storage of ConLLu sentences.
        '''
        return len(self.all_conll_sents)

    def __getitem__(self, index: int):
        '''A method of the class that returns the transformed sentence at a given index.

        Args:
            index (int): The index of the desired element.
        '''
        return self.all_conll_sents[index]

    def print_item(self, index: int):
        '''A method of the class that prints out the sentence at a given index.

        Args:
            index (int): The index of the desired element.
        '''
        print(self.all_conll_sents[index].return_sent())

    def print_conllu(self):
        '''A method of the class intended for printing out all of the annotation in the ConLLu format.
        '''
        for sentence in self.all_conll_sents:
            print(sentence.return_sent())
            print('\n')

    def write_conllu_2_file(self, filename: str):
        '''A method of the class intended for saving all of the annotation in the ConLLu format.

        The file is written as UTF-8, with one blank line after every sentence.

        Args:
            filename (str): The name of the file the data should be saved to.
        '''
        with open(filename, 'w', encoding='utf-8') as f:
            for sentence in self.all_conll_sents:
                f.write(sentence.return_sent() + '\n\n')

### EXECUTION

In [None]:
# Parse the annotated corpus, keeping the original capitalization. No file with the original
# sentences is passed, so the sentences are reconstructed from the tokens.
original_text = OriginalAnnotations(file, lowercase=False)
#print(original_text.sentences)

In [None]:
# Convert every sentence; uses the module-level morfeusz and concraft instances as defaults.
formatter = ConlluFormatter(original_text)

In [None]:
# Print all converted sentences to the cell output for inspection.
formatter.print_conllu()

In [None]:
# Save the converted sentences to disk.
formatter.write_conllu_2_file('../data/memoirs_10k.conllu')