# STANZA LEMMATIZATION AND POS TAGGING

### IMPORTS, VARIABLES

In [1]:
import conllu
import stanza
import sklearn.metrics
import pandas as pd
import matplotlib.pyplot as plt
import time
from tqdm import tqdm

# display every row of a DataFrame instead of truncating long tables
pd.set_option('display.max_rows', None)

In [2]:
# paths to the gold-standard CoNLL-U files, relative to this notebook
file_xpos = '../data/memoirs_3k_corrected.conllu'  # source of the XPOS tags and lemmas in the historical section
file_upos = '../data/memoirs_10k_corrected.conllu'  # source of the UPOS tags in the historical section
test_file = '../data/ud-treebanks/UD_Polish-PDB/pl_pdb-ud-test.conllu'  # data evaluated in the modern section

### FUNCTIONS

In [3]:
def get_stanza_anns(sentences: list, processors: str, tag_type: str):
    '''A function that annotates pre-tokenized sentences with Stanza and collects one field per token.
    
    Args:
        sentences (list): A list of tokenized sentences, each of them a list of token strings.
        processors (str): The comma-separated Stanza processors to run, passed unchanged to stanza.Pipeline
            (e.g. 'tokenize,mwt,lemma').
        tag_type (str): The key read from every token dictionary returned by Stanza (e.g. 'lemma', 'upos', 'xpos').
        
    Returns:
        A list of lists: for every input sentence, the tag_type values of its tokens, in order.
    '''
    # defining the stanza pipeline (Polish models, the input is already tokenized)
    nlp = stanza.Pipeline(lang='pl', processors=processors, tokenize_pretokenized=True)
    # getting stanza annotations
    annotations = []
    for sent in tqdm(sentences, desc='Retrieving annotations per sentence...'):
        if not sent:
            # nothing to annotate; keep the output aligned with the input
            annotations.append([])
            continue
        # The sentence is handed over as a list of tokens (a one-sentence document) rather than
        # as a space-joined string: a joined string is split on whitespace again by Stanza, so a
        # token that itself contains whitespace would become several tokens and shift every
        # following prediction against the gold standard.
        pred = nlp([sent])
        # the document holds exactly one sentence, hence index 0
        annotations.append([entry[tag_type] for entry in pred.to_dict()[0]])
            
    return annotations

In [4]:
def remove_ranges(tokens: list):
    '''A function that removes the "range" elements in a list based off of a conllu file. This is so as to
    exclude situations where in the list of tokens in a sentence one gets both "zrobiłem" and "zrobił" + "em".
    
    Args:
        tokens (list[str]): A list of space-separated token-tag pairs.

    Returns:
        A list of token-tag pairs from which the elements without a tag (with "_" in its place) are excluded.
    '''
    # Only a pair whose tag is exactly "_" is dropped. A plain substring test for " _" would also
    # discard real pairs whose tag merely starts with an underscore (e.g. "_x _x").
    # rstrip() keeps pairs that carry trailing whitespace matching as before.
    return [x for x in tokens if not x.rstrip().endswith(' _')]

In [5]:
def extract_conllu_data(filename: str, feature: str, sentences: bool = True, combined: bool = False, fulltext: bool = True):
    '''A function that allows for the extraction of the desired data from a conllu file, structured into sentences or not.
    
    Args:
        filename (str): The name of the .conllu file to be read.
        feature (str): The name of the desired conllu format feature.
        sentences (bool): Whether or not the output should be a list of lists of strings representing words in separate sentences.
        combined (bool): Whether or not the tokens and tags should be returned in one list of space-separated elements.
        fulltext (bool): Whether or not to extract and return the metadata sentences.
        
    Returns:
        Depending on the flags:
            combined=False, fulltext=True:  tokens, features, full sentences
            combined=False, fulltext=False: tokens, features
            combined=True,  fulltext=True:  "token feature" strings, full sentences
            combined=True,  fulltext=False: "token feature" strings
        In combined mode a missing feature value is written as "_".
        None is returned (after printing a message) when the feature name is not valid.
    '''
    # checking the validity of the feature argument
    possible_features = ['lemma', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc']
    if feature not in possible_features:
        print('Please specify a valid feature type.')
        return None
    
    # specifying lists
    tokens_features = []
    tokens = []
    features = []
    data = []
    
    # opening the file; the encoding is given explicitly because the platform default
    # is not UTF-8 everywhere and would garble or reject the Polish characters
    with open(filename, encoding='utf-8') as f:
        text = f.read()
    
    # parsing the file
    sents = conllu.parse(text)
    
    # selecting the relevant data and adding it to relevant lists
    for sentence in sents:
        if fulltext:
            data.append(sentence.metadata['text'])
        sent_tokens_features = []
        sent_tokens = []
        sent_features = []
        for entry in sentence:
            token = entry['form']
            feat = entry[feature]
            
            # the separate lists keep the raw value (possibly None)
            sent_tokens.append(token)
            sent_features.append(feat)
            
            if combined:  # this will return a different data structure
                sent_tokens_features.append(' '.join([token, '_' if feat is None else feat]))
            
        tokens.append(sent_tokens)
        features.append(sent_features)
        
        if combined:
            tokens_features.append(sent_tokens_features)
    
    # unravelling the sentence-level lists if needed
    if not sentences:
        tokens = [x for sentence in tokens for x in sentence]
        features = [x for sentence in features for x in sentence]
        if combined:
            tokens_features = [x for sentence in tokens_features for x in sentence]
            
    if combined:
        if fulltext:
            return tokens_features, data
        return tokens_features
    
    if fulltext:
        return tokens, features, data
    return tokens, features

In [6]:
def get_measures(gold_standard: list, predictions: list, labels: list = None, matrix: bool = False, details: bool = False):
    '''A function intended for retrieving a selection of evaluation measures for comparing the gold standard and the tagger
    annotations. The measures are printed out and include accuracy, weighted precision, recall and F1, and Matthew's
    Correlation Coefficient; optionally also per-class precision and recall, and a confusion matrix which is plotted and
    saved locally. These measures are calculated using functions from sklearn and pyplot.
    
    Args:
        gold_standard (list[str]): A list (or list of lists) of gold standard labels.
        predictions (list[str]): A list (or list of lists) of predicted labels.
        labels (list[str]): A list (or list of lists) of labels (if it needs to be specified). When it is not given,
            all labels occurring in the gold standard or in the predictions are used, sorted.
        matrix (bool): Whether or not to produce a confusion matrix. The plot is saved in the working directory as
            "<timestamp>confusion_matrix.jpg".
        details (bool): Whether or not to print precision and recall for every label.
    '''
    
    if isinstance(gold_standard[0], list):
        gold_standard = [x for sentence in gold_standard for x in sentence]
    if isinstance(predictions[0], list):
        predictions = [x for sentence in predictions for x in sentence]

    if not labels:
        # The predicted labels are included as well: this is the label set sklearn derives on its own,
        # so a label that was predicted but never occurs in the gold standard is neither hidden in the
        # per-class report nor dropped from the confusion matrix.
        labels = sorted(set(gold_standard) | set(predictions))
    elif isinstance(labels[0], list):
        labels = [x for sentence in labels for x in sentence]

    # printing out the measures
    accuracy = sklearn.metrics.accuracy_score(gold_standard, predictions)
    precision_weighted = sklearn.metrics.precision_score(gold_standard, predictions, average="weighted", zero_division=0)
    recall_weighted = sklearn.metrics.recall_score(gold_standard, predictions, average="weighted", zero_division=0)
    f1_weighted = sklearn.metrics.f1_score(gold_standard, predictions, average="weighted", zero_division=0)
    mcc = sklearn.metrics.matthews_corrcoef(gold_standard, predictions)
    print('MEASURES:')
    print(f'Accuracy: {accuracy:.2%}')
    print(f'Precision (weighted): {precision_weighted:.2%}')
    print(f'Recall (weighted): {recall_weighted:.2%}')
    print(f'F1 (weighted): {f1_weighted:.2%}')
    print(f'Matthew\'s Correlation Coefficient: {mcc:.2%}')
    if details:
        print()
        print('MEASURES PER CLASS:')
        precision = sklearn.metrics.precision_score(gold_standard, predictions, average=None, labels=labels, zero_division=0)
        print('Precision:')
        for label, value in zip(labels, precision):
            print(f'\t{label}: {value:.2%}')
        recall = sklearn.metrics.recall_score(gold_standard, predictions, average=None, labels=labels, zero_division=0)
        print('Recall:')
        for label, value in zip(labels, recall):
            print(f'\t{label}: {value:.2%}')
        print()
    
    # printing out and saving the confusion matrix
    if matrix:
        print('Confusion matrix:')
        # the matrix is built over the same label list that annotates its axes; without labels=
        # sklearn picks its own label set, which can differ in size or order from display_labels
        cm = sklearn.metrics.confusion_matrix(gold_standard, predictions, labels=labels)
        display = sklearn.metrics.ConfusionMatrixDisplay(cm, display_labels=labels)
        _, ax = plt.subplots(figsize=(12,12))
        display.plot(ax=ax)
        
        timestr = time.strftime("%Y%m%d-%H%M%S")
        plt.savefig(timestr + "confusion_matrix.jpg")

In [7]:
def get_comparison(standard: list, predictions: list, tokens: list, confidence=None):
    '''A function that returns a comparison of where mistakes were made during annotation.
    
    Args:
        standard (list): A list (or list of lists) of gold standard annotations.
        predictions (list): A list (or list of lists) of predicted annotations.
        tokens (list): A list (or list of lists) of original tokens corresponding to the tags.
        confidence (list): A list (or list of lists) of prediction confidences, if available; none by default.
    
    Returns:
        A Pandas dataframe with one row per mismatched annotation: the token, its context (the preceding token,
        the token itself and the following token, joined by spaces), the gold standard, the prediction and, if
        confidences were given, the confidence. The context is taken from the flattened token list, so it may
        reach into a neighbouring sentence. With empty input an empty dataframe is returned.
    '''

    # flattening sentence-level inputs; the emptiness check keeps empty input from raising an IndexError
    if standard and isinstance(standard[0], list):
        standard = [x for sentence in standard for x in sentence]
    if predictions and isinstance(predictions[0], list):
        predictions = [x for sentence in predictions for x in sentence]
    if tokens and isinstance(tokens[0], list):
        tokens = [x for sentence in tokens for x in sentence]
    # done once up front rather than on every mismatch inside the loop
    if confidence and isinstance(confidence[0], list):
        confidence = [x for sentence in confidence for x in sentence]
    with_confidence = bool(confidence)

    columns = ['Token', 'Context', 'Gold Standard', 'Prediction']
    if with_confidence:
        columns.append('Confidence')
    
    last_index = len(tokens) - 1
    problematic = []
    for i, ann in enumerate(predictions):
        if standard[i] == ann:
            continue
        # the first and the last token have no neighbour on one side
        preceding = tokens[i-1] if i != 0 else ''
        succeeding = tokens[i+1] if i != last_index else ''
        row = (tokens[i], ' '.join([preceding, tokens[i], succeeding]), standard[i], ann)
        if with_confidence:
            row += (confidence[i],)
        problematic.append(row)
    
    return pd.DataFrame(problematic, columns=columns)

In [8]:
def get_lemma_measures(standard: list, predictions: list, lowercase: bool = False):
    '''A function that prints out the token-level accuracy of predicted lemmas against the gold standard.
    
    Args:
        standard (list): A list of lists (one per sentence) of gold standard lemmas.
        predictions (list): A list of lists (one per sentence) of predicted lemmas.
        lowercase (bool): Whether or not both sides should be lowercased before they are compared.
    '''
    # collapsing the sentence-level lists into one flat list per side
    gold_flat = []
    for sentence in standard:
        gold_flat.extend(sentence)
    predicted_flat = []
    for sentence in predictions:
        predicted_flat.extend(sentence)

    if lowercase:
        gold_flat = [lemma.lower() for lemma in gold_flat]
        predicted_flat = [lemma.lower() for lemma in predicted_flat]

    accuracy = sklearn.metrics.accuracy_score(gold_flat, predicted_flat)
    print(f'Accuracy: {accuracy:.2%}')

In [9]:
def get_lemma_comparison(standard: list, predictions: list, tokens: list, lowercase: bool = False):
    '''A function that collects the tokens whose predicted lemma differs from the gold standard lemma.
    
    Args:
        standard (list): A list of lists of gold standard lemmas.
        predictions (list): A list of lists of predicted lemmas.
        tokens (list): A list of lists of the tokens to be lemmatized.
        lowercase (bool): Whether or not standard and predictions should be lowercased before the comparison.
    
    Returns:
        A Pandas dataframe containing the mismatched lemmas, as produced by get_comparison.
    '''
    # collapsing the sentence-level lists into flat ones
    flat_tokens = []
    for sentence in tokens:
        flat_tokens.extend(sentence)
    gold_flat = []
    for sentence in standard:
        gold_flat.extend(sentence)
    predicted_flat = []
    for sentence in predictions:
        predicted_flat.extend(sentence)

    if lowercase:
        gold_flat = [lemma.lower() for lemma in gold_flat]
        predicted_flat = [lemma.lower() for lemma in predicted_flat]

    return get_comparison(gold_flat, predicted_flat, flat_tokens)

In [10]:
def make_tagger_friendly(tokens_tags):
    '''A function that applies remove_ranges and then split_tags_and_tokens to every sentence of a nested list.
    
    Arguments:
        tokens_tags (list[list]): A list of lists representing sentences with annotations.
        
    Returns:
        Two separate lists of lists representing sentences and their annotations respectively.'''
    # one (tokens, tags) pair per sentence, in the original sentence order
    per_sentence = [split_tags_and_tokens(remove_ranges(sentence)) for sentence in tokens_tags]
    tokens = [pair[0] for pair in per_sentence]
    tags = [pair[1] for pair in per_sentence]

    return tokens, tags

In [11]:
def split_tags_and_tokens(tags: list):
    '''A function that splits every entry of a list on whitespace and distributes the parts over two lists.
    
    Args:
        tags (list): A list where every entry is a string containing whitespace.
        
    Returns:
        Two lists: the first whitespace-separated element of every entry, and the remaining elements of
        that entry joined by single spaces. Entries that are at most one character long once stripped
        are left out of both lists.
    '''
    first_parts = []
    remainders = []
    for entry in tags:
        stripped = entry.strip()
        if len(stripped) <= 1:
            continue
        head, *rest = stripped.split()
        first_parts.append(head)
        remainders.append(' '.join(rest))

    return first_parts, remainders

In [12]:
def get_full_table(standard: list, predictions: list, tokens: list, confidence=None, lowercase: bool = False):
    '''A function that returns a table of all the tokens with their predictions, gold standard, and context.
    
    Args:
        standard (list): A list (or list of lists) of gold standard annotations.
        predictions (list): A list (or list of lists) of predicted annotations.
        tokens (list): A list (or list of lists) of original tokens corresponding to the tags.
        confidence (list): A list (or list of lists) of prediction confidences, if available; none by default.
        lowercase (bool): Whether or not standard and predictions should be lowercased.
    
    Returns:
        A Pandas dataframe with one row per token (matching or not): the token, its context (the preceding token,
        the token itself and the following token, joined by spaces), the gold standard, the prediction and, if
        confidences were given, the confidence. The context is taken from the flattened token list, so it may
        reach into a neighbouring sentence. With empty input an empty dataframe is returned.
    '''

    # flattening sentence-level inputs; the emptiness check keeps empty input from raising an IndexError
    if standard and isinstance(standard[0], list):
        standard = [x for sentence in standard for x in sentence]
    if predictions and isinstance(predictions[0], list):
        predictions = [x for sentence in predictions for x in sentence]
    if tokens and isinstance(tokens[0], list):
        tokens = [x for sentence in tokens for x in sentence]
    # done once up front rather than on every row inside the loop
    if confidence and isinstance(confidence[0], list):
        confidence = [x for sentence in confidence for x in sentence]
    with_confidence = bool(confidence)

    if lowercase:
        standard = [x.lower() for x in standard]
        predictions = [x.lower() for x in predictions]

    columns = ['Token', 'Context', 'Gold Standard', 'Prediction']
    if with_confidence:
        columns.append('Confidence')
    
    last_index = len(tokens) - 1
    all_entries = []
    for i, ann in enumerate(predictions):
        # the first and the last token have no neighbour on one side
        preceding = tokens[i-1] if i != 0 else ''
        succeeding = tokens[i+1] if i != last_index else ''
        row = (tokens[i], ' '.join([preceding, tokens[i], succeeding]), standard[i], ann)
        if with_confidence:
            row += (confidence[i],)
        all_entries.append(row)
    
    return pd.DataFrame(all_entries, columns=columns)

### EXECUTION - MODERN

In [13]:
# reading the test file once per feature as sentence-level "token feature" strings;
# the second return value (the full-text sentences) is not needed here
test_tokens_upos, _ = extract_conllu_data(test_file, 'upos', sentences=True, combined=True)
test_tokens_xpos, _ = extract_conllu_data(test_file, 'xpos', sentences=True, combined=True)
test_tokens_lemmas, _ = extract_conllu_data(test_file, 'lemma', sentences=True, combined=True)

# transforming it to a tagging-friendly format
# (the token lists of the second and third call are discarded; only the first one is kept)
test_tokens, test_upos = make_tagger_friendly(test_tokens_upos)
_, test_xpos = make_tagger_friendly(test_tokens_xpos)
_, test_lemmas = make_tagger_friendly(test_tokens_lemmas)

In [14]:
# predicting lemmas for the test sentences
test_lemma_annotations = get_stanza_anns(test_tokens, 'tokenize,mwt,lemma', 'lemma')

2023-06-14 14:46:58 INFO: Loading these models for language: pl (Polish):
| Processor | Package |
-----------------------
| tokenize  | pdb     |
| mwt       | pdb     |
| lemma     | pdb     |

2023-06-14 14:46:58 INFO: Use device: cpu
2023-06-14 14:46:58 INFO: Loading: tokenize
2023-06-14 14:46:58 INFO: Loading: mwt
2023-06-14 14:46:58 INFO: Loading: lemma
2023-06-14 14:46:58 INFO: Done loading processors!
Retrieving annotations per sentence...: 100%|███████████████████████████████████████| 2215/2215 [00:23<00:00, 95.95it/s]


In [15]:
# predicting UPOS tags for the test sentences
test_upos_annotations = get_stanza_anns(test_tokens, 'tokenize,mwt,pos', 'upos')

2023-06-14 14:47:21 INFO: Loading these models for language: pl (Polish):
| Processor | Package |
-----------------------
| tokenize  | pdb     |
| mwt       | pdb     |
| pos       | pdb     |

2023-06-14 14:47:21 INFO: Use device: cpu
2023-06-14 14:47:21 INFO: Loading: tokenize
2023-06-14 14:47:21 INFO: Loading: mwt
2023-06-14 14:47:22 INFO: Loading: pos
2023-06-14 14:47:22 INFO: Done loading processors!
Retrieving annotations per sentence...: 100%|███████████████████████████████████████| 2215/2215 [01:20<00:00, 27.42it/s]


In [16]:
# predicting XPOS tags for the test sentences
# NOTE(review): same processors as the UPOS run above, so the data is processed a second time
test_xpos_annotations = get_stanza_anns(test_tokens, 'tokenize,mwt,pos', 'xpos')

2023-06-14 14:48:43 INFO: Loading these models for language: pl (Polish):
| Processor | Package |
-----------------------
| tokenize  | pdb     |
| mwt       | pdb     |
| pos       | pdb     |

2023-06-14 14:48:43 INFO: Use device: cpu
2023-06-14 14:48:43 INFO: Loading: tokenize
2023-06-14 14:48:43 INFO: Loading: mwt
2023-06-14 14:48:43 INFO: Loading: pos
2023-06-14 14:48:43 INFO: Done loading processors!
Retrieving annotations per sentence...: 100%|███████████████████████████████████████| 2215/2215 [01:16<00:00, 28.81it/s]


In [17]:
# lemma accuracy on the test data, case-sensitive
get_lemma_measures(test_lemmas, test_lemma_annotations)

Accuracy: 90.89%


In [18]:
# lemma accuracy on the test data with both sides lowercased
get_lemma_measures(test_lemmas, test_lemma_annotations, lowercase=True)

Accuracy: 92.34%


In [19]:
# UPOS measures on the test data, including per-class precision and recall
get_measures(test_upos, test_upos_annotations, details=True)

MEASURES:
Accuracy: 98.40%
Precision (weighted): 98.41%
Recall (weighted): 98.40%
F1 (weighted): 98.40%
Matthew's Correlation Coefficient: 98.16%

MEASURES PER CLASS:
Precision:
	ADJ: 98.17%
	ADP: 99.46%
	ADV: 94.58%
	AUX: 95.44%
	CCONJ: 95.47%
	DET: 98.00%
	INTJ: 100.00%
	NOUN: 99.17%
	NUM: 98.48%
	PART: 95.01%
	PRON: 98.63%
	PROPN: 94.14%
	PUNCT: 99.95%
	SCONJ: 95.86%
	SYM: 100.00%
	VERB: 99.20%
	X: 93.53%
Recall:
	ADJ: 98.99%
	ADP: 99.91%
	ADV: 96.06%
	AUX: 97.14%
	CCONJ: 96.17%
	DET: 98.47%
	INTJ: 50.00%
	NOUN: 98.70%
	NUM: 98.11%
	PART: 90.97%
	PRON: 98.87%
	PROPN: 96.51%
	PUNCT: 99.95%
	SCONJ: 94.61%
	SYM: 25.00%
	VERB: 98.66%
	X: 93.53%



In [20]:
# XPOS measures on the test data (overall figures only)
get_measures(test_xpos, test_xpos_annotations)

MEASURES:
Accuracy: 94.29%
Precision (weighted): 94.25%
Recall (weighted): 94.29%
F1 (weighted): 94.09%
Matthew's Correlation Coefficient: 94.05%


### EXECUTION - HISTORICAL

In [21]:
# reading the annotated files as sentence-level "token feature" strings:
# UPOS comes from file_upos, XPOS and lemmas come from file_xpos
tokens_upos, _ = extract_conllu_data(file_upos, 'upos', sentences=True, combined=True)
tokens_xpos, _ = extract_conllu_data(file_xpos, 'xpos', sentences=True, combined=True)
tokens_lemmas, _ = extract_conllu_data(file_xpos, 'lemma', sentences=True, combined=True)

# splitting into tokens and gold tags; the two files yield separate token lists
# (tokens_10k from file_upos, tokens_3k from file_xpos)
tokens_10k, upos = make_tagger_friendly(tokens_upos)
tokens_3k, xpos = make_tagger_friendly(tokens_xpos)
_, lemmas = make_tagger_friendly(tokens_lemmas)

In [None]:
# predicting lemmas for the tokens read from file_xpos
lemma_annotations = get_stanza_anns(tokens_3k, 'tokenize,mwt,lemma', 'lemma')

2023-06-14 14:50:02 INFO: Loading these models for language: pl (Polish):
| Processor | Package |
-----------------------
| tokenize  | pdb     |
| mwt       | pdb     |
| lemma     | pdb     |

2023-06-14 14:50:02 INFO: Use device: cpu
2023-06-14 14:50:02 INFO: Loading: tokenize
2023-06-14 14:50:02 INFO: Loading: mwt
2023-06-14 14:50:02 INFO: Loading: lemma
2023-06-14 14:50:02 INFO: Done loading processors!
Retrieving annotations per sentence...:  72%|██████████████████████████████▎           | 83/115 [00:01<00:00, 53.80it/s]

In [None]:
# predicting UPOS tags for the tokens read from file_upos
upos_annotations = get_stanza_anns(tokens_10k, 'tokenize,mwt,pos', 'upos')

In [None]:
# predicting XPOS tags for the tokens read from file_xpos
xpos_annotations = get_stanza_anns(tokens_3k, 'tokenize,mwt,pos', 'xpos')

In [None]:
# lemma accuracy, case-sensitive
get_lemma_measures(lemmas, lemma_annotations)

In [None]:
# lemma accuracy with both sides lowercased
get_lemma_measures(lemmas, lemma_annotations, lowercase=True)

In [None]:
# exporting every token with its gold and predicted lemma, case-sensitive ...
full_lemmas = get_full_table(lemmas, lemma_annotations, tokens_3k)
full_lemmas.to_excel('../data/results/stanza_lemmas.xlsx')

# ... and once more with gold and predicted lemmas lowercased
full_lemmas_lowercase = get_full_table(lemmas, lemma_annotations, tokens_3k, lowercase=True)
full_lemmas_lowercase.to_excel('../data/results/stanza_lowercase_lemmas.xlsx')

In [None]:
# exporting only the mismatched lemmas, case-sensitive ...
comparison = get_lemma_comparison(lemmas, lemma_annotations, tokens_3k)
comparison.to_excel('../data/mistakes/stanza_lemma_mistakes.xlsx')

# ... and the mismatches that remain after lowercasing both sides
comparison_lowercase = get_lemma_comparison(lemmas, lemma_annotations, tokens_3k, lowercase=True)
comparison_lowercase.to_excel('../data/mistakes/stanza_lowercase_lemma_mistakes.xlsx')

In [None]:
# displaying the case-sensitive lemma mismatches
comparison

In [None]:
# displaying the lemma mismatches that remain after lowercasing
comparison_lowercase

In [None]:
# UPOS measures, including per-class precision and recall
get_measures(upos, upos_annotations, details=True)

In [None]:
# exporting every token with its gold and predicted UPOS tag
full_upos = get_full_table(upos, upos_annotations, tokens_10k)
full_upos.to_excel('../data/results/stanza_upos.xlsx')

In [None]:
# exporting only the tokens whose predicted UPOS tag differs from the gold standard
upos_comparison = get_comparison(upos, upos_annotations, tokens_10k)
upos_comparison.to_excel('../data/mistakes/stanza_UPOS_mistakes.xlsx')

In [None]:
# displaying the UPOS mismatches
upos_comparison

In [None]:
# XPOS measures (overall figures only)
get_measures(xpos, xpos_annotations)

In [None]:
# exporting every token with its gold and predicted XPOS tag
full_xpos = get_full_table(xpos, xpos_annotations, tokens_3k)
full_xpos.to_excel('../data/results/stanza_xpos.xlsx')

In [None]:
# exporting only the tokens whose predicted XPOS tag differs from the gold standard
xpos_comparison = get_comparison(xpos, xpos_annotations, tokens_3k)
xpos_comparison.to_excel('../data/mistakes/stanza_XPOS_mistakes.xlsx')

In [None]:
# displaying the XPOS mismatches
xpos_comparison