# STANZA LEMMATIZATION

### IMPORTS, VARIABLES

In [1]:
import conllu
import stanza
import sklearn.metrics
import pandas as pd

# Lift the row limit so the full table of lemma mismatches is shown when a
# DataFrame is displayed at the end of the notebook.
pd.set_option('display.max_rows', None)

In [2]:
# Path to the CoNLL-U file whose tokens and lemmas are read by extract_conllu_data below.
file = '../data/memoirs_3k_corrected.conllu'

In [3]:
from functions import extract_conllu_data

### FUNCTIONS

```def extract_conllu_data(filename: str, feature: str, sentences: bool = True):
    '''A function that allows for the extraction of the desired data from a conllu file, structured into sentences or not.
    
    Args:
        filename (str): The name of the .conllu file to be read.
        feature (str): The name of the desired conllu format feature.
        sentences (bool): Whether or not the output should be a list of lists of strings representing words in separate sentences.
        
    Returns:
        A list of the original tokens (tokenized sentences), a list of the corresponding features, and a list of full original 
        sentences.
    '''
    #checking the validity of the feature argument
    possible_features = ['lemma', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc']
    if feature not in possible_features:
        print('Please specify a valid feature type.')
        return
    
    # specifying lists
    tokens = []
    features = []
    data = []
    
    with open(filename) as f:
        text = f.read()
    
    sents = conllu.parse(text)
    
    for sentence in sents:
        data.append(sentence.metadata['text'])
        sent_tokens = []
        sent_features = []
        for entry in sentence:
            token = entry['form']
            feat = entry[feature]
            
            sent_tokens.append(token)
            sent_features.append(feat)
            
            
        tokens.append(sent_tokens)
        features.append(sent_features)
            
    if not sentences:
        tokens = [x for sentence in tokens for x in sentence]
        features = [x for sentence in features for x in sentence]
        
    return tokens, features, data```

In [4]:
def get_stanza_anns(sentences: list):
    '''A function that obtains Stanza lemmas for already tokenized sentences.

    Args:
        sentences (list): A list of tokenized sentences, each one a list of token strings.

    Returns:
        A list of lists of lemmas, one inner list per input sentence, in the same order.
        An empty input sentence yields an empty inner list.
    '''
    # nothing to annotate, so there is no need to load the models
    if not sentences:
        return []

    # defining the stanza pipeline; tokenize_pretokenized=True makes Stanza keep
    # the given tokens instead of running its own tokenizer
    # NOTE(review): no 'pos' processor is loaded; the lemmatizer may give better
    # results with POS tags available - confirm against the Stanza documentation.
    nlp = stanza.Pipeline(lang='pl', processors='tokenize,lemma', tokenize_pretokenized=True)

    # getting stanza annotations
    annotations = []
    for sent in sentences:
        if not sent:
            # keep the output aligned with the input without calling the pipeline
            annotations.append([])
            continue
        # The token list is passed as a one-sentence document. Joining the tokens
        # into a string would make Stanza split on whitespace again, so a token
        # that itself contains whitespace would shift all following lemmas.
        pred = nlp([sent])
        # getting out the lemmas of the single sentence in the document
        annotations.append([entry['lemma'] for entry in pred.to_dict()[0]])

    return annotations

In [5]:
def get_measures(standard: list, predictions: list):
    '''Print the token-level accuracy of the predicted lemmas as a percentage.

    Args:
        standard (list): A list of lists of gold standard lemmas.
        predictions (list): A list of lists of predicted lemmas.
    '''
    # sentence boundaries are irrelevant for the score, so both inputs are flattened
    gold_flat = []
    for gold_sentence in standard:
        gold_flat.extend(gold_sentence)

    predicted_flat = []
    for predicted_sentence in predictions:
        predicted_flat.extend(predicted_sentence)

    accuracy = sklearn.metrics.accuracy_score(gold_flat, predicted_flat)
    print(f'Accuracy: {accuracy:.2%}')

In [6]:
def get_comparison(standard: list, predictions: list):
    '''A function that collects the lemmas on which the predictions differ from the gold standard.

    Args:
        standard (list): A list of lists of gold standard lemmas.
        predictions (list): A list of lists of predicted lemmas.

    Returns:
        A Pandas dataframe with the columns 'Gold Standard' and 'Prediction', containing one
        row per mismatched lemma, in the order of occurrence.

    Raises:
        ValueError: If the two inputs do not contain the same total number of lemmas.
    '''
    standard = [x for sentence in standard for x in sentence]
    predictions = [x for sentence in predictions for x in sentence]

    # The lemmas are compared position by position, so a length mismatch means
    # the pairs are not aligned and the comparison would be meaningless.
    if len(standard) != len(predictions):
        raise ValueError(
            f'Inconsistent number of lemmas: {len(standard)} gold vs. {len(predictions)} predicted.'
        )

    problematic = [(gold, pred) for gold, pred in zip(standard, predictions) if gold != pred]

    return pd.DataFrame(problematic, columns=['Gold Standard', 'Prediction'])

### EXECUTION

In [7]:
# Read the word forms and gold lemmas, grouped per sentence; `data` holds the 'text' metadata of each sentence.
tokens, features, data = extract_conllu_data(file, 'lemma', sentences=True)

In [8]:
# Lemmatize the gold tokens with Stanza; the call builds the pipeline, hence the model-loading log below.
annotations = get_stanza_anns(tokens)

2023-03-09 10:46:02 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-03-09 10:46:03 INFO: Loading these models for language: pl (Polish):
| Processor | Package |
-----------------------
| tokenize  | pdb     |
| mwt       | pdb     |
| lemma     | pdb     |

2023-03-09 10:46:03 INFO: Use device: cpu
2023-03-09 10:46:03 INFO: Loading: tokenize
2023-03-09 10:46:03 INFO: Loading: mwt
2023-03-09 10:46:03 INFO: Loading: lemma
2023-03-09 10:46:03 INFO: Done loading processors!


In [9]:
# Print the accuracy of the Stanza lemmas against the gold standard lemmas.
get_measures(features, annotations)

Accuracy: 82.65%


In [10]:
# Collect every gold/predicted lemma pair that differs and write them to an
# Excel file next to the input data.
comparison = get_comparison(features, annotations)
comparison.to_excel('../data/stanza_mistakes.xlsx')

In [11]:
# Display the mismatches; display.max_rows was set to None in the first cell.
comparison

Unnamed: 0,Gold Standard,Prediction
0,Godziszewo,godziszewo
1,Rypnin,rypnin
2,dobrzyńska,dobrrzyńska
3,ś,być
4,p,pan
5,Melchior,melchior
6,dokumenty,wdokument
7,Ewa,ew
8,Piniński,piniński
9,dobra,dzbr
