# STANZA LEMMATIZATION

### IMPORTS, VARIABLES

In [1]:
import conllu
import stanza
import sklearn.metrics
import pandas as pd

# Show every row when a DataFrame is displayed instead of truncating long frames.
pd.set_option('display.max_rows', None)

In [2]:
# Input corpora in CoNLL-U format: `file` is evaluated in the
# "EXECUTION - HISTORICAL" section, `test_file` (UD Polish-PDB test split)
# in the "EXECUTION - MODERN" section.
file = '../data/memoirs_3k_corrected.conllu'
test_file = '../data/ud-treebanks/UD_Polish-PDB/pl_pdb-ud-test.conllu'

In [3]:
from functions import extract_conllu_data, get_comparison

### FUNCTIONS

```def extract_conllu_data(filename: str, feature: str, sentences: bool = True):
    '''A function that allows for the extraction of the desired data from a conllu file, structured into sentences or not.
    
    Args:
        filename (str): The name of the .conllu file to be read.
        feature (str): The name of the desired conllu format feature.
        sentences (bool): Whether or not the output should be a list of lists of strings representing words in separate sentences.
        
    Returns:
        A list of the original tokens (tokenized sentences), a list of the corresponding features, and a list of full original 
        sentences.
    '''
    #checking the validity of the feature argument
    possible_features = ['lemma', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc']
    if feature not in possible_features:
        print('Please specify a valid feature type.')
        return
    
    # specifying lists
    tokens = []
    features = []
    data = []
    
    with open(filename) as f:
        text = f.read()
    
    sents = conllu.parse(text)
    
    for sentence in sents:
        data.append(sentence.metadata['text'])
        sent_tokens = []
        sent_features = []
        for entry in sentence:
            token = entry['form']
            feat = entry[feature]
            
            sent_tokens.append(token)
            sent_features.append(feat)
            
            
        tokens.append(sent_tokens)
        features.append(sent_features)
            
    if not sentences:
        tokens = [x for sentence in tokens for x in sentence]
        features = [x for sentence in features for x in sentence]
        
    return tokens, features, data```

In [4]:
def get_stanza_anns(sentences: list, nlp=None):
    '''A function that obtains and processes Stanza lemmatization annotations.

    Args:
        sentences (list): A list of tokenized sentences, each a list of token strings.
        nlp: An optional, already constructed pipeline to reuse. It is called with
            a list containing one token list and must return an object whose
            `to_dict()` yields, per sentence, a list of dicts with a 'lemma' key.
            When omitted, a Polish Stanza pipeline for pre-tokenized input is built.

    Returns:
        A list of lists of lemmas, one inner list per input sentence and one
        lemma per input token.
    '''
    # defining the stanza pipeline (only when the caller did not supply one)
    if nlp is None:
        nlp = stanza.Pipeline(lang='pl', processors='tokenize,lemma', tokenize_pretokenized=True)

    # getting stanza annotations
    annotations = []
    for sent in sentences:
        if not sent:
            # nothing to lemmatize; keep the output aligned with the input
            annotations.append([])
            continue
        # The token list is handed over as-is rather than joined with spaces,
        # so a token that itself contains whitespace is not split again and
        # the predictions stay one-to-one with the gold tokens.
        pred = nlp([list(sent)])
        # getting out the lemmas
        annotations.append(
            [entry['lemma'] for sent_dict in pred.to_dict() for entry in sent_dict]
        )

    return annotations

In [5]:
def get_measures(standard: list, predictions: list, lowercase: bool = False):
    '''Print the token-level accuracy of predicted lemmas against the gold standard.

    Both inputs are flattened across sentences before being compared, so they
    must contain the same total number of lemmas in the same order.

    Args:
        standard (list): A list of lists of gold standard lemmas.
        predictions (list): A list of lists of predicted lemmas.
        lowercase (bool): Whether or not the data should be lowercased for comparison.
    '''
    # collapse the per-sentence structure into two parallel flat lists
    gold = [lemma for sent in standard for lemma in sent]
    predicted = [lemma for sent in predictions for lemma in sent]

    if lowercase:
        gold = [lemma.lower() for lemma in gold]
        predicted = [lemma.lower() for lemma in predicted]

    score = sklearn.metrics.accuracy_score(gold, predicted)
    print(f'Accuracy: {"{:.2%}".format(score)}')

In [6]:
def get_stanza_comparison(standard: list, predictions: list, tokens: list, lowercase: bool = False):
    '''Flatten gold lemmas, predicted lemmas and tokens and pass them to get_comparison.

    Args:
        standard (list): A list of lists of gold standard lemmas.
        predictions (list): A list of lists of predicted lemmas.
        tokens (list): A list of lists of the original tokens; these are always
            lowercased, independently of `lowercase`.
        lowercase (bool): Whether or not the lemmas should be lowercased for comparison.

    Returns:
        The result of get_comparison for the flattened lists (in this notebook a
        Pandas dataframe containing the mismatched lemmas).
    '''
    def _flatten(nested, lower):
        # one flat list across all sentences, optionally lowercased
        return [item.lower() if lower else item for sent in nested for item in sent]

    flat_tokens = _flatten(tokens, True)
    flat_standard = _flatten(standard, lowercase)
    flat_predictions = _flatten(predictions, lowercase)

    return get_comparison(flat_standard, flat_predictions, flat_tokens)

### EXECUTION - MODERN

In [7]:
# Gold tokens and lemmas from the UD Polish-PDB test file, grouped by sentence.
test_tokens, test_features, _ = extract_conllu_data(test_file, 'lemma', sentences=True)

In [8]:
# Stanza lemma predictions for the already tokenized test sentences.
test_annotations = get_stanza_anns(test_tokens)

2023-03-15 09:20:13 INFO: Loading these models for language: pl (Polish):
| Processor | Package |
-----------------------
| tokenize  | pdb     |
| mwt       | pdb     |
| lemma     | pdb     |

2023-03-15 09:20:13 INFO: Use device: cpu
2023-03-15 09:20:13 INFO: Loading: tokenize
2023-03-15 09:20:13 INFO: Loading: mwt
2023-03-15 09:20:13 INFO: Loading: lemma
2023-03-15 09:20:14 INFO: Done loading processors!


In [9]:
# Case-sensitive lemma accuracy on the modern (UD Polish-PDB) test data.
get_measures(test_features, test_annotations)

Accuracy: 90.22%


### EXECUTION - HISTORICAL

In [10]:
# Gold tokens and lemmas from the memoirs file, grouped by sentence.
tokens, features, _ = extract_conllu_data(file, 'lemma', sentences=True)

In [11]:
# Stanza lemma predictions for the already tokenized memoirs sentences.
annotations = get_stanza_anns(tokens)

2023-03-15 09:20:34 INFO: Loading these models for language: pl (Polish):
| Processor | Package |
-----------------------
| tokenize  | pdb     |
| mwt       | pdb     |
| lemma     | pdb     |

2023-03-15 09:20:34 INFO: Use device: cpu
2023-03-15 09:20:34 INFO: Loading: tokenize
2023-03-15 09:20:34 INFO: Loading: mwt
2023-03-15 09:20:34 INFO: Loading: lemma
2023-03-15 09:20:34 INFO: Done loading processors!


In [12]:
# Case-sensitive lemma accuracy on the memoirs data.
get_measures(features, annotations)

Accuracy: 82.65%


In [13]:
# Case-sensitive comparison of gold and predicted lemmas, saved as a spreadsheet
# for manual inspection.
comparison = get_stanza_comparison(features, annotations, tokens)
comparison.to_excel('../data/mistakes/stanza_mistakes.xlsx')

In [14]:
# Display the case-sensitive comparison table.
comparison

Unnamed: 0,Token,Context,Gold Standard,Prediction
0,godziszewo,wsi godziszewo parafii,Godziszewo,godziszewo
1,rypnin,parafii rypnin syn,Rypnin,rypnin
2,dobrrzyńskiej,ziemi dobrrzyńskiej (,dobrzyńska,dobrrzyńska
3,ś,bratem ś .,ś,być
4,p,. p .,p,pan
5,melchiora,. melchiora –,Melchior,melchior
6,wdokumentach,później wdokumentach się,dokument,wdokument
7,ewy,matki ewy z,Ewa,ew
8,pinińskich,z pinińskich właścicieli,Piniński,piniński
9,dóbr,właścicieli dóbr strużewo,dobra,dzbr


In [15]:
# Same accuracy measurement with gold and predicted lemmas lowercased first.
get_measures(features, annotations, lowercase=True)

Accuracy: 85.49%


In [16]:
# Comparison of gold and predicted lemmas after lowercasing both, saved as a
# separate spreadsheet.
comparison_lower = get_stanza_comparison(features, annotations, tokens, lowercase=True)
comparison_lower.to_excel('../data/mistakes/stanza_mistakes_lowercase.xlsx')

In [17]:
# Display the lowercased comparison table.
comparison_lower

Unnamed: 0,Token,Context,Gold Standard,Prediction
0,dobrrzyńskiej,ziemi dobrrzyńskiej (,dobrzyńska,dobrrzyńska
1,ś,bratem ś .,ś,być
2,p,. p .,p,pan
3,wdokumentach,później wdokumentach się,dokument,wdokument
4,ewy,matki ewy z,ewa,ew
5,dóbr,właścicieli dóbr strużewo,dobra,dzbr
6,odstąpił,– odstąpił swoją,odstąpić,odstąpił
7,najstarszemu,część najstarszemu bratu,stary,najstarsze
8,dobrzyńskiej,ziemi dobrzyńskiej za,dobrzyński,dobrzyńska
9,ośm,za ośm tysięcy,osiem,ośm
