# STANZA LEMMATIZATION

### IMPORTS, VARIABLES

In [1]:
import conllu
import stanza
import sklearn.metrics
import pandas as pd

pd.set_option('display.max_rows', None)

In [2]:
file_xpos = '../data/memoirs_3k_corrected.conllu'
file_upos = '../data/memoirs_10k_corrected.conllu'
test_file = '../data/ud-treebanks/UD_Polish-PDB/pl_pdb-ud-test.conllu'

In [3]:
from functions import *
from preproc_bert import remove_ranges

### FUNCTIONS

In [4]:
def get_stanza_anns(sentences: list, processors: str, tag_type: str):
    '''A function that obtains and processes Stanza lemmatization annotations.
    
    Args:
        sentences (list): A list of lists of tokenized sentences.
        processors (str): The kind of processing that is desired as per Stanza documentation
        tag_type (str): The type of tag that should get retrieved.
        
    Returns:
        A list lists representing the Stanza lemmatization annotations.
    '''
    # defining the stanza pipeline
    nlp = stanza.Pipeline(lang='pl', processors=processors, tokenize_pretokenized=True)
    # getting stanza annotations
    annotations = []
    for i, sent in enumerate(tqdm(sentences, desc='Retrieving annotations per sentence...')):
        sent = ' '.join(sent)
        pred = nlp(sent)
        sent_annotations = []
        # getting out the lemmas
        for entry in pred.to_dict()[0]:
            sent_annotations.append(entry[tag_type])
        annotations.append(sent_annotations)
            
    return annotations

### EXECUTION - MODERN

In [5]:
test_tokens_upos, _ = extract_conllu_data(test_file, 'upos', sentences=True, combined=True)
test_tokens_xpos, _ = extract_conllu_data(test_file, 'xpos', sentences=True, combined=True)
test_tokens_lemmas, _ = extract_conllu_data(test_file, 'lemma', sentences=True, combined=True)

# transforming it to a tagging-friendly format
test_tokens, test_upos = make_tagger_friendly(test_tokens_upos)
_, test_xpos = make_tagger_friendly(test_tokens_xpos)
_, test_lemmas = make_tagger_friendly(test_tokens_lemmas)

In [6]:
test_lemma_annotations = get_stanza_anns(test_tokens, 'tokenize,mwt,lemma', 'lemma')

2023-03-24 17:35:45 INFO: Loading these models for language: pl (Polish):
| Processor | Package |
-----------------------
| tokenize  | pdb     |
| mwt       | pdb     |
| lemma     | pdb     |

2023-03-24 17:35:45 INFO: Use device: cpu
2023-03-24 17:35:45 INFO: Loading: tokenize
2023-03-24 17:35:45 INFO: Loading: mwt
2023-03-24 17:35:45 INFO: Loading: lemma
2023-03-24 17:35:45 INFO: Done loading processors!
Retrieving annotations per sentence...: 100%|██████████████████████████████████████| 2215/2215 [00:21<00:00, 104.04it/s]


In [7]:
test_upos_annotations = get_stanza_anns(test_tokens, 'tokenize,mwt,pos', 'upos')

2023-03-24 17:36:07 INFO: Loading these models for language: pl (Polish):
| Processor | Package |
-----------------------
| tokenize  | pdb     |
| mwt       | pdb     |
| pos       | pdb     |

2023-03-24 17:36:07 INFO: Use device: cpu
2023-03-24 17:36:07 INFO: Loading: tokenize
2023-03-24 17:36:07 INFO: Loading: mwt
2023-03-24 17:36:07 INFO: Loading: pos
2023-03-24 17:36:07 INFO: Done loading processors!
Retrieving annotations per sentence...: 100%|███████████████████████████████████████| 2215/2215 [01:03<00:00, 35.06it/s]


In [8]:
test_xpos_annotations = get_stanza_anns(test_tokens, 'tokenize,mwt,pos', 'xpos')

2023-03-24 17:37:10 INFO: Loading these models for language: pl (Polish):
| Processor | Package |
-----------------------
| tokenize  | pdb     |
| mwt       | pdb     |
| pos       | pdb     |

2023-03-24 17:37:10 INFO: Use device: cpu
2023-03-24 17:37:10 INFO: Loading: tokenize
2023-03-24 17:37:10 INFO: Loading: mwt
2023-03-24 17:37:10 INFO: Loading: pos
2023-03-24 17:37:11 INFO: Done loading processors!
Retrieving annotations per sentence...: 100%|███████████████████████████████████████| 2215/2215 [01:09<00:00, 32.02it/s]


In [9]:
get_lemma_measures(test_lemmas, test_lemma_annotations)

Accuracy: 90.83%


In [10]:
get_measures(test_upos, test_upos_annotations, details=True)

MEASURES:
Accuracy: 98.40%
Precision (weighted): 98.41%
Recall (weighted): 98.40%
F1 (weighted): 98.40%
Matthew's Correlation Coefficient: 98.16%

MEASURES PER CLASS:
Precision:
	ADJ: 98.17%
	ADP: 99.46%
	ADV: 94.58%
	AUX: 95.44%
	CCONJ: 95.47%
	DET: 98.00%
	INTJ: 100.00%
	NOUN: 99.17%
	NUM: 98.48%
	PART: 95.01%
	PRON: 98.63%
	PROPN: 94.14%
	PUNCT: 99.95%
	SCONJ: 95.86%
	SYM: 100.00%
	VERB: 99.20%
	X: 93.53%
Recall:
	ADJ: 98.99%
	ADP: 99.91%
	ADV: 96.06%
	AUX: 97.14%
	CCONJ: 96.17%
	DET: 98.47%
	INTJ: 50.00%
	NOUN: 98.70%
	NUM: 98.11%
	PART: 90.97%
	PRON: 98.87%
	PROPN: 96.51%
	PUNCT: 99.95%
	SCONJ: 94.61%
	SYM: 25.00%
	VERB: 98.66%
	X: 93.53%



In [11]:
get_measures(test_xpos, test_xpos_annotations)

MEASURES:
Accuracy: 94.29%
Precision (weighted): 94.25%
Recall (weighted): 94.29%
F1 (weighted): 94.09%
Matthew's Correlation Coefficient: 94.05%


### EXECUTION - HISTORICAL

In [12]:
tokens_upos, _ = extract_conllu_data(file_upos, 'upos', sentences=True, combined=True)
tokens_xpos, _ = extract_conllu_data(file_xpos, 'xpos', sentences=True, combined=True)
tokens_lemmas, _ = extract_conllu_data(file_xpos, 'lemma', sentences=True, combined=True)

tokens_10k, upos = make_tagger_friendly(tokens_upos)
tokens_3k, xpos = make_tagger_friendly(tokens_xpos)
_, lemmas = make_tagger_friendly(tokens_lemmas)

In [13]:
lemma_annotations = get_stanza_anns(tokens_3k, 'tokenize,mwt,lemma', 'lemma')

2023-03-24 17:38:22 INFO: Loading these models for language: pl (Polish):
| Processor | Package |
-----------------------
| tokenize  | pdb     |
| mwt       | pdb     |
| lemma     | pdb     |

2023-03-24 17:38:22 INFO: Use device: cpu
2023-03-24 17:38:22 INFO: Loading: tokenize
2023-03-24 17:38:22 INFO: Loading: mwt
2023-03-24 17:38:22 INFO: Loading: lemma
2023-03-24 17:38:22 INFO: Done loading processors!
Retrieving annotations per sentence...: 100%|█████████████████████████████████████████| 115/115 [00:02<00:00, 52.97it/s]


In [14]:
upos_annotations = get_stanza_anns(tokens_10k, 'tokenize,mwt,pos', 'upos')

2023-03-24 17:38:24 INFO: Loading these models for language: pl (Polish):
| Processor | Package |
-----------------------
| tokenize  | pdb     |
| mwt       | pdb     |
| pos       | pdb     |

2023-03-24 17:38:24 INFO: Use device: cpu
2023-03-24 17:38:24 INFO: Loading: tokenize
2023-03-24 17:38:24 INFO: Loading: mwt
2023-03-24 17:38:24 INFO: Loading: pos
2023-03-24 17:38:24 INFO: Done loading processors!
Retrieving annotations per sentence...: 100%|█████████████████████████████████████████| 360/360 [00:13<00:00, 26.85it/s]


In [15]:
xpos_annotations = get_stanza_anns(tokens_3k, 'tokenize,mwt,pos', 'xpos')

2023-03-24 17:38:38 INFO: Loading these models for language: pl (Polish):
| Processor | Package |
-----------------------
| tokenize  | pdb     |
| mwt       | pdb     |
| pos       | pdb     |

2023-03-24 17:38:38 INFO: Use device: cpu
2023-03-24 17:38:38 INFO: Loading: tokenize
2023-03-24 17:38:38 INFO: Loading: mwt
2023-03-24 17:38:38 INFO: Loading: pos
2023-03-24 17:38:38 INFO: Done loading processors!
Retrieving annotations per sentence...: 100%|█████████████████████████████████████████| 115/115 [00:04<00:00, 26.69it/s]


In [16]:
get_lemma_measures(lemmas, lemma_annotations)

Accuracy: 82.76%


In [17]:
comparison = get_lemma_comparison(lemmas, lemma_annotations, tokens_3k)
comparison.to_excel('../data/mistakes/stanza_lemma_mistakes.xlsx')

In [18]:
comparison

Unnamed: 0,Token,Context,Gold Standard,Prediction
0,Godziszewo,wsi Godziszewo parafii,Godziszewo,godziszewo
1,Rypnin,parafii Rypnin syn,Rypnin,rypnin
2,Dobrrzyńskiej,ziemi Dobrrzyńskiej (,dobrzyńska,dobrrzyńska
3,ś,bratem ś .,ś,być
4,p,. p .,p,pan
5,Melchiora,. Melchiora –,Melchior,melchior
6,wdokumentach,później wdokumentach się,dokument,wdokument
7,Ewy,matki Ewy z,Ewa,ew
8,Pinińskich,z Pinińskich właścicieli,Piniński,piniński
9,Dóbr,właścicieli Dóbr Strużewo,dobra,dzbr


In [19]:
get_measures(upos, upos_annotations, details=True)

MEASURES:
Accuracy: 92.87%
Precision (weighted): 93.07%
Recall (weighted): 92.87%
F1 (weighted): 92.86%
Matthew's Correlation Coefficient: 91.93%

MEASURES PER CLASS:
Precision:
	ADJ: 87.84%
	ADP: 99.49%
	ADV: 89.63%
	AUX: 81.20%
	CCONJ: 98.33%
	DET: 92.01%
	NOUN: 95.06%
	NUM: 96.36%
	PART: 92.12%
	PRON: 90.08%
	PROPN: 78.77%
	PUNCT: 99.59%
	SCONJ: 85.91%
	VERB: 93.37%
	X: 73.58%
Recall:
	ADJ: 93.89%
	ADP: 98.82%
	ADV: 86.73%
	AUX: 85.04%
	CCONJ: 97.60%
	DET: 79.33%
	NOUN: 93.02%
	NUM: 80.30%
	PART: 73.43%
	PRON: 90.61%
	PROPN: 91.45%
	PUNCT: 100.00%
	SCONJ: 94.50%
	VERB: 93.29%
	X: 54.93%



In [21]:
upos_comparison = get_comparison(upos, upos_annotations, tokens_10k)
upos_comparison.to_excel('../data/mistakes/stanza_UPOS_mistakes.xlsx')

In [22]:
upos_comparison

Unnamed: 0,Token,Context,Gold Standard,Prediction
0,Komornika,Jana Komornika ziemi,NOUN,PROPN
1,Dobrrzyńskiej,ziemi Dobrrzyńskiej (,ADJ,PROPN
2,ś,bratem ś .,X,AUX
3,Pinińskich,z Pinińskich właścicieli,PROPN,ADJ
4,Dąbrowy,", Dąbrowy części",PROPN,ADJ
5,śp,że śp Dziad,X,PROPN
6,Dziad,śp Dziad mój,NOUN,PROPN
7,Dobrzyńskiej,ziemi Dobrzyńskiej za,PROPN,ADJ
8,Byli,. Byli jeszcze,VERB,AUX
9,Panna,siostra Panna Urszula,NOUN,PROPN


In [23]:
get_measures(xpos, xpos_annotations)

MEASURES:
Accuracy: 84.50%
Precision (weighted): 85.14%
Recall (weighted): 84.50%
F1 (weighted): 84.23%
Matthew's Correlation Coefficient: 84.00%


In [24]:
xpos_comparison = get_comparison(xpos, xpos_annotations, tokens_3k)
xpos_comparison.to_excel('../data/mistakes/stanza_XPOS_mistakes.xlsx')

In [25]:
xpos_comparison

Unnamed: 0,Token,Context,Gold Standard,Prediction
0,urodzony,Melchior urodzony roku,adj:sg:nom:m1:pos,ppas:sg:nom:m1:perf:aff
1,Godziszewo,wsi Godziszewo parafii,subst:sg:nom:n:ncol,subst:sg:nom:m1
2,parafii,Godziszewo parafii Rypnin,subst:sg:loc:f,subst:sg:gen:f
3,Rypnin,parafii Rypnin syn,subst:sg:nom:m3,subst:sg:nom:m1
4,Dobrrzyńskiej,ziemi Dobrrzyńskiej (,adj:sg:gen:f:pos,subst:sg:gen:f
5,ś,bratem ś .,brev:pun,aglt:sg:sec:imperf:nwok
6,p,. p .,brev:npun,brev:pun
7,Pinińskich,z Pinińskich właścicieli,subst:pl:gen:m1,adj:pl:gen:m1:pos
8,Strużewo,Dóbr Strużewo z,subst:sg:nom:n:ncol,subst:sg:gen:n
9,Dąbrowy,", Dąbrowy części",subst:sg:gen:f,adj:sg:gen:f:pos
