# STANZA LEMMATIZATION AND POS TAGGING

### IMPORTS, VARIABLES

In [1]:
import conllu
import stanza
import sklearn.metrics
import pandas as pd

# Disable pandas' row truncation so that displayed DataFrames are shown in full.
pd.set_option('display.max_rows', None)

In [2]:
# Historical (memoirs) gold-standard files. As used further down, the 3k file
# supplies the XPOS tags and lemmas, and the 10k file supplies the UPOS tags.
file_xpos = '../data/memoirs_3k_corrected.conllu'
file_upos = '../data/memoirs_10k_corrected.conllu'
# Test file of the UD Polish-PDB treebank, used for the modern-language evaluation.
test_file = '../data/ud-treebanks/UD_Polish-PDB/pl_pdb-ud-test.conllu'

In [3]:
from functions import *
from preproc_bert import remove_ranges

### FUNCTIONS

In [4]:
def get_stanza_anns(sentences: list, processors: str, tag_type: str) -> list:
    '''Run a Stanza pipeline over pre-tokenized sentences and collect one annotation per token.

    The function is not specific to lemmas: any per-word field produced by the
    requested processors can be collected (e.g. 'lemma', 'upos', 'xpos').

    Args:
        sentences (list): A list of sentences, each given as a list of token strings.
        processors (str): Comma-separated Stanza processors to load, as per the Stanza documentation.
        tag_type (str): The key of the per-word annotation that should get retrieved.

    Returns:
        A list of lists: for every input sentence, the ``tag_type`` value of each
        of its tokens, in order. An empty input sentence yields an empty list.

    Raises:
        KeyError: If the pipeline output of a word has no ``tag_type`` entry
            (e.g. the processor producing it was not requested).
    '''
    # defining the stanza pipeline; the given tokenization must be kept as-is so
    # that predictions stay aligned with the gold-standard labels
    nlp = stanza.Pipeline(lang='pl', processors=processors, tokenize_pretokenized=True)
    # getting stanza annotations
    annotations = []
    for sent in tqdm(sentences, desc='Retrieving annotations per sentence...'):
        # nothing to annotate; keep a placeholder so that the output stays
        # parallel to the input
        if not sent:
            annotations.append([])
            continue
        # Hand the tokens over as a list of lists (one sentence) instead of a
        # space-joined string: a joined string is split on whitespace again, so a
        # token that itself contains whitespace would turn into several tokens.
        pred = nlp([list(sent)])
        # exactly one sentence went in, so only the first entry is relevant
        annotations.append([entry[tag_type] for entry in pred.to_dict()[0]])

    return annotations

### EXECUTION - MODERN

In [5]:
# Read the PDB test file once per annotation layer (UPOS, XPOS, lemma); the
# second return value of extract_conllu_data is not needed here and is discarded.
test_tokens_upos, _ = extract_conllu_data(test_file, 'upos', sentences=True, combined=True)
test_tokens_xpos, _ = extract_conllu_data(test_file, 'xpos', sentences=True, combined=True)
test_tokens_lemmas, _ = extract_conllu_data(test_file, 'lemma', sentences=True, combined=True)

# transforming it to a tagging-friendly format
# (presumably parallel per-sentence token and label lists; confirm against the helper's definition)
# All three layers come from the same file, so the tokens are kept only once.
test_tokens, test_upos = make_tagger_friendly(test_tokens_upos)
_, test_xpos = make_tagger_friendly(test_tokens_xpos)
_, test_lemmas = make_tagger_friendly(test_tokens_lemmas)

In [6]:
# Lemmatize the PDB test tokens with Stanza.
# NOTE(review): the pipeline is loaded without the 'pos' processor. Stanza's
# documentation lists pos as a requirement of the lemma processor, so the lemmatizer
# presumably runs without POS information here, which may lower the accuracy.
# Consider 'tokenize,mwt,pos,lemma' and re-running the evaluation; verify first.
test_lemma_annotations = get_stanza_anns(test_tokens, 'tokenize,mwt,lemma', 'lemma')

2023-03-29 22:48:33 INFO: Loading these models for language: pl (Polish):
| Processor | Package |
-----------------------
| tokenize  | pdb     |
| mwt       | pdb     |
| lemma     | pdb     |

2023-03-29 22:48:33 INFO: Use device: cpu
2023-03-29 22:48:33 INFO: Loading: tokenize
2023-03-29 22:48:33 INFO: Loading: mwt
2023-03-29 22:48:33 INFO: Loading: lemma
2023-03-29 22:48:33 INFO: Done loading processors!
Retrieving annotations per sentence...: 100%|███████████████████████████████████████| 2215/2215 [01:00<00:00, 36.68it/s]


In [7]:
# Tag the PDB test tokens and keep the universal POS tag (UPOS) of every token.
test_upos_annotations = get_stanza_anns(test_tokens, 'tokenize,mwt,pos', 'upos')

2023-03-29 22:49:34 INFO: Loading these models for language: pl (Polish):
| Processor | Package |
-----------------------
| tokenize  | pdb     |
| mwt       | pdb     |
| pos       | pdb     |

2023-03-29 22:49:34 INFO: Use device: cpu
2023-03-29 22:49:34 INFO: Loading: tokenize
2023-03-29 22:49:34 INFO: Loading: mwt
2023-03-29 22:49:34 INFO: Loading: pos
2023-03-29 22:49:35 INFO: Done loading processors!
Retrieving annotations per sentence...: 100%|███████████████████████████████████████| 2215/2215 [02:55<00:00, 12.65it/s]


In [8]:
# Tag the PDB test tokens and keep the language-specific POS tag (XPOS) of every token.
# NOTE(review): this loads the same 'tokenize,mwt,pos' pipeline and tags the same
# tokens as the UPOS run; only the extracted field differs, so the tagger runs twice.
test_xpos_annotations = get_stanza_anns(test_tokens, 'tokenize,mwt,pos', 'xpos')

2023-03-29 22:52:30 INFO: Loading these models for language: pl (Polish):
| Processor | Package |
-----------------------
| tokenize  | pdb     |
| mwt       | pdb     |
| pos       | pdb     |

2023-03-29 22:52:30 INFO: Use device: cpu
2023-03-29 22:52:30 INFO: Loading: tokenize
2023-03-29 22:52:30 INFO: Loading: mwt
2023-03-29 22:52:30 INFO: Loading: pos
2023-03-29 22:52:30 INFO: Done loading processors!
Retrieving annotations per sentence...: 100%|███████████████████████████████████████| 2215/2215 [02:53<00:00, 12.78it/s]


In [9]:
# Lemma accuracy on the PDB test set: gold lemmas first, Stanza predictions second.
get_lemma_measures(test_lemmas, test_lemma_annotations)

Accuracy: 90.89%


In [10]:
# UPOS scores on the PDB test set; details=True adds the per-class precision and recall.
get_measures(test_upos, test_upos_annotations, details=True)

MEASURES:
Accuracy: 98.40%
Precision (weighted): 98.41%
Recall (weighted): 98.40%
F1 (weighted): 98.40%
Matthew's Correlation Coefficient: 98.16%

MEASURES PER CLASS:
Precision:
	ADJ: 98.17%
	ADP: 99.46%
	ADV: 94.58%
	AUX: 95.44%
	CCONJ: 95.47%
	DET: 98.00%
	INTJ: 100.00%
	NOUN: 99.17%
	NUM: 98.48%
	PART: 95.01%
	PRON: 98.63%
	PROPN: 94.14%
	PUNCT: 99.95%
	SCONJ: 95.86%
	SYM: 100.00%
	VERB: 99.20%
	X: 93.53%
Recall:
	ADJ: 98.99%
	ADP: 99.91%
	ADV: 96.06%
	AUX: 97.14%
	CCONJ: 96.17%
	DET: 98.47%
	INTJ: 50.00%
	NOUN: 98.70%
	NUM: 98.11%
	PART: 90.97%
	PRON: 98.87%
	PROPN: 96.51%
	PUNCT: 99.95%
	SCONJ: 94.61%
	SYM: 25.00%
	VERB: 98.66%
	X: 93.53%



In [11]:
# XPOS scores on the PDB test set (overall measures only, no per-class breakdown).
get_measures(test_xpos, test_xpos_annotations)

MEASURES:
Accuracy: 94.29%
Precision (weighted): 94.25%
Recall (weighted): 94.29%
F1 (weighted): 94.09%
Matthew's Correlation Coefficient: 94.05%


### EXECUTION - HISTORICAL

In [12]:
# Load the historical gold standard. UPOS comes from the 10k file, while both
# XPOS and lemmas come from the 3k file (file_xpos), so the lemma labels line up
# with tokens_3k, not with tokens_10k.
tokens_upos, _ = extract_conllu_data(file_upos, 'upos', sentences=True, combined=True)
tokens_xpos, _ = extract_conllu_data(file_xpos, 'xpos', sentences=True, combined=True)
tokens_lemmas, _ = extract_conllu_data(file_xpos, 'lemma', sentences=True, combined=True)

# Same tagging-friendly split as for the test set; the tokens of the lemma layer
# are dropped because they come from the same file as tokens_3k.
tokens_10k, upos = make_tagger_friendly(tokens_upos)
tokens_3k, xpos = make_tagger_friendly(tokens_xpos)
_, lemmas = make_tagger_friendly(tokens_lemmas)

In [13]:
# Lemmatize the historical 3k tokens with Stanza.
# NOTE(review): the pipeline is loaded without the 'pos' processor. Stanza's
# documentation lists pos as a requirement of the lemma processor, so the lemmatizer
# presumably runs without POS information here, which may lower the accuracy.
# Consider 'tokenize,mwt,pos,lemma' and re-running the evaluation; verify first.
lemma_annotations = get_stanza_anns(tokens_3k, 'tokenize,mwt,lemma', 'lemma')

2023-03-29 22:55:27 INFO: Loading these models for language: pl (Polish):
| Processor | Package |
-----------------------
| tokenize  | pdb     |
| mwt       | pdb     |
| lemma     | pdb     |

2023-03-29 22:55:27 INFO: Use device: cpu
2023-03-29 22:55:27 INFO: Loading: tokenize
2023-03-29 22:55:27 INFO: Loading: mwt
2023-03-29 22:55:27 INFO: Loading: lemma
2023-03-29 22:55:27 INFO: Done loading processors!
Retrieving annotations per sentence...: 100%|█████████████████████████████████████████| 115/115 [00:05<00:00, 20.38it/s]


In [14]:
# Tag the historical 10k tokens and keep the UPOS tag of every token.
upos_annotations = get_stanza_anns(tokens_10k, 'tokenize,mwt,pos', 'upos')

2023-03-29 22:55:33 INFO: Loading these models for language: pl (Polish):
| Processor | Package |
-----------------------
| tokenize  | pdb     |
| mwt       | pdb     |
| pos       | pdb     |

2023-03-29 22:55:33 INFO: Use device: cpu
2023-03-29 22:55:33 INFO: Loading: tokenize
2023-03-29 22:55:33 INFO: Loading: mwt
2023-03-29 22:55:33 INFO: Loading: pos
2023-03-29 22:55:34 INFO: Done loading processors!
Retrieving annotations per sentence...: 100%|█████████████████████████████████████████| 360/360 [00:40<00:00,  8.81it/s]


In [15]:
# Tag the historical 3k tokens and keep the XPOS tag of every token.
xpos_annotations = get_stanza_anns(tokens_3k, 'tokenize,mwt,pos', 'xpos')

2023-03-29 22:56:14 INFO: Loading these models for language: pl (Polish):
| Processor | Package |
-----------------------
| tokenize  | pdb     |
| mwt       | pdb     |
| pos       | pdb     |

2023-03-29 22:56:14 INFO: Use device: cpu
2023-03-29 22:56:14 INFO: Loading: tokenize
2023-03-29 22:56:14 INFO: Loading: mwt
2023-03-29 22:56:14 INFO: Loading: pos
2023-03-29 22:56:15 INFO: Done loading processors!
Retrieving annotations per sentence...: 100%|█████████████████████████████████████████| 115/115 [00:12<00:00,  9.12it/s]


In [16]:
# Lemma accuracy on the historical 3k data: gold lemmas first, Stanza predictions second.
get_lemma_measures(lemmas, lemma_annotations)

Accuracy: 83.43%


In [17]:
# Build the table of gold vs. predicted lemmas for the 3k tokens and save it as an
# Excel sheet (presumably one row per token; confirm against get_full_table).
full_lemmas = get_full_table(lemmas, lemma_annotations, tokens_3k)
full_lemmas.to_excel('../data/results/stanza_lemmas.xlsx')

In [18]:
# Build the gold-vs-prediction lemma comparison for the 3k tokens and save it
# as an Excel sheet in the mistakes folder.
comparison = get_lemma_comparison(lemmas, lemma_annotations, tokens_3k)
comparison.to_excel('../data/mistakes/stanza_lemma_mistakes.xlsx')

In [19]:
# Show the lemma comparison inline (token, context, gold standard, prediction).
comparison

Unnamed: 0,Token,Context,Gold Standard,Prediction
0,Godziszewo,wsi Godziszewo parafii,Godziszewo,godziszewo
1,Rypnin,parafii Rypnin syn,Rypnin,rypnin
2,Dobrrzyńskiej,ziemi Dobrrzyńskiej (,dobrzyńska,dobrrzyńska
3,ś,bratem ś .,świętej,być
4,p,. p .,pamięci,pan
5,Melchiora,. Melchiora –,Melchior,melchior
6,wdokumentach,później wdokumentach się,dokument,wdokument
7,Ewy,matki Ewy z,Ewa,ew
8,Pinińskich,z Pinińskich właścicieli,Piniński,piniński
9,Dóbr,właścicieli Dóbr Strużewo,dobra,dzbr


In [20]:
# UPOS scores on the historical 10k data; details=True adds the per-class precision and recall.
get_measures(upos, upos_annotations, details=True)

MEASURES:
Accuracy: 93.09%
Precision (weighted): 93.31%
Recall (weighted): 93.09%
F1 (weighted): 93.08%
Matthew's Correlation Coefficient: 92.17%

MEASURES PER CLASS:
Precision:
	ADJ: 88.56%
	ADP: 99.49%
	ADV: 90.85%
	AUX: 82.46%
	CCONJ: 98.14%
	DET: 93.66%
	NOUN: 95.15%
	NUM: 97.25%
	PART: 93.33%
	PRON: 90.08%
	PROPN: 78.34%
	PUNCT: 99.59%
	SCONJ: 85.84%
	VERB: 93.56%
	X: 73.58%
Recall:
	ADJ: 93.73%
	ADP: 98.99%
	ADV: 87.91%
	AUX: 85.99%
	CCONJ: 97.24%
	DET: 79.63%
	NOUN: 93.17%
	NUM: 79.70%
	PART: 74.04%
	PRON: 91.50%
	PROPN: 91.56%
	PUNCT: 100.00%
	SCONJ: 94.95%
	VERB: 93.72%
	X: 56.52%



In [21]:
# Build the table of gold vs. predicted UPOS tags for the 10k tokens and save it as an
# Excel sheet (presumably one row per token; confirm against get_full_table).
full_upos = get_full_table(upos, upos_annotations, tokens_10k)
full_upos.to_excel('../data/results/stanza_upos.xlsx')

In [22]:
# Build the gold-vs-prediction UPOS comparison for the 10k tokens and save it
# as an Excel sheet in the mistakes folder.
upos_comparison = get_comparison(upos, upos_annotations, tokens_10k)
upos_comparison.to_excel('../data/mistakes/stanza_UPOS_mistakes.xlsx')

In [23]:
# Show the UPOS comparison inline (token, context, gold standard, prediction).
upos_comparison

Unnamed: 0,Token,Context,Gold Standard,Prediction
0,Komornika,Jana Komornika ziemi,NOUN,PROPN
1,Dobrrzyńskiej,ziemi Dobrrzyńskiej (,ADJ,PROPN
2,ś,bratem ś .,X,AUX
3,Pinińskich,z Pinińskich właścicieli,PROPN,ADJ
4,Dąbrowy,", Dąbrowy części",PROPN,ADJ
5,śp,że śp Dziad,X,PROPN
6,Dziad,śp Dziad mój,NOUN,PROPN
7,Dobrzyńskiej,ziemi Dobrzyńskiej za,PROPN,ADJ
8,Byli,. Byli jeszcze,VERB,AUX
9,Panna,siostra Panna Urszula,NOUN,PROPN


In [24]:
# XPOS scores on the historical 3k data (overall measures only, no per-class breakdown).
get_measures(xpos, xpos_annotations)

MEASURES:
Accuracy: 87.62%
Precision (weighted): 88.36%
Recall (weighted): 87.62%
F1 (weighted): 87.40%
Matthew's Correlation Coefficient: 87.22%


In [25]:
# Build the table of gold vs. predicted XPOS tags for the 3k tokens and save it as an
# Excel sheet (presumably one row per token; confirm against get_full_table).
full_xpos = get_full_table(xpos, xpos_annotations, tokens_3k)
full_xpos.to_excel('../data/results/stanza_xpos.xlsx')

In [26]:
# Build the gold-vs-prediction XPOS comparison for the 3k tokens and save it
# as an Excel sheet in the mistakes folder.
xpos_comparison = get_comparison(xpos, xpos_annotations, tokens_3k)
xpos_comparison.to_excel('../data/mistakes/stanza_XPOS_mistakes.xlsx')

In [27]:
# Show the XPOS comparison inline (token, context, gold standard, prediction).
xpos_comparison

Unnamed: 0,Token,Context,Gold Standard,Prediction
0,Godziszewo,wsi Godziszewo parafii,subst:sg:nom:n:ncol,subst:sg:nom:m1
1,parafii,Godziszewo parafii Rypnin,subst:sg:loc:f,subst:sg:gen:f
2,Rypnin,parafii Rypnin syn,subst:sg:nom:m3,subst:sg:nom:m1
3,Dobrrzyńskiej,ziemi Dobrrzyńskiej (,adj:sg:gen:f:pos,subst:sg:gen:f
4,ś,bratem ś .,brev:pun,aglt:sg:sec:imperf:nwok
5,p,. p .,brev:npun,brev:pun
6,Pinińskich,z Pinińskich właścicieli,subst:pl:gen:m1,adj:pl:gen:m1:pos
7,Strużewo,Dóbr Strużewo z,subst:sg:nom:n:ncol,subst:sg:gen:n
8,Dąbrowy,", Dąbrowy części",subst:sg:gen:f,adj:sg:gen:f:pos
9,śp,że śp Dziad,brev:npun,subst:sg:nom:m1
