In [36]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter
import pyconll

UPOS_TAGSET = { "ADJ": "OPEN",	            	    
                "ADV": "OPEN",	            	    
                "INTJ": "OPEN",            	    
                "NOUN": "OPEN",            	 
                "PROPN": "OPEN",	        	 
                "VERB": "OPEN",            	 
                "ADP": "CLOSED",
                "AUX": "CLOSED",
                "CCONJ": "CLOSED",
                "DET": "CLOSED",
                "NUM": "CLOSED",
                "PART": "CLOSED",
                "PRON": "CLOSED",
                "SCONJ": "CLOSED",	      
                "PUNCT": "OTHER",
                "SYM": "OTHER",
                "X": "OTHER"}

%load_ext autoreload
%autoreload

In [64]:
# Greek
TRAIN_FILE = Path("data/UD_Ancient_Greek-Perseus/grc_perseus-ud-train.conllu")  
DEV_FILE =  Path("data/UD_Ancient_Greek-Perseus/grc_perseus-ud-dev.conllu")  
TEST_FILE = Path("data/UD_Ancient_Greek-Perseus/grc_perseus-ud-test.conllu")

# Latin
TRAIN_FILE = Path("data/UD_Latin-LLCT/la_llct-ud-train.conllu")  
DEV_FILE = Path("data/UD_Latin-LLCT/la_llct-ud-dev.conllu")  
TEST_FILE = Path("data/UD_Latin-LLCT/la_llct-ud-test.conllu")  
  
import src.utils as utils

train_tokens, train_tags = utils.preprocess_data(pyconll.load_from_file(TRAIN_FILE))
dev_tokens, dev_tags = utils.preprocess_data(pyconll.load_from_file(DEV_FILE))
test_tokens, test_tags = utils.preprocess_data(pyconll.load_from_file(TEST_FILE))

dataset_tokens = train_tokens + dev_tokens + test_tokens
dataset_tags = train_tags +  dev_tags + test_tags

## EDA

In [65]:
import numpy as np

n_sentences = len(dataset_tokens)
ntokens_per_sent_distribution = Counter([len(sent) for sent in dataset_tokens])
pos_distribution = Counter([tag for tag_seq in dataset_tags for tag in tag_seq])
token_distribution = Counter([token for sent in dataset_tokens for token in sent])

sentences_lengths = np.array(list(ntokens_per_sent_distribution.keys()))
sentences_lengths_freqs = np.array(list(ntokens_per_sent_distribution.values()))

sorted_idx = np.argsort(sentences_lengths)
sentences_lengths = sentences_lengths[sorted_idx]
sentences_lengths_freqs = sentences_lengths_freqs [sorted_idx]

In [66]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig =  make_subplots(rows=2, cols=3, 
specs=[[{"colspan":3},None,None],[{"colspan":1},{},{}]])

fig.add_trace(go.Bar(y=sentences_lengths_freqs, x=sentences_lengths, orientation='v', name="Overall"), row=1, col=1)
fig.add_trace(go.Bar(y=sentences_lengths_freqs, x=sentences_lengths, orientation='v', name="length < 50"), row=2, col=1)
fig.add_trace(go.Bar(y=sentences_lengths_freqs, x=sentences_lengths, orientation='v', name="length < 150"), row=2, col=2)
fig.add_trace(go.Bar(y=sentences_lengths_freqs, x=sentences_lengths, orientation='v', name="length > 150"), row=2, col=3)

idx_mode = np.argmax(sentences_lengths_freqs)
mode = sentences_lengths[idx_mode]
mode_value = sentences_lengths_freqs[idx_mode]

fig.add_annotation(x=mode, y=mode_value,
            text="Most common<br>sentence length",
            showarrow=True,
            arrowhead=2, row=1, col=1)
            
most_long_sentence_idx = np.argmax(sentences_lengths)
most_long_sentence = sentences_lengths[most_long_sentence_idx]

fig.add_annotation(x=most_long_sentence, y=1,
            text="Longest <br>sentence",
            showarrow=True,
            arrowhead=2, row=1, col=1)


fig.update_xaxes(range=[0,50], row=2, col=1)
fig.update_xaxes(range=[51,150], row=2, col=2)
fig.update_xaxes(range=[151,370], row=2, col=3)

fig.update_yaxes(range=[0,1200], row=2, col=1)
fig.update_yaxes(range=[0,10], row=2, col=2)
fig.update_yaxes(range=[0,5], row=2, col=3)

fig.update_yaxes(title="# of scentences", row=1, col=1)
fig.update_xaxes(title="# of tokens in the sentence", row=1, col=1)

fig.update_layout(title="How long are the sentences?", width=1200, height=600)
fig.show()

In [67]:
names = ["POS", "OPEN","CLOSED","OTHER"] + list(pos_distribution)
parents = ["", "POS","POS","POS"] + [UPOS_TAGSET[pos] for pos in pos_distribution]

root_total = sum(pos_distribution.values())
open_total = sum([pos_distribution[pos] for pos in pos_distribution if UPOS_TAGSET[pos] == "OPEN"])
closed_total = sum([pos_distribution[pos] for pos in pos_distribution if UPOS_TAGSET[pos] == "CLOSED"])
other_total = sum([pos_distribution[pos] for pos in pos_distribution if UPOS_TAGSET[pos] == "OTHER"])

values = [root_total,open_total,closed_total,other_total] + list(pos_distribution.values())

fig = go.Figure(go.Treemap(
        labels=names,
        parents=parents,
        values=values, branchvalues="total",
        textinfo="label+percent root+percent parent"))

fig.update_layout(width=1000,height=500, margin_t=50, margin_b=50, margin_l=15, margin_r=15, title="How tokens are distributed in POS classes?")

## Evaluation

* Multiple smoothing techniques on dev set (model selection)
    * comparison table (rows: smoothing, columns: dataset (latin vs greek))
* Accuracy Comparison random, majority, viterbi with best smoothing, memm?
* best model from above --> confusion matrix with analysis of most common errors

In [1]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score

def evaluate(model, dataset_tokens, dataset_tags, labels):
    all_predictions = []
    all_true_tags = []

    for sentence, true_tags in zip(dataset_tokens[:2500], dataset_tags[:2500]):
        predicted_tags = model.predict(sentence)

        for (token, predicted), true in zip(predicted_tags, true_tags):
            #print(token, predicted, true)
            all_predictions.append(predicted)
            all_true_tags.append(true)

    accuracy = accuracy_score(all_true_tags, all_predictions)
    confusion_mat = confusion_matrix(all_true_tags, all_predictions, labels=labels)

    return accuracy, confusion_mat

In [1]:
from src.pos_tagging import HMMPosTagger
from collections import defaultdict, Counter
from src.utils import preprocess_data

from pathlib import Path
import pyconll

TRAIN_LATIN_FILE = Path("data/UD_Latin-LLCT/la_llct-ud-train.conllu")  
DEV_LATIN_FILE = Path("data/UD_Latin-LLCT/la_llct-ud-dev.conllu")  
TEST_LATIN_FILE = Path("data/UD_Latin-LLCT/la_llct-ud-test.conllu")  

# @TODO add option to load un-lemamtized data (for rule based tagger, sicne this could be a problem)
train_tokens, train_tags = preprocess_data(pyconll.load_from_file(TRAIN_LATIN_FILE))
dev_tokens, dev_tags = preprocess_data(pyconll.load_from_file(DEV_LATIN_FILE))
test_tokens, test_tags = preprocess_data(pyconll.load_from_file(TEST_LATIN_FILE))


In [7]:
import src.smoothing as sm
import src.utils as utils


pos_tags = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'NOUN',
 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'VERB', 'X']

latin_derivational_suffixes = utils.load_pattern_rules(Path("data/latin_derivational_suffixes_rules.txt"))

smoothers = [("default", None),
             ("noun", sm.NounSmoother()),
             ("noun_verb", sm.NounVerbSmoother()),
             ("uniform", sm.UniformSmoother(pos_tags)),
             ("rule_based", sm.RuleBasedSmoother(latin_derivational_suffixes))]

stats = {}

for smooth_config in smoothers:
    print(f"evaluating {smooth_config[0]}")
    
    tagger = HMMPosTagger(smoother=smooth_config[1])
    tagger.fit(train_tokens, train_tags)

    acc, conf_mat = evaluate(tagger, dev_tokens, dev_tags, labels=pos_tags)

    stats[smooth_config[0]] = {
        "accuracy": acc,
        "confusion_matrix": conf_mat
    }

evaluating default


KeyboardInterrupt: 

In [91]:
for run in stats:
    print(run, f'{stats[run]["accuracy"]:.3f}')

default 0.950
noun 0.969
noun_verb 0.969
uniform 0.971
rule_based 0.855


##### How many tokens od dev set are not included in the training? A proxy measure to asses how much influence have the smoothing mechanism

In [75]:
dev_set_vocab = set(tok for sent in test_tokens for tok in sent)

diff = dev_set_vocab.difference(tagger.emission_probs_smoother.known_tokens)
print(f"There are {len(diff)} unknown words in dev set")

There are 135 unknown words in dev set


AttributeError: 'NoneType' object has no attribute 'matrix'

### Test taggers

In [20]:
import src.pos_tagging as tagging
import src.smoothing as sm

TAGSET = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'NOUN',
 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'VERB', 'X']

models = [("random_baseline", tagging.DummyRandomTagger()),
          ("majority", tagging.DummyMajorityTagger()),
          ("HMM+Viterbi", tagging.HMMPosTagger()),
          ("HMM+Viterbi+UniformSmoothing", tagging.HMMPosTagger(sm.UniformSmoother(TAGSET)))]

stats = {}

for config in models:
    print(f"evaluating {config[0]}")
    
    tagger = config[1]
    tagger.fit(train_tokens, train_tags)
    acc, conf_mat = evaluate(tagger, test_tokens + dev_tokens, test_tags + dev_tags, labels=TAGSET)

    stats[config[0]] = {
        "accuracy": acc,
        "confusion_matrix": conf_mat
    }


evaluating random_baseline
evaluating majority
evaluating HMM+Viterbi
evaluating HMM+Viterbi+UniformSmoothing


In [21]:
for run in stats:
    print(run, f'{stats[run]["accuracy"]:.3f}')

random_baseline 0.066
majority 0.969
HMM+Viterbi 0.934
HMM+Viterbi+UniformSmoothing 0.968


## Smoothing

In [12]:
import src.smoothing as sm
import src.pos_tagging as pt

tagger = pt.HMMPosTagger()
tagger.fit(train_tokens, train_tags)

ds = sm.BaseSmoother(tagger.emission_probs)
print(ds.get("ADP","secundum"))
print(ds.get("ADPJ","innuendo"))



0.0058486109548982115
1e-16


In [None]:
ns = sm.NounSmoother(tagger.emission_probs)
print(ns.get("ADP","secundum"))
print(ns.get("VERB","innuendo"))
print(ns.get("NOUN","innuendo"))

In [None]:
nvs = sm.NounVerbSmoother(tagger.emission_probs)

print(nvs.get("ADP","secundum"))
print(nvs.get("PUNCT","innuendo"))
print(nvs.get("VERB","innuendo"))
print(nvs.get("NOUN","innuendo"))

In [None]:
us = sm.UniformSmoother(tagger.emission_probs, ["NOUN", "VERB", "ADJ"])
print(us.get("ADP","secundum"))
print(us.get("NOUN","innuendo"))
print(us.get("VERB","innuendo"))
print(us.get("ADJ","innuendo"))

In [None]:
# for latin reference here https://en.wiktionary.org/wiki/Category:Latin_derivational_suffixes
patterns = [
    (r'.*fico$', 'VERB'),             
    (r'.*ico$', 'VERB'),               
    (r'.*idio$', 'VERB'),               
    (r'.*isco$', 'VERB'),               
    (r'.*illo', 'VERB'),               
    (r'.*urio$', 'VERB'),
    
    (r'.*agium$', 'VERB'),
    (r'.*arium$', 'VERB'),
    (r'.*astrum$', 'VERB'),
    (r'.*$cidium', 'VERB'),
    (r'.*$edo', 'VERB'),
    (r'.*$fex', 'VERB'),
    (r'.*$ismus', 'VERB'),
]

rbs = sm.RuleBasedSmoother(patterns, probs_dict=tagger.emission_probs)
print(us.get("ADP","secundum"))
print(rbs.get("VERB", "antico"))
print(rbs.get("VERB", "aseo"))
print(rbs.get("NOUN", "antema"))