In [139]:
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import pyconll
from plotly.subplots import make_subplots

# UPOS tags grouped under the coarse class label each one is reported with.
# Group order and tag order inside each group define the insertion order of
# UPOS_TAGSET below.
_UPOS_TAGS_BY_CLASS = {
    "OPEN": ("ADJ", "ADV", "INTJ", "NOUN", "PROPN", "VERB"),
    "CLOSED": ("ADP", "AUX", "CCONJ", "DET", "NUM", "PART", "PRON", "SCONJ"),
    "OTHER": ("PUNCT", "SYM", "X"),
}

# Flat lookup: UPOS tag -> "OPEN" / "CLOSED" / "OTHER".
UPOS_TAGSET = {
    tag: class_label
    for class_label, tags in _UPOS_TAGS_BY_CLASS.items()
    for tag in tags
}

In [140]:
# Treebank folder, resolved relative to the notebook's working directory.
LATIN_DIR = Path("data/UD_Latin-LLCT")
train_path = LATIN_DIR.joinpath("la_llct-ud-train.conllu")

# Parse the CoNLL-U file with pyconll; `train` is iterated sentence by
# sentence (and each sentence token by token) in the cells below.
train = pyconll.load_from_file(train_path)

In [121]:
# Corpus-level frequency tables built from `train`.
n_sentences = len(train)

# sentence length (number of tokens) -> how many sentences have that length
ntokens_per_sent_distribution = Counter(map(len, train))

# UPOS tag -> occurrences, and lemma -> occurrences.
# Note: `token_distribution` is keyed by `token.lemma`, not by the surface form.
pos_distribution = Counter()
token_distribution = Counter()
for sent in train:
    for token in sent:
        pos_distribution[token.upos] += 1
        token_distribution[token.lemma] += 1

## Sentences

In [235]:
# Unpack the length -> count histogram into two parallel arrays (Counter keys
# and values share the same iteration order), then sort both by length.
_lengths_unsorted = np.array([*ntokens_per_sent_distribution.keys()])
_freqs_unsorted = np.array([*ntokens_per_sent_distribution.values()])

# One permutation, applied to both arrays, keeps lengths and counts aligned.
sorted_idx = _lengths_unsorted.argsort()
sentences_lengths = _lengths_unsorted[sorted_idx]
sentences_lengths_freqs = _freqs_unsorted[sorted_idx]



In [274]:
# Sentence-length histogram: a full-width overview (row 1) plus three zoomed
# panels (row 2), one per length band, so that sparse long sentences remain
# readable next to the much more frequent short ones.
# `go` and `make_subplots` come from the notebook's top import cell.

# Inclusive upper bounds (in tokens) of the first two zoom panels; the third
# panel shows everything longer than MEDIUM_MAX_LEN.
SHORT_MAX_LEN = 50
MEDIUM_MAX_LEN = 150

fig = make_subplots(
    rows=2,
    cols=3,
    specs=[[{"colspan": 3}, None, None], [{"colspan": 1}, {}, {}]],
)

# Row 1: the whole distribution.
fig.add_trace(
    go.Bar(x=sentences_lengths, y=sentences_lengths_freqs, orientation="v", name="Overall"),
    row=1,
    col=1,
)

# Most frequent sentence length and how often it occurs.
idx_mode = np.argmax(sentences_lengths_freqs)
mode = sentences_lengths[idx_mode]
mode_value = sentences_lengths_freqs[idx_mode]

# Longest sentence length and its actual frequency (not assumed to be 1).
most_long_sentence_idx = np.argmax(sentences_lengths)
most_long_sentence = sentences_lengths[most_long_sentence_idx]
most_long_sentence_freq = sentences_lengths_freqs[most_long_sentence_idx]

# (legend label, shortest length shown, longest length shown) per zoom panel.
# The last panel's upper bound follows the data instead of a fixed constant,
# but never drops below its lower bound so the axis range stays valid.
zoom_panels = [
    (f"up to {SHORT_MAX_LEN} tokens", 0, SHORT_MAX_LEN),
    (f"{SHORT_MAX_LEN + 1}-{MEDIUM_MAX_LEN} tokens", SHORT_MAX_LEN + 1, MEDIUM_MAX_LEN),
    (
        f"more than {MEDIUM_MAX_LEN} tokens",
        MEDIUM_MAX_LEN + 1,
        max(int(most_long_sentence), MEDIUM_MAX_LEN + 1),
    ),
]

# Row 2: each panel only receives the bars of its own band, so the y-axis
# autoscales to that band and no hand-tuned y-range is needed.
for panel_col, (panel_label, min_len, max_len) in enumerate(zoom_panels, start=1):
    in_panel = (sentences_lengths >= min_len) & (sentences_lengths <= max_len)
    fig.add_trace(
        go.Bar(
            x=sentences_lengths[in_panel],
            y=sentences_lengths_freqs[in_panel],
            orientation="v",
            name=panel_label,
        ),
        row=2,
        col=panel_col,
    )
    # Pad by half a unit so bars sitting on the band edges are not cut at
    # their centre.
    fig.update_xaxes(range=[min_len - 0.5, max_len + 0.5], row=2, col=panel_col)

fig.add_annotation(
    x=mode,
    y=mode_value,
    text="Most common<br>sentence length",
    showarrow=True,
    arrowhead=2,
    row=1,
    col=1,
)
fig.add_annotation(
    x=most_long_sentence,
    y=most_long_sentence_freq,
    text="Longest <br>sentence",
    showarrow=True,
    arrowhead=2,
    row=1,
    col=1,
)

fig.update_yaxes(title_text="# of sentences", row=1, col=1)
fig.update_xaxes(title_text="# of tokens in the sentence", row=1, col=1)

fig.update_layout(title="How long are the sentences?", width=1200, height=600)
fig.show()

##  Tokens

In [163]:
# Treemap hierarchy: root "POS" -> class (OPEN / CLOSED / OTHER) -> UPOS tag.
# `names`, `parents` and `values` are parallel lists, one entry per node.
pos_tags = list(pos_distribution)
names = ["POS", "OPEN", "CLOSED", "OTHER", *pos_tags]
parents = ["", "POS", "POS", "POS", *(UPOS_TAGSET[pos] for pos in pos_tags)]

# Token count per class, accumulated in a single pass over the tag counts.
# A class with no observed tag reads back as 0 from the Counter.
class_totals = Counter()
for pos, pos_count in pos_distribution.items():
    class_totals[UPOS_TAGSET[pos]] += pos_count

root_total = sum(pos_distribution.values())
open_total = class_totals["OPEN"]
closed_total = class_totals["CLOSED"]
other_total = class_totals["OTHER"]

values = [root_total, open_total, closed_total, other_total, *pos_distribution.values()]

# branchvalues="total": each parent's value is the full total of its subtree,
# which is how root_total and the class totals above are computed.
treemap = go.Treemap(
    labels=names,
    parents=parents,
    values=values,
    branchvalues="total",
    textinfo="label+percent root+percent parent",
)
fig = go.Figure(treemap)

# Kept as the last expression so the notebook renders the figure as cell output.
fig.update_layout(width=1000, height=500, title="How tokens are distributed in their POS classes?")