In [51]:
from typing import List, Dict
from collections import Counter, defaultdict


class NewLemma:
    def __init__(self, lemma, pos, code=None, indecl: bool=None, info: str = None, text: str = None):
        self.lemma = lemma
        self.pos = pos
        self.code = code
        self.indecl = indecl
        self.info = info
        self.text = text
        
        
            

## Données du LASLA

In [52]:
# read the dictionnary
LASLA = {}
data_no_code = []

default_varia_pos = defaultdict(dict)

with open("dictionnary.tsv") as f:
    header: List[str] = []
    for line_no, line in enumerate(f):
        if line_no == 0:
            header = line.strip().split("\t")
            continue
        tokens: Dict[str, str] = dict(zip(header, line.strip().split("\t")))
        LASLA[tokens["Lemma"]] = (tokens["Nature"], tokens.get("Code", None))
        if tokens["Lemma"][-1].isnumeric():
            default_varia_pos[tokens["Lemma"][:-1]][tokens["Nature"]] = tokens["Lemma"]
        if tokens.get("Code", None) is None:
            data_no_code.append(tokens["Lemma"])


---

## Tableaux d'alignement

In [53]:
# read the dictionnary
Alignment = {}

with open("ProielToLasla.tsv") as f:
    header: List[str] = []
    for line_no, line in enumerate(f):
        if line_no == 0:
            header = line.strip().split("\t")
            continue
        tokens: Dict[str, str] = dict(zip(header, line.strip().split("\t")))
        Alignment[tokens["Lemma in PROIEL"]] = NewLemma(
            lemma=tokens["Lemma in Forcellini"],
            pos=tokens["POS"],
            code=tokens["Code (11 = a, ae; 12 = us, 21: a, us, i)"],
            info=tokens["Info"],
            indecl="Indecl" in tokens["Info"],
            text=tokens["Texte"]
        )
        
# read the dictionnary
with open("ProielToLaslaNouns.tsv") as f:
    header: List[str] = []
    for line_no, line in enumerate(f):
        if line_no == 0:
            header = line.strip().split("\t")
            continue
        tokens: Dict[str, str] = dict(zip(header, line.strip().split("\t")))
        Alignment[tokens["PROIEL"]] = NewLemma(
            lemma=tokens["LASLA"],
            pos=tokens["POS"],
            code=tokens["Code"],
            info=tokens["Info"],
            indecl="Indecl" in tokens["Info"],
            text=tokens["Texte"]
        )

## Full dictionary

---

## Conversion des POS

In [54]:
def convert_pos(line):
    pos = line["pos2"]
    if pos == "Ne":
        pos = "NOMpro"
    elif pos == "Nb":
        pos = "NOMcom"
    elif pos == "A-":
        pos = "ADJqua"
    elif pos == "V-":
        pos = "VER"
    elif pos == "D-":
        pos = "ADV"
    elif pos == "I-":
        pos = "INJ"
    elif pos == "Du":
        pos = "ADVint"
    elif pos == "G-":
        pos = "CONsub"
    elif pos == "Dq":
        pos = "ADVrel"
    elif pos == "C-":
        pos = "CONcoo"
    elif pos == "R-":
        pos = "PRE"
    elif pos == "R-":
        pos = "PRE"
    elif pos == "Df":
        pos = "ADV"
    elif pos == "Mo":
        pos = "ADJord"
    elif pos == "Ma":
        pos = "ADJcar"
    elif pos == "F-":
        pos = "OUT"
    elif pos == "Px":
        pos = "PROind"
    elif pos == "Pr":
        pos = "PROrel"
    elif pos == "Pp":
        pos = "PROper"
    line["pos2"] = pos

## Conversion Lemmes Simples

In [55]:
direct_equiv = {
    "possum": "possum1",
    "exeo": "exeo1",
    "abeo": "abeo1",
    "se": ("sui1", "PROref"),
    "Iohannes": "Ioannes", # Clearly poor choice of original lemma from Forcellini..
    "diligo": "diligo3",
    "uado": "uado2", # Vu seulement des vadis, it ere
    "coepi": "coepio",
    # Disambiguated
    "Apollo#2": "Appolo",
    "uolo#2": "uolo2",
    "uolo#1": "uolo3",
    "os#1": "os1",
    "os#2": "os2",
    "frons#1": "frons2",
    "frons#2": "frons1",
    "educo#1": "educo2",
    "educo#2": "educo1",
    "quam#1": "quam1",
    "quam#2": "quam2",
    "consterno#2": "consterno1", 
    "colligo#1": "colligo3",
    "edo#1": "edo1",
    "insero#2": "insero2",
    "insero#1": "insero3",
    "fundo#1": "fundo2",
    "fundo#2": "fundo1",
    "quam#3": "quam1",
    "eo#2": "eo3", # ADV
    "ut#1": "ut1",  # Circonspect sur ce lemme...
    "ut#2": "ut1", 
    "quod#1": 'quod1',
    "quod#2": 'quod2',
    "quattuor": ("quatuor", "ADJcar"),
    "quattuordecim": "quatuordecim",
    "secundus2": ("secundus", "ADJord"),
    "alioquin": "alioqui",
    "Appolo": "Apolo",
    
    # Dérivés de dico
    "praedico#1": "praedico1",
    "praedico#2": "praedico2",
    "indico#1": "indico1",
    "indico#2": "indico2",
    
    # Dérivés de caedo et cado
    # Vraiment pas sûr pour certains..
    "occido#1": "occido2",
    "occido#2": "occido1",
    "incido#1": "incido1",
    "incido#2": "incido2",
    "excaedo": "excido2",
    "excido#1": "excido1",
    "excido#2": "excido2",
    "concido#2": "concido2",
    "concido#1": "concido1",
    
    # Différence de sens
    "comparo#2": "comparo2",
    "caelum": "caelum1",# On part du principe que c'est le ciel et pas des ciseaux
    "fides": "fides2", # On part du principe que c'est la fidélité et pas une lyre
    "tempus": "tempus1",  # Probablement pas la tempe
    "populus": "populus1", # Probablement pas le peuplier
}

straight = {"spargo": "2", "transeo": "1", "commodo": "1", "paro": "2", "pereo": "1", "capio": "2", "possum": "1", "aereus": "2", "subeo": "1", "incitatus": "2", "magis": "2", "congero": "2", "incubo": "2", "succedo": "1", "seligo": "3", "sinistra": "1", "edico": "2", "uado": "2", "recito": "1", "exedo": "2", "praecido": "2", "equitatus": "1", "moror": "1", "desum": "1", "obsum": "1", "pallas": "o", "retro": "1", "rationalis": "2", "semen": "1", "obligo": "2", "recedo": "1", "concedo": "1", "praesum": "1", "annitor": "2", "fabula": "1", "munio": "2", "incito": "1", "maledico": "2", "procedo": "1", "accedo": "1", "pressus": "2", "insum": "1", "prosum": "1", "exeo": "1", "inerro": "2", "absum": "1", "ineo": "1", "redeo": "1", "intereo": "1", "ascio": "2", "excito": "1", "delabor": "2", "diligo": "3", "incolo": "2", "attonitus": "2", "relabor": "2", "euerbero": "2", "profundo": "2", "bibo": "2", "delicia": "1", "offundo": "2", "occupo": "2", "circumeo": "1", "subleuo": "1", "annumero": "1", "prouolo": "2", "discedo": "1", "perturbo": "2", "commuto": "2", "secedo": "1", "remoror": "1", "eleuo": "1", "colo": "2", "abeo": "1", "concito": "1", "inuado": "2", "uitupero": "2", "commoror": "1", "excedo": "1", "elabor": "2", "eligo": "3", "euolo": "2", "milito": "2", "commilito": "1", "tremulus": "3", "praetereo": "1", "lustro": "2", "separo": "2", "cauo": "1", "diffundo": "2", "campus": "1", "crebro": "2", "perbibo": "2", "expilo": "2", "betaceus": "1", "reformido": "2", "natio": "1", "mentio": "1", "recolligo": "3", "trans": "2", "insector": "2", "carbonarius": "1", "iliades": "o", "perfundo": "2", "transuerbero": "2", "decedo": "1", "uillicus": "1", "circumfundo": "2", "ambo": "2", "praecedo": "1", "effundo": "2", "obscurum": "1", "alligo": "2", "superficiarius": "2", "olympia": "o", "quinqueremis": "1", "supersum": "1", "tripus": "1", "fidelis": "2", "releuo": "1", "zonarius": "1", "incedo": "1", "enumero": "1", "diuerbero": "2", "alleuo": "1", "superfundo": "2", "conditor": "1", "triumphalis": "2", "castus": "2", "manifesto": "2", "insitus": "2", "leno": "1", "intersum": "1", "coeo": "1", "lenis": "2", "falso": "2", "partus": "1", "minutus": "2", "nequam": "2", "commentarius": "1", "praeeo": "1", "perlabor": "2", "solidus": "2", "reparo": "2", "sponsus": "1", "perlego": "2", "auspicatus": "2", "confundo": "2", "odoratus": "2", "perfectus": "2", "afflictus": "2", "fluxus": "2", "quadrupes": "1", "infundo": "2", "addico": "2", "abscedo": "1", "exsuscito": "1", "deturbo": "2", "disturbo": "2", "obeo": "1", "gregarius": "2", "aduolo": "2", "intercedo": "1", "innitor": "2", "uireo": "2", "pando": "2", "antecapio": "2", "pario": "2", "antecedo": "1", "recolo": "2", "desidia": "1", "enitor": "2", "prodeo": "1", "uitio": "1", "trimeter": "1", "recessus": "1", "demeto": "2", "candidatus": "1", "oberro": "2", "exsero": "2", "ablego": "1", "pactus": "2", "demoror": "1", "resono": "1", "lene": "2", "suffundo": "2", "denarius": "1", "resero": "1", "gestio": "2", "apparo": "2", "immuto": "2", "proturbo": "2", "esurio": "2", "decerto": "1", "abscido": "2", "compilo": "1", "caro": "1", "circumuado": "2", "illigo": "2", "instructus": "2", "hospitalis": "2", "affundo": "2", "adiutor": "1", "delego": "1", "deuersor": "2", "tripes": "2", "anteeo": "1", "furio": "1", "dedico": "1", "excidium": "1", "ascitus": "2", "barbatus": "2", "longinquum": "1", "attentus": "2", "philosophus": "1", "armentarius": "1", "sublabor": "2", "dilabor": "2", "defundo": "2", "temperatus": "2", "connitor": "2", "uenus": "1", "pererro": "2", "expressus": "2", "comedo": "2", "insciens": "1", "exactus": "2", "enixus": "2", "euado": "2", "scitus": "2", "illustro": "2", "tonsa": "1", "prolabor": "2", "adedo": "2", "repens": "1", "permuto": "2", "exturbo": "2", "praeparo": "2", "neglectus": "2", "combibo": "2", "contradico": "2", "introeo": "1", "deerro": "2", "conturbo": "2", "cubicularius": "1", "polio": "2", "separatus": "2", "quadriremis": "1", "platea": "1", "tritus": "2", "interdico": "2", "mathematicus": "1", "opertus": "2", "peruado": "2", "collabor": "2", "cognitus": "2", "editus": "2", "dispereo": "1", "circumligo": "2", "sonipes": "1", "peruulgo": "1", "ficus": "1", "concretus": "2", "accolo": "2", "conspiro": "1", "partio": "2", "subnitor": "2", "suscito": "1", "superfluo": "1", "mutuo": "2", "ultimo": "2", "inconsultus": "2", "depereo": "1", "subtus": "1", "immunio": "2", "exsatio": "2", "praes": "1", "anfractus": "1", "religo": "2", "lixa": "1", "elatro": "2", "laeuum": "1", "conuolo": "2", "consono": "1", "quotquot": "1", "denuo": "2", "inflatus": "2", "oscillum": "1", "obnitor": "2", "pipio": "2", "anas": "1", "pernumero": "1", "transuolo": "2", "excolo": "2", "pecuniarius": "2", "dispando": "2", "exporrigo": "2", "solidum": "1", "damnatus": "2", "ueternus": "1", "emunio": "2", "singula": "2", "imbibo": "2", "ablaqueo": "1", "inturbo": "2", "pransus": "2", "praeterlabor": "2", "instinctus": "1", "quadratus": "2", "perpolio": "2", "subsum": "1", "uocalis": "2", "benedico": "2", "euagor": "2", "scelero": "2", "liquidum": "1", "praepositus": "1", "ubinam": "2", "tersus": "2", "punctus": "2", "aberro": "2", "manduco": "2", "inquies": "2", "cellarius": "1", "oceanus": "1", "peripateticus": "1", "infectus": "2", "perlustro": "2", "assono": "1", "persono": "1", "renumero": "1", "adusque": "2", "permissus": "1", "dinumero": "1", "simile": "2", "consector": "2", "obturbo": "2", "circumuolo": "2", "catillo": "2", "stagnum": "1", "obsono": "1", "circummunio": "2", "transfundo": "2", "abdico": "1", "refundo": "2", "edissero": "2", "illabor": "2", "luto": "1", "uiride": "1", "interior": "2", "peruagor": "2", "cernuus": "2", "canalis": "1", "insono": "1", "tertianus": "1", "abortio": "1", "allabor": "2", "famelicus": "2", "accensus": "1", "ambeo": "1", "stagno": "1", "bardus": "2", "panthera": "1", "interfundo": "2", "mordicus": "2", "expando": "2", "compingo": "1", "suspecto": "1", "praemunio": "2", "arcitenens": "2", "cachinno": "2", "occedo": "1", "raedarius": "1", "inuolo": "2", "expolio": "2", "amando": "2", "prima": "1", "condico": "2", "quousque": "2", "incommodo": "1", "unio": "1", "conclusus": "2", "unctus": "2", "procido": "1", "tabernarius": "1", "disparo": "2", "auolo": "2", "spoliatus": "1", "ebibo": "2", "ingenitus": "1", "largitor": "1", "saliaris": "1", "uitrarius": "1", "peditatus": "1", "semideus": "2", "retrocedo": "1", "organicus": "1", "nummularius": "1", "uicia": "1", "praesono": "1", "fundator": "1", "siligineus": "2", "praeoccupo": "2", "clauiger": "2", "permunio": "2", "laqueo": "1", "diuulgo": "1", "compressus": "1", "praegestio": "2", "inuersus": "1", "pullarius": "1", "centonarius": "1", "absque": "2", "persector": "2", "exsono": "1", "conuerbero": "2", "circumsono": "1", "adbibo": "2", "circito": "1", "assector": "2", "reuolo": "2", "lymphatus": "2", "implecto": "2", "memoratus": "1", "collustro": "2", "arcatus": "1", "sagittifer": "2", "confinis": "2", "prosero": "2", "deuolo": "2", "pragmaticus": "1", "potus": "2", "iuridicus": "1", "suffoco": "1", "denumero": "1", "desultorius": "1", "demuto": "2", "breue": "1", "notarius": "1", "conterminus": "2", "iudicatus": "1", "lymphaticus": "2", "prouectus": "2", "quartum": "2", "extentus": "2", "quaesitus": "2", "nauicularius": "1", "praelego": "2", "pilo": "1", "transero": "2", "agitatus": "2", "curatus": "2", "interbibo": "2", "poetica": "2", "obstantia": "2", "riualis": "1", "uicesimarius": "1", "inclinatus": "2", "gerulus": "1", "sublimus": "1", "tabularius": "1", "euulgo": "1", "myrrhina": "2", "superuado": "2", "exaggero": "1", "constitutus": "2", "reporrigo": "2", "classicus": "2", "restagno": "1", "palpo": "2", "praeteruolo": "2", "comitialis": "2", "elate": "2", "incendiarius": "1", "electo": "1", "pessum": "2", "conflictor": "2", "demando": "2", "interlego": "2", "laticlauius": "2", "criticus": "1", "praelabor": "2", "uarus": "2", "pedo": "3", "suppilo": "2", "frigo": "1", "ferraria": "1", "excauo": "1", "speratus": "2", "praeuolo": "2", "thesaurarius": "2", "pythia": "o", "antequam": "1", "cupula": "2", "assiduo": "2", "fatuor": "1", "superuolo": "2", "prodico": "2", "triarius": "1", "solarium": "2", "marmorarius": "1", "subterlabor": "2", "trieris": "2", "praespargo": "2", "commeto": "1", "agens": "1", "praepostero": "2", "subedo": "2", "tesserarius": "1", "depilo": "1", "interlabor": "2", "megara": "o", "transmuto": "2", "anno": "1", "striga": "2", "cinerarius": "1", "palmaris": "2", "grammatica": "2", "illaqueo": "2", "uenalicius": "1", "procinctus": "1", "uicanus": "1", "ambedo": "2", "contente": "1", "araneus": "1", "hiera": "2", "uillico": "2", "praecolo": "2", "aurarius": "2", "inuocatus": "2", "directum": "1", "nutricius": "1", "pedatus": "1", "christianus": "1", "elego": "1", "inceptus": "1", "arcessitus": "2", "percido": "2", "translego": "2", "immoror": "1", "lanarius": "1", "exspargo": "2", "sartor": "2", "praeligo": "2", "sublego": "2", "inaccedo": "1", "pediculus": "1", "reuerbero": "2", "discido": "2", "emeto": "2", "incerto": "1", "solo": "1", "scaeua": "2", "paedico": "2", "insaeptus": "1", "deductus": "2", "epulo": "1", "sortilegus": "2", "furunculus": "1", "diuisus": "2", "bustuarius": "2", "concerto": "1", "gibber": "2", "illex": "1", "adaggero": "1", "mensarius": "1", "interturbo": "2", "afformido": "2", "gradatus": "2", "superlabor": "2", "rusticulus": "1", "sequestro": "2", "adico": "2", "factus": "1", "salictarius": "1", "plaustrarius": "2", "hostio": "1", "intritus": "2", "molitio": "1", "uacuo": "1", "seresco": "1", "praepilatus": "1", "cruciarius": "1", "exaduersum": "1", "quadrigarius": "1", "carcerarius": "2", "pauxillulum": "2", "melicus": "2", "intersero": "3", "praepando": "2", "statuarius": "1", "utriculus": "1", "fugiens": "2", "rabo": "2"}

---

## Conversion lemmes liés à la POS

In [56]:
def adj_nom(adj, nom, lemma):
    return [
        (lambda x: x["pos1"] == "N", {"lemma": nom, "pos2": "NOMcom"}),
        (lambda x: x["pos1"] == "A", {"lemma": adj, "pos2": "ADJqua"}),
        {"lemma": lemma+"_", "pos2": "?"}
    ]

def nom_ver(ver, nom, lemma):
    return [
        (lambda x: x["pos1"] == "N", {"lemma": nom, "pos2": "NOMcom"}),
        (lambda x: x["pos1"] == "V", {"lemma": ver, "pos2": "VER"}),
        {"lemma": lemma+"_", "pos2": "?"}
    ]

def adv_ver(ver, adv, lemma):
    return [
        (lambda x: x["pos1"] == "D", {"lemma": adv, "pos2": "ADV"}),
        (lambda x: x["pos1"] == "V", {"lemma": ver, "pos2": "VER"}),
        {"lemma": lemma+"_", "pos2": "?"}
    ]


def adv_pre(adv, pre, lemma, adv_end=""):
    return [
        (lambda x: x["pos1"] == "D", {"lemma": adv, "pos2": "ADV"+adv_end}),
        (lambda x: x["pos1"] == "R", {"lemma": pre, "pos2": "PRE"}),
        {"lemma": lemma+"_", "pos2": "?"}
    ]

def this_or_unk(pos2, newpos, lemma):
    return [
        (lambda x: x["pos2"] == pos2, {"lemma": lemma, "pos2": newpos}),
        {"lemma": lemma[:-1]+"_", "pos2": "?"}
    ]


"""Reltype is a dict of lemma found in Proiel associated with list where:
    - the last element is the default value to write for disambiguation
    - all others are (Lambda -> Bool, Value) where lambda is used for check
"""  

reltype = {
    "sum": [
        (lambda x: "aux" in x["reltype"], {"lemma": "sum2", "pos2": "VERaux"}),
        {"lemma": "sum1", "pos2": "VER"}
    ],
    "eo": [
        (lambda x: "aux" in x["reltype"], {"lemma": "eo2", "pos2": "VERaux"}),
        {"lemma": "eo1", "pos2": "VER"}
    ],
    "ad": [
        (lambda x: "adv" in x["reltype"], {"lemma": "ad1", "po2": "ADV", "morph": "Deg=Pos"}),
        {"lemma": "ad2", "pos2": "PRE"}
    ],
    "cum": [
        (lambda x: x["pos1"] == "R", {"lemma": "cum3", "pos2": "CONsub", "morph": "MORPH=empty"}),
        # We ignore cum1 here, they should probably be checked manually, but there is 5909 cum2 in LASLA,
        #       and 358
        #(lambda x: print(x), None),
        #(lambda x: x["pos1"] == "G", {"lemma": "cum1", "pos": "ADV", "morph": "Deg=Pos"}),
        #(lambda x: "adv" in x["reltype"], {"lemma": "cum1", "pos": "ADV", "morph": "Deg=Pos"}),
        {"lemma": "cum2", "pos2": "PRE"}
    ],
    "et": [
        #(lambda x: print(x), None),
        #  1,388 in LASLA
        (lambda x: x["pos1"] == "D", {"lemma": "et1", "pos2": "ADV", "morph": "Deg=Pos"}),
        # 42,725 in LASLA
        {"lemma": "et2", "pos2": "CONcoo"}
    ],
    "qui": [
        # Counter({'1': 41896, '3': 159, '4': 20, '2': 13}) PROrel, ADVint, ADV, ADVrel
        (lambda x: x["pos2"] == "Pr", {"lemma": "qui1", "pos2": "PROrel"}),
        (lambda x: x["pos2"] == "Pi", {"lemma": "qui3", "pos2": "ADVint"}),
        (lambda x: x["pos2"] == "D-", {"lemma": "qui4", "pos2": "ADV"}),
        # 42,725 in LASLA
        {"lemma": "qui4", "pos2": "ADV", "morph": "Deg=Pos"}
    ],
    "dico": [
        # Counter({'2': 7528, '1': 49})
        {"lemma": "dico2", "pos2": "VER"}
    ],
    "hic": [
        #Counter({'1': 20445, '2': 973})
        (lambda x: x["pos1"] == "D", {"lemma": "hic2", "pos2": "ADV", "morph": "Deg=Pos"}),
        {"lemma": "hic1", "pos2": "PROdem"}
    ],
    "ut" : [
        #Counter({'4': 11219, '1': 3901, '2': 358, '3': 64}) CONsub, ADVrel, ADVint, ADV
        # G-, Dq, Du
        (lambda x: x["pos2"] == "Du", {"lemma": "ut2", "pos2": "ADVint", "morph": "Deg=Pos"}),
        (lambda x: x["pos2"] == "Dq", {"lemma": "ut1", "pos2": "ADVrel", "morph": "Deg=Pos"}),
        (lambda x: x["pos2"] == "G-", {"lemma": "ut4", "pos2": "CONsub", "morph": "MORPH=empty"}),
        {"lemma": "ut3", "pos2": "ADV"}
    ],
    "unde" : [
        (lambda x: x["pos2"] == "Du", {"lemma": "unde2", "pos2": "ADVint", "morph": "Deg=Pos"}),
        (lambda x: x["pos2"] == "Dq", {"lemma": "unde1", "pos2": "ADVrel", "morph": "Deg=Pos"}),
        {"lemma": "unde_", "pos2": "?"}
    ],
    "atque" : [
        (lambda x: x["pos2"] == "Df", {"lemma": "atque1", "pos2": "CONcoo"}),
        (lambda x: x["pos2"] == "C-", {"lemma": "atque1", "pos2": "CONcoo"}),
        (lambda x: x["pos2"] == "G-", {"lemma": "atque2", "pos2": "CONsub"}),
        {"lemma": "atque_", "pos2": "?"}
    ],
    "enim" : [
        # Counter({'2': 4847, '1': 29}) CONcoo, ADV
        # Weirdly, completely unbalanced here, all things are marked df, which is ADVERB
        # Ask C. Nicolas here ?
        #(lambda x: x["pos2"] == "Df", {"lemma": "enim2", "pos": "CONcoo"})
        {"lemma": "enim2", "pos2": "CONcoo"}
    ],
    "quod": [
        # Counter({'2': 3724, '1': 399})  CONsub, CONcoo
        {"lemma": "quod2", "pos2": "CONsub"}
    ],
    "quis": [
        # Counter({'1': 11660, '2': 2268}) PROint, PROind
        (lambda x: x["pos2"] == "Px", {"lemma": "quis2", "pos2": "PROind"}),
        (lambda x: x["pos2"] == "Pi", {"lemma": "quis1", "pos2": "PROint"}),
        (lambda x: x["pos2"] == "Pr", {"lemma": "qui1", "pos2": "PROint"}),  # Erreur d'annotation ?
        {"lemma": "quis_", "pos2": "PROint"}
    ],
    "si": [
        # Counter({'2': 11010, '1': 18}) CONsub, ADVint
        (lambda x: x["pos2"] == "Du", {"lemma": "si1", "pos2": "ADVint"}),
        (lambda x: x["pos2"] == "G-", {"lemma": "si2", "pos2": "CONsub"}),
        {"lemma": "si_", "pos2": "?"}
    ],
    "quasi": [
        # Counter({'2': 11010, '1': 18}) CONsub, ADVint
        (lambda x: x["pos2"] == "Df", {"lemma": "quasi1", "pos2": "ADVrel"}),
        (lambda x: x["pos2"] == "G-", {"lemma": "quasi2", "pos2": "CONsub"}),
        {"lemma": "quasi_", "pos2": "?"}
    ],
    "ne": [
        # Counter({'4': 3083, '3': 1759, '2': 1407, '1': 56}) CONsub, ADVneg, ADVint, ADV
        (lambda x: x["pos2"] == "Du", {"lemma": "ne2", "pos2": "ADVint"}), # Vérifier si c'est -ne2 ?
        (lambda x: x["pos2"] == "Df", {"lemma": "ne3", "pos2": "ADVneg"}), # Supposition vu les balances du LASLA
        (lambda x: x["pos2"] == "G-", {"lemma": "ne4", "pos2": "CONsub"}),
        {"lemma": "ne_", "pos2": "?"}
    ],
    "ergo": [
        # Counter({'2': 1451, '1': 14}) CONcoo, PRE
        (lambda x: x["pos2"] == "Df", {"lemma": "ergo2", "pos2": "CONcoo"}),
        {"lemma": "ergo_", "pos2": "?"}
    ],
    "sicut": [
        # Counter({'1': 397, '2': 2}) ADVrel, CONsub
        (lambda x: x["pos2"] == "Dq", {"lemma": "sicut1", "pos2": "ADVrel"}), # Sûr
        (lambda x: x["pos2"] == "Df", {"lemma": "sicut1", "pos2": "ADVrel"}), 
        # Pas sûr mais en même temps il y a beaucoup plus de sicut dans le NT que dans le LASLA...
        {"lemma": "sicut_", "pos2": "?"}
    ],
    #Counter({'2': 801, '1': 11}) PRE, ADV
    "propter": this_or_unk(pos2="R-", newpos="PRE", lemma="propter2"),
    # Counter({'2': 137, '1': 4}) PRE, ADV
    "secundum": this_or_unk(pos2="R-", newpos="PRE", lemma="secundum2"),
    # Counter({'1': 2109, '2': 70, '3': 1}) PRE, INJ, ADV
    "pro": this_or_unk(pos2="R-", newpos="PRE", lemma="pro1"),
    "manus": adj_nom(adj="manus2", nom="manus1", lemma="manus"),
    "mortuus": adj_nom(adj="mortuus2", nom="mortuus1", lemma="mortuus"),
    "mundus": adj_nom(adj="mundus2", nom="mundus1", lemma="mundus"),
    "princeps": adj_nom(adj="princeps2", nom="princeps1", lemma="princeps"),
    "malus": adj_nom(adj="malus3", nom="malus12", lemma="princeps"),
    "manduco": nom_ver(nom="manduco1", ver="manduco2", lemma="manduco"),
    "caro": nom_ver(nom="caro1", ver="caro2", lemma="caro"),
    "seruus": adj_nom(adj="seruus2", nom="seruus1", lemma="seruus"),
    # Counter({'4': 1048, '3': 557}) CONcoo, ADV, Verb (2), NOMcom(1)
    # Fusionner les uero ?
    "uero": this_or_unk(pos2="Df", newpos="ADV", lemma="uero4"),
    # Counter({'1': 911, '2': 457}) 2 = indecl
    # Fusionner opus dans LASLA ?
    "opus": this_or_unk(pos2="Nb", newpos="NOMcom", lemma="opus1"),
    "intro": adv_ver(adv="intro2", ver="intro1", lemma="intro"),
    "continuo": adv_ver(adv="continuo2", ver="continuo1", lemma="continuo"),
    # Counter({'2': 2208, '1': 96}) CONcoo, ADV
    "at": [
        (lambda x: True, {"lemma": "at2", "pos2": "CONco"}),
        {"lemma": "at_", "pos2": "?"}
    ],
    # 
    "circa": adv_pre(adv="circa1", pre="circa2", lemma="circa"),
    # Counter({'2': 404, '1': 91}) PRE, ADV
    "super": adv_pre(adv="super1", pre="super2", lemma="super"),
    # Counter({'2': 953, '1': 823}) PRE, ADV
    "ante": adv_pre(adv="ante1", pre="ante2", lemma="ante"),
    # Counter({'2': 918, '1': 357}) PRE, ADV
    "post": adv_pre(adv="post1", pre="post2", lemma="post"),
    # Counter({'2': 890, '1': 457})
    "contra": adv_pre(adv="contra1", pre="contra2", lemma="contra"),
    # Counter({'1': 69, '2': 52})
    "iuxta": adv_pre(adv="iuxta1", pre="iuxta2", lemma="iuxta"),
    # Counter({'2': 179, '1': 165})
    "supra": adv_pre(adv="supra1", pre="supra2", lemma="supra"),
    # Counter({'3': 1203, '1': 566, '2': 392}) CONsub, ADVrel, ADVint
    "ubi": [
        (lambda x: x["pos2"] == "Du", {"lemma": "ubi2", "pos2": "ADVint"}), # Sûr
        # Pas sûr mais ADVrel = cas spec de CONsub ?
        (lambda x: x["pos2"] == "Dq", {"lemma": "ubi3", "pos2": "CONsub"}), 
        {"lemma": "ubi_", "pos2": "?"}
    ],
    # Counter({'1': 633, '2': 63}) PROrel, PROind
    "quicumque": [
        (lambda x: x["pos2"] == "Pr", {"lemma": "quicumque1", "pos2": "PROrel"}), # Sûr
        # Pas sûr mais ADVrel = cas spec de CONsub ?
        (lambda x: x["pos2"] == "Px", {"lemma": "quicumque2", "pos2": "PROind"}), 
        {"lemma": "quicumque_", "pos2": "?"}
    ],
    "quam": [
        (lambda x: x["pos2"] == "Du", {"lemma": "quam2", "pos2": "ADVint"}), # Sûr
        {"lemma": "quam1", "pos2": "ADVrel"}
    ],
    "tamquam": [
        (lambda x: x["pos2"] == "Df", {"lemma": "tamquam1", "pos2": "ADVrel"}),
        (lambda x: x["pos2"] == "Dq", {"lemma": "tamquam1", "pos2": "ADVrel"}), 
        (lambda x: x["pos2"] == "G-", {"lemma": "tamquam2", "pos2": "CONsub"}), 
        {"lemma": "tamquam_", "pos2": "?"}
    ],
    "quomodo": [
        (lambda x: x["pos2"] == "Dq", {"lemma": "quomodo1", "pos2": "ADVrel"}),
        (lambda x: x["pos2"] == "Du", {"lemma": "quomodo2", "pos2": "ADVint"}),  
        {"lemma": "quomodo_", "pos2": "?"}
    ],
    "quando": [
        (lambda x: x["pos2"] == "Dq", {"lemma": "quando1", "pos2": "ADVrel"}),
        (lambda x: x["pos2"] == "Du", {"lemma": "quando2", "pos2": "ADVint"}),  
        {"lemma": "quomodo_", "pos2": "?"}
    ],
    "quemadmodum": [
        (lambda x: x["pos2"] == "Dq", {"lemma": "quemadmodum1", "pos2": "ADVrel"}),
        (lambda x: x["pos2"] == "Du", {"lemma": "quemadmodum2", "pos2": "ADVint"}),  
        {"lemma": "quemadmodum1", "pos2": "ADVrel"}
    ],
    "quot": [
        (lambda x: x["pos2"] == "Dq", {"lemma": "quot1", "pos2": "ADVrel"}),
        (lambda x: x["pos2"] == "Du", {"lemma": "quot2", "pos2": "ADVint"}),  
        {"lemma": "quot3", "pos2": "PROind"}
    ],
    # Counter({'1': 65, '2': 56})
    "coram": this_or_unk(pos2="R-", newpos="PRE", lemma="coram2"),
    # Counter({'2': 515, '1': 230})
    "siue": this_or_unk(pos2="C-", newpos="CONcoo", lemma="siue1"),
    # Counter({'2': 1526, '1': 17})
    "dum": this_or_unk(pos2="G-", newpos="CONsub", lemma="dum2"),
    # Counter({'2': 914, '1': 414})
    "tantum": this_or_unk(pos2="Df", newpos="ADV", lemma="tantum2"),
    # Counter({'1': 909, '3': 67, '2': 3})
    "simul": this_or_unk(pos2="Df", newpos="ADV", lemma="simul1"),
    # Counter({'1': 299, '2': 252})
    "quare": this_or_unk(pos2="Du", newpos="ADVint", lemma="quare2"),
    # Counter({'2': 1266, '1': 300}))
    "uel": this_or_unk(pos2="C-", newpos="CONcoo", lemma="uel2"),
    "uelut": this_or_unk(pos2="Df", newpos="ADVrel", lemma="uelut1"),
    "multum": [
        (lambda x: x["pos2"] == "Nb", {"lemma": "multum1", "pos2": "NOMcom"}),
        (lambda x: x["pos1"] == "D", {"lemma": "multum2", "pos2": "ADV"}),  
        {"lemma": "multum_", "pos2": "?"}
    ],
    # Counter({'1': 391, '2': 239})
    "qualis": [
        (lambda x: x["pos2"] == "Pr", {"lemma": "qualis1", "pos2": "PROrel"}),
        (lambda x: x["pos2"] == "Pi", {"lemma": "qualis2", "pos2": "PROint"}),
        {"lemma": "qualis_", "pos2": "?"}
    ],
    "secus": [
        (lambda x: x["pos2"] == "Nb", {"lemma": "secus1", "pos2": "NOMcom"}),
        (lambda x: x["pos1"] == "D", {"lemma": "secus2", "pos2": "ADV"}),  
        {"lemma": "secus3", "pos2": "PRE"}
    ],
    "nequedum": [
        (lambda x: x["pos1"] == "D", {"lemma": "necdum2", "pos2": "ADVneg"}),  
        {"lemma": "nequedum_", "pos2": "?"}
    ],
    "secundo": this_or_unk(pos2="Df", newpos="ADJadv.ord", lemma="secundo1"),
}

---

## Conversion Morphologique

In [57]:
def adj_morph(line, code):
    if (code == "21" and line["Deg"] in {"Sup", "Pos"}) or \
        line["Deg"] == "Sup":
        if line["Case"] in {"Nom", "Voc"}:
            pass  # Keep Gend
        elif line["Case"] in {'Acc', "Gen", "Dat", "Abl"} and line['Numb'] == "Sing":
            if line["Gend"] != "Fem":
                line["Gend"] = "MascNeut"
        elif line["Case"] == "Gen" and line['Numb'] == "Plur":
            if line["Gend"] != "Fem":
                line["Gend"] = "MascNeut"
        elif line["Case"] == "Gen" and line['Numb'] == "Plur":
            if line["Gend"] != "Fem":\
                line["Gend"] = "MascNeut"
        elif line["Case"] in {"Dat", "Abl"} and line['Numb'] == "Plur":
                line["Gend"] = "Com"
    elif code == "23": # Parasyllab
        if line["Case"] in {"Nom", "Voc", "Acc"}:
            if line["Gend"] in {"Masc", "Fem"}:
                line["Gend"] = "MascFem"
        else:
            line["Gend"] = "Com"
    elif code == "22": # 
        if line["Case"] in {"Nom", "Voc", "Acc"}:
            if line["Gend"] in {"Masc", "Fem"}:
                line["Gend"] = "MascFem"
        else:
            line["Gend"] = "Com"
    #    elif code == "22": # UETUS
    #        if line["Case"] in {"Nom", "Voc", "Acc"} \
    #            and line["Numb"] == "Plur":
    #            if line["Gend"] in {"Masc", "Fem"}:
    #                line["Gend"] = "MascFem"
    #        else:
    #            line["Gend"] = "Com"
    return line

def part_gend(line):
    if "Part" == line["Mood"]:
        if "Mood=Part|Tense=Fut" in line["morph"] or "Mood=Part|Tense=Perf" in line["morph"]:
            if line["Case"] in {"Nom", "Voc"}:
                pass  # Keep Gend
            elif line["Case"] in {'Acc', "Gen", "Dat", "Abl"} and line['Numb'] == "Sing":
                if line["Gend"] != "Fem":
                    line["Gend"] = "MascNeut"
            elif line["Case"] == "Gen" and line['Numb'] == "Plur":
                if line["Gend"] != "Fem":
                    line["Gend"] = "MascNeut"
            elif line["Case"] == "Gen" and line['Numb'] == "Plur":
                if line["Gend"] != "Fem":\
                    line["Gend"] = "MascNeut"
            elif line["Case"] in {"Dat", "Abl"} and line['Numb'] == "Plur":
                    line["Gend"] = "Com"
        elif "Mood=Part|Tense=Pre" in line["morph"]:
            if line["Case"] in {"Nom", "Voc", "Acc"}:
                if line["Gend"] in {"Masc", "Fem"}:
                    line["Gend"] = "MascFem"
            else:
                line["Gend"] = "Com"
        join_morph(line)
    return line

In [58]:
_Numb = {"s": "Sing", "p": "Plur", "_": "_"}
_Tense = {
    "_": "_",
    "p": "Pre",
    "i": "Imp",
    "r": "Perf",
    #"s": "resultative",
    #"a": "aorist",
    "u": "past",
    "l": "PqP",
    "f": "Fut",
    "t": "FutAnt"
}
_Mood = {
    "_": "_",
    "i": "Ind",
    "s": "Sub",
    "m": "Imp",
    "n": "Inf",
    "p": "Part",
    "d": "Adj",
    "g": "Ger",
    "u": "SupUm"
}
_Voice = {
    "_": "_",
    "a": "Act",
    "m": "Mid",
    "p": "Pass",
    "e": "MidOrPass",
    "x": "UNK",
}
_Gend = {
    "_": "_",
    "m": "Masc",
    "f": "Fem",
    "n": "Neut",
    "p": "MascFem",
    "o": "MascNeut",
    "r": "FemNeut",
    "q": "Com"
}
_Case = {
    "_": "_",
    "n": "Nom",
    "a": "Acc",
    "o": "Obl",
    "g": "Gen",
    "c": "GenDat",
    "e": "AccDat",
    "d": "Dat",
    "b": "Abl",
    "i": "Ins",
    "l": "Loc",
    "v": "Voc",
    "x": "XXX",
    "z": "-",
}
_Deg = {
    "_": "_",
    "p": "positive",
    "c": "comparative",
    "s": "superlative"
}
def convert_morph(line):
    morph = line["morph"].replace("-", "_")
    if "=" in morph:
        return line
    new = {
        "Person": "_",
        "Numb": "_",
        "Tense": "_",
        "Mood": "_",
        "Voice": "_",
        "Gend": "_",
        "Case": "_",
        "Deg": "_"
    }
    try:
        new["Person"] = morph[0]
        new["Numb"] = _Numb[morph[1]]
        new["Tense"] = _Tense[morph[2]]
        new["Mood"] = _Mood[morph[3]]
        new["Voice"] = _Voice[morph[4]]
        new["Gend"] = _Gend[morph[5]]
        new["Case"] = _Case[morph[6]]
        new["Deg"] = _Deg[morph[7]]
    except:
        print(line)
        raise
    line.update(new)
    #if line["POS"] == "ADJqua":
    #    adj_morph(line, )
    if line["pos2"].startswith("NOM"):
        line["Gend"] = "_"
    join_morph(line)
    part_gend(line)
    return line

def join_morph(line):
    line["morph"] = []
    for key in ("Case", "Numb", "Gend", "Deg", "Mood", "Tense", "Voice", "Person"):
        if line[key] != "_":
            line["morph"].append("{}={}".format(key, line[key]))
    if line["morph"]:
        line["morph"] = "|".join(line["morph"])
    elif line["pos2"] == "ADV":
        line["morph"] = "Deg=Pos"
    else:
        line["morph"] = "MORPH=empty"
    

## Autres situations

- met doit être collé au token précédent

In [59]:
collate_but_only_token = ["met"]

## Conversion !

In [60]:
def normalize(line):
    line["token"] = line["token"].replace("U", "V").replace("v", "u")
    line["lemma"] = line["lemma"].replace("U", "V").replace("v", "u").replace("(", "").replace(")", "")

def disambiguate(line):
    cur_lemma = line["lemma"]
    if cur_lemma in reltype:
        changed = False
        for check, newline in reltype[cur_lemma][:-1]:
            if check(line):
                line.update(newline)
                changed = True
                break
        if not changed:
            line.update(reltype[cur_lemma][-1])
        dispatched[cur_lemma][line["lemma"]] += 1
    elif cur_lemma in direct_equiv:
        annot = direct_equiv[cur_lemma]
        if isinstance(annot, tuple):
            line.update({"lemma": annot[0], "pos2": annot[1]})
        else:
            line.update({"lemma": annot})
    elif cur_lemma in straight:
        line.update({"lemma": cur_lemma+straight[cur_lemma]})
    elif cur_lemma in Alignment:
        line.update({"lemma": Alignment[cur_lemma].lemma, "pos2": Alignment[cur_lemma].pos})
    return line


dispatched = defaultdict(Counter)
lemma_not_found = defaultdict(list)
diff_lemma = []
        
need_new_lemma = [
    NewLemma("Iesus", "NOMpro"), # 857
    NewLemma("Petrus", "NOMpro", 12, info=", i, m."), # 154
    NewLemma("propheta", "NOMcom", 11, info=", ae, m."), # 140
    NewLemma("Hierosolyma", "NOMpro", 13, info=", orum, n."), # 139
    NewLemma("amen", "INJ"), # 139
    NewLemma("Pharisaeus", "ADJqua"), # Modele Phalaceus
    NewLemma("quoadusque", "CONsub")
]
analyzed = [o.lemma for o in need_new_lemma] + [o.lemma for o in Alignment.values()]

import lxml.etree as ET 

with open("nt.convert.tsv", "w") as out:
    with open("latin-nt.xml") as f:
        xml = ET.parse(f)
        header = ["token", "lemma", "pos2", "morph"]
        out.write("form\tlemma\tPOS\tmorph\n")
        for sentence in xml.xpath("//sentence"):
            toks = []
            for token in sentence.xpath("./token[@form]"):
                line = {
                    "morph": token.get("morphology"),
                    "pos2": token.get("part-of-speech"),
                    "pos1": token.get("part-of-speech", "--")[1],
                    "lemma": token.get("lemma"),
                    "token": token.get("form"),
                    "reltype": token.get("relation")
                }
                normalize(line)
                disambiguate(line)
                convert_pos(line)
                convert_morph(line)
                
                if line["lemma"] in default_varia_pos and line["pos2"] in default_varia_pos[line["lemma"]]:
                    print("{}({}) -> {}".format(
                        line["lemma"],
                        line["pos2"],
                        default_varia_pos[line["lemma"]][line["pos2"]]
                    ))
                    line.update({"lemma": default_varia_pos[line["lemma"]][line["pos2"]]})
                    diff_lemma.append(line["lemma"])
                elif line["lemma"] not in LASLA and line["lemma"] not in analyzed and line["pos2"] != "OUT":
                    lemma_not_found[line["lemma"]].append(tokens)
                
                
                if line["lemma"] in {"met"}:
                    toks[-1]["token"] += line["token"]
                else:
                    toks.append(line)
                # End of aligning single token
            # Write sentence
            for token in toks:
                out.write("\t".join(token[key] for key in header)+"\n")
            out.write("\n")

liber(NOMcom) -> liber1
magus(NOMcom) -> magus1
turbo(VER) -> turbo2
paruus(ADJqua) -> paruus2
clam(ADV) -> clam1
magus(NOMcom) -> magus1
magus(NOMcom) -> magus1
infra(ADV) -> infra1
magus(NOMcom) -> magus1
pilus(NOMcom) -> pilus2
intra(PRE) -> intra2
modo(ADV) -> modo1
mando(VER) -> mando3
aduersus(PRE) -> aduersus3
salio(VER) -> salio2
ultra(ADV) -> ultra1
quippe(ADV) -> quippe1
paruus(ADJqua) -> paruus2
paruus(ADJqua) -> paruus2
fatuus(ADJqua) -> fatuus2
aduersus(PRE) -> aduersus3
aduersarius(ADJqua) -> aduersarius2
cito(ADV) -> cito2
aduersarius(ADJqua) -> aduersarius2
minister(NOMcom) -> minister1
abundans(ADJqua) -> abundans2
passus(NOMcom) -> passus1
inimicus(NOMcom) -> inimicus1
inimicus(NOMcom) -> inimicus1
publicanus(NOMcom) -> publicanus1
intra(PRE) -> intra2
claudo(VER) -> claudo2
lauo(VER) -> lauo2
furor(VER) -> furor2
furor(VER) -> furor2
sero(VER) -> sero3
meto(VER) -> meto2
cras(ADV) -> cras2
quanto(ADVint) -> quanto2
paruus(ADJqua) -> paruus2
porrigo(VER) -> porrigo2
p

quocumque(ADVrel) -> quocumque1
lauo(VER) -> lauo2
lauo(VER) -> lauo2
senex(ADJqua) -> senex2
forum(NOMcom) -> forum2
lectus(NOMcom) -> lectus2
senex(ADJqua) -> senex2
ultra(ADV) -> ultra1
extra(PRE) -> extra2
intus(ADV) -> intus1
intus(ADV) -> intus1
gentilis(NOMcom) -> gentilis1
genus(NOMcom) -> genus2
lectus(NOMcom) -> lectus2
seorsum(ADV) -> seorsum1
suspicio(VER) -> suspicio2
quanto(ADVrel) -> quanto1
extra(PRE) -> extra2
Philippus(NOMpro) -> Philippus2
senex(ADJqua) -> senex2
palam(ADV) -> palam1
adulter(ADJqua) -> adulter2
seorsum(ADV) -> seorsum1
ubicumque(ADVrel) -> ubicumque1
quamdiu(ADVint) -> quamdiu2
quamdiu(ADVint) -> quamdiu2
accido(VER) -> accido2
genus(NOMcom) -> genus2
minister(NOMcom) -> minister1
quisquis(PROrel) -> quisquis1
cito(ADV) -> cito2
aduersus(PRE) -> aduersus3
quisquis(PROrel) -> quisquis1
quisquis(PROrel) -> quisquis1
asinarius(ADJqua) -> asinarius2
salio(VER) -> salio2
salio(VER) -> salio2
ultra(PRE) -> ultra2
licet(VER) -> licet1
quisque(PROind) -> qui

colonus(NOMcom) -> colonus1
dilectus(ADJqua) -> dilectus2
colonus(NOMcom) -> colonus1
extra(PRE) -> extra2
colonus(NOMcom) -> colonus1
licet(VER) -> licet1
ultra(ADV) -> ultra1
aequalis(ADJqua) -> aequalis2
uiuus(ADJqua) -> uiuus2
liber(NOMcom) -> liber1
inimicus(NOMcom) -> inimicus1
forum(NOMcom) -> forum2
aduersus(PRE) -> aduersus3
contingo(VER) -> contingo2
aduersarius(ADJqua) -> aduersarius2
parens(NOMcom) -> parens1
cognatus(ADJqua) -> cognatus2
amicus(NOMcom) -> amicus1
exercitus(NOMcom) -> exercitus1
captiuus(NOMcom) -> captiuus1
prae(PRE) -> prae2
prae(PRE) -> prae2
leuo(VER) -> leuo2
contentio(NOMcom) -> contentio2
iuuenis(ADJqua) -> iuuenis2
paratus(ADJqua) -> paratus2
satis(ADV) -> satis2
quantum(ADVrel) -> quantum3
prae(PRE) -> prae2
senex(ADJqua) -> senex2
latro(NOMcom) -> latro1
senex(ADJqua) -> senex2
exercitus(NOMcom) -> exercitus1
amicus(NOMcom) -> amicus1
inimicus(NOMcom) -> inimicus1
uber(NOMcom) -> uber1
latro(NOMcom) -> latro1
latro(NOMcom) -> latro1
centurio(NOMco

quapropter(ADVrel) -> quapropter1
Philippus(NOMpro) -> Philippus2
solum(ADV) -> solum2
paratus(ADJqua) -> paratus2
senex(ADJqua) -> senex2
circumcido(VER) -> circumcido1
extra(PRE) -> extra2
claudo(VER) -> claudo2
centurio(NOMcom) -> centurio1
prae(PRE) -> prae2
castra(NOMcom) -> castra2
contingo(VER) -> contingo2
castra(NOMcom) -> castra2
licet(VER) -> licet1
praesto(VER) -> praesto1
decido(VER) -> decido2
prae(PRE) -> prae2
testis(NOMcom) -> testis2
leuo(VER) -> leuo2
castra(NOMcom) -> castra2
centurio(NOMcom) -> centurio1
licet(VER) -> licet1
centurio(NOMcom) -> centurio1
castra(NOMcom) -> castra2
senex(ADJqua) -> senex2
notus(ADJqua) -> notus2
paratus(ADJqua) -> paratus2
castra(NOMcom) -> castra2
centurio(NOMcom) -> centurio1
seorsum(ADV) -> seorsum1
paratus(ADJqua) -> paratus2
notus(ADJqua) -> notus2
centurio(NOMcom) -> centurio1
exercitus(NOMcom) -> exercitus1
castra(NOMcom) -> castra2
lego(VER) -> lego2
senex(ADJqua) -> senex2
adeo(VER) -> adeo1
aduersus(PRE) -> aduersus3
cito(V

liber(NOMcom) -> liber1
aduersus(PRE) -> aduersus3
liber(ADJqua) -> liber2
tutor(NOMcom) -> tutor1
inimicus(NOMcom) -> inimicus1
modo(ADV) -> modo1
muto(VER) -> muto2
lego(VER) -> lego2
liber(ADJqua) -> liber2
liber(ADJqua) -> liber2
liber(ADJqua) -> liber2
liber(ADJqua) -> liber2
liber(ADJqua) -> liber2
circumcido(VER) -> circumcido1
aduersus(PRE) -> aduersus3
aduersus(PRE) -> aduersus3
contentio(NOMcom) -> contentio2
aduersus(PRE) -> aduersus3
erro(VER) -> erro2
meto(VER) -> meto2
meto(VER) -> meto2
domesticus(ADJqua) -> domesticus2
circumcido(VER) -> circumcido1
conspectus(NOMcom) -> conspectus1
notus(ADJqua) -> notus2
solum(ADV) -> solum2
domesticus(ADJqua) -> domesticus2
minister(NOMcom) -> minister1
paruus(ADJqua) -> paruus2
fundo(VER) -> fundo2
captiuus(ADJqua) -> captiuus2
furor(VER) -> furor2
furor(VER) -> furor2
auarus(ADJqua) -> auarus2
particeps(NOMcom) -> particeps1
subiectus(ADJqua) -> subiectus3
subiectus(ADJqua) -> subiectus3
parens(NOMcom) -> parens1
liber(NOMcom) -> l

## Analyse de la conversion !

In [61]:
from pprint import pprint

pprint(dispatched)

defaultdict(<class 'collections.Counter'>,
            {'ad': Counter({'ad2': 781, 'ad1': 157}),
             'ante': Counter({'ante_': 118}),
             'at': Counter({'at2': 124}),
             'atque': Counter({'atque1': 38}),
             'caro': Counter({'caro_': 120}),
             'circa': Counter({'circa_': 36}),
             'continuo': Counter({'continuo_': 27}),
             'contra': Counter({'contra_': 33}),
             'coram': Counter({'coram2': 61}),
             'cum': Counter({'cum2': 1168}),
             'dico': Counter({'dico2': 2079}),
             'dum': Counter({'dum2': 39}),
             'enim': Counter({'enim2': 743}),
             'eo': Counter({'eo1': 129}),
             'ergo': Counter({'ergo2': 397}),
             'et': Counter({'et2': 8489}),
             'hic': Counter({'hic1': 1204}),
             'intro': Counter({'intro_': 102}),
             'iuxta': Counter({'iuxta_': 23}),
             'malus': Counter({'princeps_': 101}),
             'manduco':

In [62]:
print("{} lemmes non alignés sur {}".format(
    len(set(lemma_not_found)), len(set(diff_lemma))
))
c = 0
def is_INFLn(token):
    INFLn = True
    for tok in lemma_not_found[token]:
        if tok["morph"] != "INFLn":
            INFLn = False
            return INFLn
    return INFLn

counter = Counter([tok["lemma"] for tok_list in lemma_not_found.values() for tok in tok_list])
for l, lc in sorted(counter.items(), key=lambda x: x[1], reverse=True):
    lemma = ", ".join(list(set(tok["token"] for tok in lemma_not_found[l])))
    pos = ", ".join(list(set([t["pos2"] for t in lemma_not_found[l]])))
    print("{}\t{}\t{}\t{}".format(l, pos, lc, lemma))
print(c)

31 lemmes non alignés sur 237


KeyError: 'lemma'