## Lemmatization

CLTK uses a backoff lemmatizer consisting of a chain of different lemmatizers:

1) a dictionary-based lemmatizer covering high-frequency forms

2) a training-data-based lemmatizer based on 4,000 sentences from the [Perseus Latin Dependency Treebanks](https://perseusdl.github.io/treebank_data/)

3) a regular-expression-based lemmatizer stripping word affixes 

4) a dictionary-based lemmatizer with the complete set of Morpheus lemmas

5) an ‘identity’ lemmatizer returning the token as the lemma


In [1]:
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer

In [2]:
# Build the Latin backoff lemmatizer (constructed with no arguments); the
# chain of sub-lemmatizers it falls back through is described above.
lemmatizer = BackoffLatinLemmatizer()

In [3]:
from  data.word_tokenized_text import word_tokenized_text as text

In [4]:
# Lemmatize the word-tokenized text; as the preview below shows, the result
# is a list of (token, lemma) tuples, one per input token.
lemmatized = lemmatizer.lemmatize(text)

In [5]:
# Preview the first 30 (token, lemma) pairs.
lemmatized[:30]

[('Conditi', 'Conditi'),
 ('paradoxi', 'paradoxus'),
 ('compositio', 'compositio'),
 ('mellis', 'mel'),
 ('pondo', 'pondo'),
 ('XV', 'XV'),
 ('in', 'in'),
 ('aeneum', 'aeneus'),
 ('vas', 'vas'),
 ('mittuntur', 'mitto'),
 ('praemissis', 'praemitto'),
 ('vini', 'vinum'),
 ('sextariis', 'sextarius'),
 ('duobus', 'duo'),
 ('ut', 'ut'),
 ('in', 'in'),
 ('coctura', 'coquo'),
 ('mellis', 'mel'),
 ('vinum', 'vinum'),
 ('decoquas', 'decoquo'),
 ('quod', 'qui'),
 ('igni', 'ignis'),
 ('lento', 'lentus'),
 ('et', 'et'),
 ('aridis', 'aridus'),
 ('lignis', 'lignum'),
 ('calefactum', 'calefacio'),
 ('commotum', 'commoveo'),
 ('ferula', 'ferula'),
 ('dum', 'dum')]

In [6]:
# Accumulator for the per-line results: the loop further down appends one
# lemmatized token list for each entry of line_tokenized_text.
line_lemmatized_text = []

In [7]:
from data.line_tokenized_text import line_tokenized_text

In [8]:
from cltk.stem.latin.j_v import JVReplacer

In [9]:
# Create the j/v replacer once instead of once per line: it does not depend
# on the loop variable, so rebuilding it on every iteration is wasted work.
jv_replacer = JVReplacer()

# Lemmatize the text line by line, appending one list of (token, lemma)
# tuples per line to line_lemmatized_text.
for line in line_tokenized_text:
    # The replacer rewrites v as u (the output shows 'XV' becoming 'XU' and
    # 'vini' becoming 'uini'), so Roman numerals are affected as well.
    # Tokens are obtained by splitting on single spaces only.
    # NOTE(review): the first line's output contains 'mittuntu' and 'duobu'
    # where the word-tokenized text has 'mittuntur' and 'duobus' -- confirm
    # whether the line-tokenized input data is truncating characters.
    tokens = jv_replacer.replace(line).split(" ")
    _line = lemmatizer.lemmatize(tokens)
    line_lemmatized_text.append(_line)

In [10]:
# Inspect the (token, lemma) pairs produced for the first line.
line_lemmatized_text[0]

[('Conditi', 'Conditi'),
 ('paradoxi', 'paradoxus'),
 ('compositio', 'compositio'),
 ('mellis', 'mel'),
 ('pondo', 'pondo'),
 ('XU', 'XU'),
 ('in', 'in'),
 ('aeneum', 'aeneus'),
 ('uas', 'uas'),
 ('mittuntu', 'mittuntu'),
 ('praemissis', 'praemitto'),
 ('uini', 'uinum'),
 ('sextariis', 'sextarius'),
 ('duobu', 'duobu'),
 ('ut', 'ut'),
 ('in', 'in'),
 ('coctura', 'coquo'),
 ('mellis', 'mel'),
 ('uinum', 'uinum'),
 ('decoquas', 'decoquo')]

In [11]:
from data.ingredient_list import ingredients

In [12]:
# Convert to a set: duplicates are dropped and the repeated `in` membership
# tests in the loop below become hash lookups instead of linear scans.
ingredients = set(ingredients)

In [13]:
# Positions in `lemmatized` whose lemma is a known ingredient. Each entry of
# `lemmatized` is a (token, lemma) pair; only the lemma is compared, so any
# inflected form of an ingredient is matched through its dictionary form.
ingredient_indices = [
    index
    for index, (_token, lemma) in enumerate(lemmatized)
    if lemma in ingredients
]

In [14]:
# Preview the first 30 token positions at which an ingredient lemma occurs.
ingredient_indices[:30]

[3,
 11,
 17,
 18,
 34,
 83,
 88,
 102,
 119,
 122,
 137,
 140,
 147,
 152,
 185,
 206,
 214,
 221,
 234,
 247,
 256,
 280,
 307,
 314,
 322,
 334,
 357,
 360,
 388,
 394]