## Lemmatization

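CLTK uses a backoff lemmatizer consisting of a chain of different lemmatizers. Each token is tried against the lemmatizers in order, and a token that one lemmatizer cannot resolve is passed on to the next: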

1) a dictionary-based lemmatizer for high-frequency words

2) a training-data-based lemmatizer based on 4,000 sentences from the [Perseus Latin Dependency Treebanks](https://perseusdl.github.io/treebank_data/)

3) a regular-expression-based lemmatizer stripping word affixes 

4) a dictionary-based lemmatizer with the complete set of Morpheus lemmas

5) an ‘identity’ lemmatizer returning the token as the lemma


In [1]:
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer

In [2]:
# Instantiate the Latin backoff lemmatizer with its default configuration.
lemmatizer = BackoffLatinLemmatizer()

In [3]:
from  data.word_tokenized_text import word_tokenized_text as text

In [4]:
# Lemmatize the word-tokenized text; the result is a list of (token, lemma) tuples.
lemmatized = lemmatizer.lemmatize(text)

In [5]:
# Preview the first 30 (token, lemma) pairs.
lemmatized[:30]

[('Conditi', 'Conditi'),
 ('paradoxi', 'paradoxus'),
 ('compositio', 'compositio'),
 ('mellis', 'mel'),
 ('pondo', 'pondo'),
 ('XV', 'XV'),
 ('in', 'in'),
 ('aeneum', 'aeneus'),
 ('vas', 'vas'),
 ('mittuntur', 'mitto'),
 ('praemissis', 'praemitto'),
 ('vini', 'vinum'),
 ('sextariis', 'sextarius'),
 ('duobus', 'duo'),
 ('ut', 'ut'),
 ('in', 'in'),
 ('coctura', 'coquo'),
 ('mellis', 'mel'),
 ('vinum', 'vinum'),
 ('decoquas', 'decoquo'),
 ('quod', 'qui'),
 ('igni', 'ignis'),
 ('lento', 'lentus'),
 ('et', 'et'),
 ('aridis', 'aridus'),
 ('lignis', 'lignum'),
 ('calefactum', 'calefacio'),
 ('commotum', 'commoveo'),
 ('ferula', 'ferula'),
 ('dum', 'dum')]

In [6]:
# Accumulator for per-line results: one list of (token, lemma) tuples per line.
line_lemmatized_text = []

In [7]:
from data.line_tokenized_text import line_tokenized_text

In [8]:
from cltk.stem.latin.j_v import JVReplacer

In [9]:
# Lemmatize the text line by line. Each line is first passed through
# JVReplacer (the output below shows v -> u, e.g. 'XV' -> 'XU'), then split on
# single spaces into tokens. The replacer is built once and reused, since
# constructing it on every iteration is loop-invariant work.
jv_replacer = JVReplacer()
for line in line_tokenized_text:
    tokens = jv_replacer.replace(line).split(" ")
    line_lemmatized_text.append(lemmatizer.lemmatize(tokens))

In [10]:
# Inspect the lemmatized first line. Tokens here are j/v-normalized
# (e.g. 'XU', 'uas', 'uini'), unlike the word-tokenized run above.
# NOTE(review): 'mittuntu' and 'duobu' in the output look like truncated
# tokens ('mittuntur', 'duobus') — confirm against line_tokenized_text.
line_lemmatized_text[0]

[('Conditi', 'Conditi'),
 ('paradoxi', 'paradoxus'),
 ('compositio', 'compositio'),
 ('mellis', 'mel'),
 ('pondo', 'pondo'),
 ('XU', 'XU'),
 ('in', 'in'),
 ('aeneum', 'aeneus'),
 ('uas', 'uas'),
 ('mittuntu', 'mittuntu'),
 ('praemissis', 'praemitto'),
 ('uini', 'uinum'),
 ('sextariis', 'sextarius'),
 ('duobu', 'duobu'),
 ('ut', 'ut'),
 ('in', 'in'),
 ('coctura', 'coquo'),
 ('mellis', 'mel'),
 ('uinum', 'uinum'),
 ('decoquas', 'decoquo')]

In [11]:
from data.ingredient_list import ingredients

In [12]:
# Convert to a set: drops duplicates and gives average constant-time
# membership tests for the `in ingredients` checks that follow.
ingredients = set(ingredients)

In [13]:
# Positions in `lemmatized` whose lemma (second tuple element) is a known
# ingredient.
ingredient_indices = [
    position
    for position, pair in enumerate(lemmatized)
    if pair[1] in ingredients
]

In [14]:
# Preview the first 30 token positions whose lemma is an ingredient.
ingredient_indices[:30]

[3,
 11,
 17,
 18,
 34,
 83,
 88,
 102,
 119,
 122,
 137,
 140,
 147,
 152,
 185,
 206,
 214,
 221,
 234,
 247,
 256,
 280,
 307,
 314,
 322,
 334,
 357,
 360,
 388,
 394]

In [19]:
# Print every ingredient hit as: <position> (<token>, <lemma>).
for position, pair in enumerate(lemmatized):
    if pair[1] not in ingredients:
        continue
    print(position, pair)

3 ('mellis', 'mel')
11 ('vini', 'vinum')
17 ('mellis', 'mel')
18 ('vinum', 'vinum')
34 ('vini', 'vinum')
83 ('vino', 'vinum')
88 ('vini', 'vinum')
102 ('vini', 'vinum')
119 ('piper', 'piper')
122 ('melle', 'mel')
137 ('mellis', 'mel')
140 ('vinum', 'vinum')
147 ('vini', 'vinum')
152 ('mellis', 'mel')
185 ('vini', 'vinum')
206 ('vino', 'vinum')
214 ('vino', 'vinum')
221 ('vino', 'vinum')
234 ('vino', 'vinum')
247 ('vinum', 'vinum')
256 ('melle', 'mel')
280 ('melle', 'mel')
307 ('mel', 'mel')
314 ('oleo', 'oleum')
322 ('lauri', 'laurus')
334 ('sales', 'sal')
357 ('faba', 'faba')
360 ('ovorum', 'ovum')
388 ('lauro', 'laurus')
394 ('liquamen', 'liquamen')
401 ('mellis', 'mel')
419 ('melle', 'mel')
445 ('aceto', 'acetum')
446 ('sale', 'sal')
447 ('melle', 'mel')
464 ('lacte', 'lac')
477 ('aceto', 'acetum')
482 ('aceto', 'acetum')
485 ('aceto', 'acetum')
490 ('ostrea', 'ostreum')
529 ('farinam', 'farina')
533 ('melle', 'mel')
558 ('melle', 'mel')
644 ('mel', 'mel')
646 ('defritum', 'defritum

3728 ('piper', 'piper')
3740 ('lacte', 'lac')
3745 ('echinos', 'echinus')
3764 ('porros', 'porrus')
3767 ('holus', 'holus')
3797 ('piscis', 'piscis')
3800 ('urticas', 'urtica')
3803 ('ostreorum', 'ostreum')
3804 ('caseos', 'caseus')
3810 ('piper', 'piper')
3816 ('piper', 'piper')
3817 ('ligusticum', 'ligusticum')
3818 ('apii', 'apium')
3844 ('echinos', 'echinus')
3846 ('piper', 'piper')
3858 ('piscium', 'piscis')
3880 ('oleo', 'oleum')
3883 ('piper', 'piper')
3884 ('ligusticum', 'ligusticum')
3886 ('liquamen', 'liquamen')
3887 ('vinum', 'vinum')
3956 ('piper', 'piper')
3962 ('ovis', 'ovum')
3986 ('piscium', 'piscis')
4007 ('piper', 'piper')
4008 ('ligusticum', 'ligusticum')
4011 ('liquamen', 'liquamen')
4012 ('vinum', 'vinum')
4014 ('oleum', 'oleum')
4042 ('oleum', 'oleum')
4065 ('piper', 'piper')
4083 ('melle', 'mel')
4085 ('liquamine', 'liquamen')
4086 ('lacte', 'lac')
4087 ('ovis', 'ovum')
4091 ('oleo', 'oleum')
4100 ('oleo', 'oleum')
4103 ('cerebella', 'cerebellum')
4106 ('piscium'

6749 ('cepam', 'caepa')
6751 ('coriandrum', 'coriandrum')
6754 ('liquamen', 'liquamen')
6755 ('vino', 'vinum')
6757 ('liquamine', 'liquamen')
6764 ('oleum', 'oleum')
6769 ('oleum', 'oleum')
6787 ('porri', 'porrus')
6789 ('coriandri', 'coriandrum')
6794 ('piper', 'piper')
6795 ('ligusticum', 'ligusticum')
6805 ('liquamine', 'liquamen')
6810 ('oleum', 'oleum')
6824 ('piper', 'piper')
6825 ('ligusticum', 'ligusticum')
6826 ('anethum', 'anethum')
6827 ('cepam', 'caepa')
6830 ('liquamen', 'liquamen')
6831 ('vino', 'vinum')
6833 ('liquamine', 'liquamen')
6866 ('liquamine', 'liquamen')
6867 ('oleo', 'oleum')
6869 ('vino', 'vinum')
6872 ('cepam', 'caepa')
6873 ('coriandrum', 'coriandrum')
6875 ('cerebella', 'cerebellum')
6889 ('cepam', 'caepa')
6891 ('coriandrum', 'coriandrum')
6906 ('piper', 'piper')
6965 ('cerebella', 'cerebellum')
6970 ('piper', 'piper')
6971 ('ligusticum', 'ligusticum')
6974 ('gingiber', 'gingiber')
6975 ('liquamen', 'liquamen')
6979 ('vino', 'vinum')
6995 ('pullo', 'pullo

8697 ('piper', 'piper')
8708 ('vulvas', 'volva')
8717 ('aceto', 'acetum')
8719 ('liquamine', 'liquamen')
8723 ('vulva', 'volva')
8726 ('piper', 'piper')
8727 ('apii', 'apium')
8729 ('mentam', 'menta')
8732 ('radicem', 'radix')
8733 ('mel', 'mel')
8734 ('acetum', 'acetum')
8736 ('liquamen', 'liquamen')
8741 ('liquamine', 'liquamen')
8747 ('piper', 'piper')
8748 ('liquamine', 'liquamen')
8758 ('piper', 'piper')
8759 ('liquamine', 'liquamen')
8782 ('sale', 'sal')
8793 ('piper', 'piper')
8794 ('ligusticum', 'ligusticum')
8795 ('liquamen', 'liquamen')
8807 ('piper', 'piper')
8808 ('careum', 'careum')
8822 ('piper', 'piper')
8824 ('ligusticum', 'ligusticum')
8825 ('liquamen', 'liquamen')
8826 ('vinum', 'vinum')
8828 ('oleum', 'oleum')
8836 ('liquamine', 'liquamen')
8838 ('piper', 'piper')
8839 ('ligusticum', 'ligusticum')
8840 ('bacas', 'baca')
8841 ('lauri', 'laurus')
8866 ('piper', 'piper')
8867 ('ligusticum', 'ligusticum')
8868 ('anethum', 'anethum')
8871 ('bacam', 'baca')
8872 ('lauri', 

10627 ('apro', 'aper')
10632 ('lauri', 'laurus')
10633 ('aprum', 'aper')
10641 ('sale', 'sal')
10643 ('aceto', 'acetum')
10647 ('apro', 'aper')
10649 ('piper', 'piper')
10650 ('ligusticum', 'ligusticum')
10652 ('bacas', 'baca')
10655 ('coriandrum', 'coriandrum')
10656 ('cepas', 'caepa')
10658 ('mel', 'mel')
10659 ('vinum', 'vinum')
10660 ('liquamen', 'liquamen')
10661 ('oleum', 'oleum')
10666 ('aprum', 'aper')
10680 ('aprum', 'aper')
10686 ('piper', 'piper')
10689 ('apii', 'apium')
10691 ('mentam', 'menta')
10693 ('satureiam', 'satureia')
10701 ('mel', 'mel')
10702 ('vinum', 'vinum')
10703 ('liquamen', 'liquamen')
10705 ('oleum', 'oleum')
10709 ('aprum', 'aper')
10713 ('piper', 'piper')
10714 ('ligusticum', 'ligusticum')
10715 ('apii', 'apium')
10717 ('mentam', 'menta')
10721 ('vinum', 'vinum')
10722 ('acetum', 'acetum')
10723 ('liquamen', 'liquamen')
10725 ('oleum', 'oleum')
10737 ('cepam', 'caepa')
10739 ('rutae', 'ruta')
10740 ('fasciculos', 'fasciculus')
10749 ('ovorum', 'ovum')
10

12965 ('liquamen', 'liquamen')
12966 ('acetum', 'acetum')
12968 ('cepam', 'caepa')
12981 ('piper', 'piper')
12985 ('iecur', 'jecur')
12987 ('liquamen', 'liquamen')
12990 ('olei', 'oleum')
13003 ('haedum', 'haedus')
13009 ('piper', 'piper')
13011 ('satureiam', 'satureia')
13012 ('cepam', 'caepa')
13015 ('liquamine', 'liquamen')
13028 ('cepam', 'caepa')
13029 ('satureiam', 'satureia')
13039 ('vinum', 'vinum')
13040 ('oleum', 'oleum')
13041 ('liquamen', 'liquamen')
13062 ('vino', 'vinum')
13063 ('liquamine', 'liquamen')
13067 ('anetho', 'anethum')
13076 ('piper', 'piper')
13077 ('satureiam', 'satureia')
13078 ('cepae', 'caepa')
13083 ('vinum', 'vinum')
13084 ('liquamen', 'liquamen')
13086 ('olei', 'oleum')
13093 ('lepus', 'lepus')
13111 ('liquamine', 'liquamen')
13113 ('glires', 'glis')
13136 ('cepam', 'caepa')
13141 ('piper', 'piper')
13142 ('ligusticum', 'ligusticum')
13143 ('careum', 'careum')
13145 ('caryotam', 'caryota')
13146 ('mel', 'mel')
13147 ('acetum', 'acetum')
13148 ('vinum',