In [None]:
# This notebook creates a word2vec model using the Bamman 2012 corpus lemmatized with TreeTagger

In [None]:
# Imports

import os
import time
import multiprocessing

import gensim
from gensim.models import Word2Vec

from cltk.stem.latin.j_v import JVReplacer
from cltk.tokenize.sentence import TokenizeSentence
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer

from tqdm.notebook import tqdm

In [None]:
# Set up NLP tools

# u/v and i/j normalizer; called from preprocess().
replacer = JVReplacer()
# NOTE(review): `lemmatizer` is not referenced anywhere else in the visible
# notebook -- lemmatization is done by lemmatize() through TreeTagger instead.
lemmatizer = BackoffLatinLemmatizer()
# Latin sentence splitter; called from MySentences.__iter__().
tokenizer = TokenizeSentence('latin')

In [None]:
%%capture

# Specific imports

import treetaggerwrapper

# Create Latin tagger
# NB: TreeTagger must be installed for this to work properly;
# cf. https://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/ and https://treetaggerwrapper.readthedocs.io/en/latest/
# Using the Latin parameter file at https://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/latin.par.gz

# Shared Latin tagger instance used by lemmatize(), which reads the third
# tab-separated field of every output line as the lemma.
TT = treetaggerwrapper.TreeTagger(
    TAGLANG='la',
    TAGOPT='-token -lemma -sgml -quiet',
)

def lemmatize(text):
    """Lemmatize ``text`` with the module-level TreeTagger instance ``TT``.

    Args:
        text: String handed to ``TT.tag_text``.

    Returns:
        A list with one entry per line returned by the tagger. For a line
        containing a tab, the entry is its third tab-separated field with
        every ``'-a'`` substring removed (presumably a suffix added by the
        Latin parameter file -- TODO confirm). For a line without any tab,
        the entry is the placeholder ``'<unknown>'``.

    Raises:
        IndexError: If a line contains a tab but has fewer than three fields.
    """
    return [
        line.split('\t')[2].replace('-a', '') if '\t' in line else '<unknown>'
        for line in TT.tag_text(text)
    ]

In [None]:
# Preprocess texts

# Characters that preprocess() blanks out: ASCII punctuation, guillemets and
# digits. The backslash is spelled as an explicit "\\" escape; the previous
# "\]" was an invalid escape sequence that only worked by accident and
# triggers a warning on current Python versions.
_PREPROCESS_STRIP_CHARS = "\"#$%&'()*+,-/:;<=>@[\\]^_`{|}~.?!«»" + "0123456789"

# Built once at definition time instead of on every call: preprocess() runs
# once per sentence of the corpus.
_PREPROCESS_TABLE = str.maketrans({char: " " for char in _PREPROCESS_STRIP_CHARS})


def preprocess(text):
    """Normalize a raw sentence before it is lemmatized.

    Steps, in order:
      1. lowercase the text;
      2. normalize u/v and i/j with the module-level ``replacer``;
      3. replace every punctuation character and digit with a single space
         (characters are replaced one-for-one, so runs of them become runs
         of spaces rather than being collapsed).

    Args:
        text: The sentence to clean.

    Returns:
        The cleaned string.
    """
    text = replacer.replace(text.lower())  # Normalize u/v & i/j
    return text.translate(_PREPROCESS_TABLE)

In [None]:
# This step requires the Bamman corpus (latin_txt.tar.gz) to be downloaded, placed in the folder ../models/data/,
# and uncompressed, i.e. there should be a folder of files named ../models/data/latin_txt. The Bamman corpus can be
# downloaded from https://docs.google.com/uc?id=0B5pGKi0iCsnbZEdHZ3N6d216am8&export=download; see more at:
# http://www.cs.cmu.edu/~dbamman/latin.html

In [None]:
# Helper iterable that reads and processes the raw corpus one file at a time, which avoids loading the whole corpus into memory.

class MySentences(object):
    """Re-iterable stream of lemmatized sentences from a directory of texts.

    Each call to ``iter()`` walks the directory again and processes one file
    at a time, so only a single file is held in memory. Every pass re-runs
    sentence splitting, ``preprocess`` and ``lemmatize``; a consumer that
    iterates more than once (gensim's Word2Vec is documented to need a
    restartable iterable) pays that cost on each pass.

    Args:
        dirname: Path of the directory containing the UTF-8 text files.

    Yields:
        One list of lemmas per sentence.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # order of the corpus reproducible from run to run.
        fnames = sorted(os.listdir(self.dirname))
        for fname in tqdm(fnames):
            path = os.path.join(self.dirname, fname)
            # Skip sub-directories and other non-regular entries, which
            # would otherwise make open() fail.
            if not os.path.isfile(path):
                continue
            with open(path, encoding='utf-8') as file:
                # Join lines with a space, not an empty string: dropping the
                # newline outright fuses the last word of a line with the
                # first word of the next one.
                raw_text = file.read().replace('\n', ' ')
            # The file is already closed here, so no handle stays open while
            # the generator is suspended between sentences.
            for sent in tokenizer.tokenize_sentences(raw_text):
                yield lemmatize(preprocess(sent))

In [None]:
%%time

# Build Latin word2vec on Bamman data

cores = multiprocessing.cpu_count()

# Leave one core free, but never ask for fewer than one worker: on a
# single-core machine `cores - 1` is 0, which would request zero training
# threads from gensim.
n_workers = max(1, cores - 1)

# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
# renamed them to `vector_size` and `epochs` -- adjust if the environment is
# upgraded.
latin_w2v_model = Word2Vec(
    MySentences("../models/data/latin_txt"),
    size=50,
    min_count=100,
    workers=n_workers,
    iter=1,
)

In [None]:
# Write the trained model to ../models/latin_w2v_bamman_lemma_tt.
latin_w2v_model.save("../models/latin_w2v_bamman_lemma_tt")