In [None]:
# This notebook creates a word2vec model using the Bamman 2012 corpus lemmatized with the CLTK BackoffLatinLemmatizer

In [None]:
# Imports

import html
import string
import re
import os
import time
import multiprocessing

import collections

import gensim
from gensim.models import Word2Vec, FastText

from cltk.stem.latin.j_v import JVReplacer
from cltk.tokenize.sentence import TokenizeSentence
from nltk.tokenize import PunktSentenceTokenizer
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer

from matplotlib import pyplot

from pprint import pprint
import pickle

from nltk import word_tokenize

In [None]:
# Set up NLP tools
# These objects are created once at module level and reused by the cells below.

# Normalizes u/v and i/j spelling variants; called from preprocess().
replacer = JVReplacer()
# NLTK Punkt sentence tokenizer. NOTE(review): no cell visible here uses this
# global -- MySentences.__iter__ builds its own TokenizeSentence('latin') under
# the same local name. Confirm whether it is still needed.
tokenizer = PunktSentenceTokenizer()
# CLTK backoff lemmatizer for Latin; used by MySentences to lemmatize each sentence.
lemmatizer = BackoffLatinLemmatizer()

In [None]:
# Preprocess texts

# Characters that preprocess() replaces with a space: ASCII punctuation,
# guillemets and decimal digits. The backslash is escaped explicitly ("\\]");
# the bare "\]" form is an invalid escape sequence that newer Python versions
# warn about.
_PUNCTUATION = "\"#$%&'()*+,-/:;<=>@[\\]^_`{|}~.?!«»"
_DIGITS = "0123456789"
# Built once, not on every call: preprocess() runs for every sentence in the corpus.
_BLANKING_TABLE = str.maketrans({char: " " for char in _PUNCTUATION + _DIGITS})


def preprocess(text):
    """Normalize a raw text string for tokenization.

    Steps, in order: lowercase the text, normalize u/v and i/j through the
    module-level ``replacer``, then replace every punctuation character and
    digit with a single space. Each removed character becomes one space, so
    runs of spaces are not collapsed; callers are expected to ``split()``.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    text = text.lower()
    text = replacer.replace(text)  # Normalize u/v & i/j
    # One translation pass covers both punctuation and digits.
    return text.translate(_BLANKING_TABLE)

In [None]:
# Helper iterable that reads, tokenizes and lemmatizes the corpus one file at a
# time, so the whole corpus never has to be held in memory.
class MySentences(object):
    """Restartable iterable over the lemmatized sentences of a directory of texts.

    Each iteration walks the files directly inside ``dirname`` in sorted name
    order and yields one list of lemmas per sentence. Because ``__iter__``
    starts from scratch every time, the object can be iterated repeatedly.

    Args:
        dirname: Path of the directory holding the UTF-8 text files.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        """Yield one list of lemmas per non-empty sentence."""
        tokenizer = TokenizeSentence('latin')
        # os.listdir returns names in arbitrary order; sort so every pass (and
        # every run) sees the corpus in the same order.
        for fname in sorted(os.listdir(self.dirname)):
            path = os.path.join(self.dirname, fname)
            # Skip subdirectories (e.g. .ipynb_checkpoints), which cannot be opened as text.
            if not os.path.isfile(path):
                continue
            with open(path, encoding='utf-8') as file:
                # Join lines with a space so the last word of one line is not
                # fused with the first word of the next.
                raw_text = file.read().replace('\n', ' ')
            for sent in tokenizer.tokenize_sentences(raw_text):
                tokens = preprocess(sent).split()
                # A sentence made only of punctuation/digits has nothing to lemmatize.
                if not tokens:
                    continue
                # Keep the second element of each item returned by the lemmatizer
                # (the lemma, in CLTK's (token, lemma) pairs).
                yield [pair[1] for pair in lemmatizer.lemmatize(tokens)]

In [None]:
%%time

# Build Latin word2vec on Bamman data

cores = multiprocessing.cpu_count()
# Leave one core free, but never request fewer than one worker thread
# (cores - 1 would be 0 on a single-core machine).
workers = max(1, cores - 1)

# Keyword names (size, iter) follow the gensim 3.x API this notebook was written for.
latin_w2v_model = Word2Vec(
    MySentences("../models/data/latin_txt"),
    size=300,        # dimensionality of the word vectors
    min_count=100,   # ignore lemmas whose total frequency is below this
    workers=workers,
    iter=1,          # number of passes over the corpus
)

In [None]:
# Persist the trained model to disk. The file name appears to encode the
# training settings (lemmatized corpus, size 300, min_count 100, 1 pass).
model_path = "../models/latin_w2v_bamman_lemma300_100_1"
latin_w2v_model.save(model_path)