# In[ ]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import os
import pickle
import nltk
from nltk.corpus import stopwords
import time
import os.path
from gensim.test.utils import get_tmpfile

# Wall-clock reference taken at script start.
start = time.time()

# Banner: title, underline, then one empty line.
print("Word Embedding Checker", "----------------------", "", sep="\n")

print("* Building index of documents...")

# Directory that holds the full-text case files
path = "../inputdata/full_texts_all_cases/"

# Stop words: the pickled project list first, followed by NLTK's English list
stopwordsfile = "../script_resources/stopwords.pickle"
with open(stopwordsfile, "rb") as stopwords_handle:
    stopwords_full = list(pickle.load(stopwords_handle))
    stopwords_full += stopwords.words('english')

# Drop duplicate entries (the resulting order is whatever the set yields)
stopwords_full = list(set(stopwords_full))

#print(stopwords_full)

# Only keep celex number from filename
def cleanfilename(name):
    """Return *name* with every "full_text_" and ".txt" substring removed.

    Both fragments are removed wherever they occur in the string, not only
    at the start or end.
    """
    for fragment in ("full_text_", ".txt"):
        name = name.replace(fragment, "")
    return name

def removeStopWords(text, stopwords_list):
    """Lower-case *text*, blank out stop words and delete "+" characters.

    For each entry of *stopwords_list* (lower-cased), every occurrence that
    is preceded by a space and followed by a space, comma, full stop or
    semicolon is replaced by a single space; the trailing punctuation is
    consumed together with the word. The entries are processed in list
    order, and the four trailing characters in the order given below.
    Finally all "+" characters are removed.
    """
    cleaned = text.lower()
    for stopword in stopwords_list:
        padded = " " + stopword.lower()
        # Order of the trailing characters matters: an earlier replacement
        # can expose a match for a later one.
        for trailer in (" ", ",", ".", ";"):
            cleaned = cleaned.replace(padded + trailer, " ")
    return cleaned.replace("+", "")

# Import files and define mapping between case IDs and full texts.
#
# Results left at module level:
#   files            - full path of every matching file found under `path`
#   index_to_celex   - running document index -> file name
#   datafortraining  - stop-word-filtered text of each document, same order
#   documents        - one TaggedDocument per text, tagged with its index
files = []
index_to_celex = {}
datafortraining = []
index = 0
for r, d, f in os.walk(path):
    for file in f:
        if '.txt' in file:
            # os.walk also descends into subdirectories, so the file has to
            # be opened relative to the directory currently being walked (r),
            # not relative to the top-level `path`.
            filepath = os.path.join(r, file)
            files.append(filepath)
            celexnum = cleanfilename(os.path.basename(file))
            with open(filepath, "r", encoding="utf-8") as myfile:
                # NOTE(review): newlines are dropped without inserting a
                # space, so the last word of a line is glued to the first
                # word of the next one - confirm this is intended.
                data = myfile.read().replace('\n', '')
                data = removeStopWords(data, stopwords_full)
                datafortraining.append(data)
                # NOTE(review): the mapping stores the file name, not the
                # cleaned `celexnum` computed above; left unchanged because
                # code outside this section may rely on it - confirm.
                index_to_celex[index] = file
                index += 1

# TaggedDocument expects a list of tokens. Handing it the raw string makes
# every single character a "word", so each text is split on whitespace first.
documents = [TaggedDocument(text.split(), [i]) for i, text in enumerate(datafortraining)]

print(" Index successfully built!")
print()


# ### Step 2. Train Doc2Vec model on input case texts

# In[ ]:

print("* Loading / training Doc2Vec models...")


def _load_or_train_doc2vec(vector_size, window, model_filename):
    """Load a cached Doc2Vec model, or train one on `documents` and cache it.

    The model file is looked up in the "script_resources" directory next to
    the parent of the current working directory. If it exists it is loaded;
    otherwise a model is trained on the module-level `documents` and saved
    to that location.

    Args:
        vector_size: Dimensionality passed to Doc2Vec.
        window: Context window size passed to Doc2Vec.
        model_filename: File name of the cached model inside script_resources.

    Returns:
        A ``(model, model_path)`` tuple.
    """
    # One path is used for the existence check, the load and the save.
    # The original wrapped this path in gensim's get_tmpfile(), which only
    # prefixes the temp directory - a no-op for an absolute path like this.
    model_path = os.path.join(os.path.realpath('..'), "script_resources", model_filename)

    if os.path.exists(model_path):
        print(" loading model from file...")
        model = Doc2Vec.load(model_path)
        print(" successfully loaded model!")
    else:
        print(" training model...")
        model = Doc2Vec(documents, vector_size=vector_size, window=window, min_count=1, workers=4)
        # NOTE(review): per the gensim docs a constructor that is given a
        # corpus already trains the model, so this is an additional pass.
        # Kept as in the original so retrained models stay comparable -
        # confirm whether the extra 5 epochs are intended.
        model.train(documents, total_examples=model.corpus_count, epochs=5)
        print(" successfully trained model!")
        print(" saving model to file...")
        model.save(model_path)
        print(" successfully saved model!")
    return model, model_path


print()
print("* Window size: 5")
print()
print("* Vector size 64")
model_64, fname_64 = _load_or_train_doc2vec(64, 5, "doc2vec_64_e5.model")
print()

print("* Vector size 128")
model_128, fname_128 = _load_or_train_doc2vec(128, 5, "doc2vec_128_e5.model")
print()

print("* Vector size 256")
model_256, fname_256 = _load_or_train_doc2vec(256, 5, "doc2vec_256_e5.model")
print()


print()
print("* Window size: 10")
print()
print("* Vector size 64")
# The original load branch assigned to fname_64_10_10 / model_64_10_10 and
# then loaded from fname_64_10 (still None), so a cached model could never
# be loaded and model_64_10 stayed None.
model_64_10, fname_64_10 = _load_or_train_doc2vec(64, 10, "doc2vec_64_10_e5.model")
print()

print("* Vector size 128")
model_128_10, fname_128_10 = _load_or_train_doc2vec(128, 10, "doc2vec_128_10_e5.model")
print()

print("* Vector size 256")
model_256_10, fname_256_10 = _load_or_train_doc2vec(256, 10, "doc2vec_256_10_e5.model")
print()