In [11]:
import os

# Project root inside the current user's Google Drive folder, and the data
# directory that corpora are read from and models are written to further down.
# NOTE(review): both are hard-coded absolute Windows paths, so this cell only
# works on a machine with the same drive layout.
path = 'C:/Users/'+os.getlogin()+'/Google Drive/University/Dissertation'
datapath = 'E:/Dissertation Data'

# Switch the working directory to the project's Code folder.
# Presumably this is what makes the local `generators` module importable — TODO confirm.
os.chdir(path+'/Code')
# Last expression of the cell: displays the resulting working directory.
os.getcwd()

'C:\\Users\\User\\Google Drive\\University\\Dissertation\\Code'

In [12]:
import pickle

import re
import numpy as np
import pandas as pd

from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import MWETokenizer, WhitespaceTokenizer

from glove import Corpus, Glove
from scipy import sparse
from gensim import corpora

from generators import sent_gen, word_gen, Sent_Seq

In [13]:
# Plain-text corpus reader over the single file simple_20200601_v2.txt.
# Words are split on whitespace only; multi-word expressions are merged later,
# per batch, by an MWETokenizer.
simp = PlaintextCorpusReader(datapath+'/Corpora/wiki/simple_20200601/','simple_20200601_v2.txt',
                            word_tokenizer = WhitespaceTokenizer()
                            )

In [14]:
# NOTE(review): neither constant is referenced in the cells visible here.
# Presumably they mirror the settings used when ngram_eval.pkl was built — TODO confirm or remove.
min_freq = 20
eval_count = 150000

In [15]:
# Table of n-grams to evaluate; later cells read its `batch` and `ngram` columns.
# NOTE: read_pickle unpickles the file, so only load pickles you created yourself.
ngram_eval = pd.read_pickle(datapath+'/Corpora/wiki/simple_20200601/ngram_eval.pkl')

In [16]:
# Number of n-grams per batch label. The output also contains the negative
# labels -1 and -2; the training loop only selects labels 1..batch_count,
# so rows with those labels are never processed there.
ngram_eval.batch.value_counts()

 1     45301
 2     33312
 3     24686
 4     16668
 5     10517
 6      7502
 7      4240
 8      2702
 9      1905
 10     1051
-2       832
 11      450
-1       300
 12      247
 13      123
 14      107
 15       57
Name: batch, dtype: int64

In [17]:
# Highest batch label present; the training loop treats labels as 1..batch_count.
batch_count = max(ngram_eval.batch)
# Last expression of the cell: displays the value.
batch_count

15

In [18]:
def invert_dict(dic, shift=0):
    """Return a new dict mapping each value of ``dic`` to its key plus ``shift``.

    Parameters
    ----------
    dic : dict
        Mapping whose keys support ``key + shift`` (e.g. integer ids) and
        whose values are hashable.
    shift : number, optional
        Offset added to every original key before it is stored as a value.

    Returns
    -------
    dict
        ``{value: key + shift}`` for every item of ``dic``. If several keys
        share the same value, the one visited last wins.
    """
    inverted = {}
    for original_key, original_value in dic.items():
        inverted[original_value] = original_key + shift
    return inverted

def removekey(d, key):
    """Return a shallow copy of ``d`` without ``key``; ``d`` itself is not modified.

    Parameters
    ----------
    d : mapping
        Source mapping; it is copied into a plain ``dict`` first.
    key : hashable
        Key to drop from the copy.

    Returns
    -------
    dict
        Copy of ``d`` with ``key`` removed.

    Raises
    ------
    KeyError
        If ``key`` is not present in ``d``.
    """
    trimmed = dict(d)
    # pop() without a default raises KeyError for a missing key, just like `del`.
    trimmed.pop(key)
    return trimmed

In [19]:
# Debug override: uncomment to restrict the training loop to the first batch only.
#batch_count = 1

In [21]:
%%time

# Train and save one GloVe model per n-gram batch.
# For each batch label 1..batch_count the loop:
#   1. selects that batch's n-grams and builds an MWETokenizer from them,
#   2. loads the batch's pickled vocabulary and converts it for glove.Corpus,
#   3. fits a co-occurrence matrix over the corpus restricted to that vocabulary,
#   4. trains a GloVe model on the matrix and saves it to disk.
# NOTE(review): `model`, `batch_dict` and `simp_corp` are overwritten on every
# iteration, so after the loop they hold the values from the last batch that ran;
# the inspection cells further down read them from there.

# Per-batch n-gram subsets, keyed by the zero-based loop index (batch label - 1).
batch_dfs = {}

for bb in range(batch_count):
    print('Processing batch {} of {}'.format(bb+1,batch_count))
    
    # Subset DataFrame: batch labels are 1-based, hence bb+1
    batch_dfs[bb] = ngram_eval[ngram_eval.batch == bb+1].reset_index(drop=True)
    
    # Initialise MWETokenizer: each listed n-gram found in a sentence is merged
    # into a single token whose parts are joined by '+'.
    # Assumes each entry of the `ngram` column is a sequence of tokens — TODO confirm.
    batch_token_mwe = MWETokenizer(list(batch_dfs[bb].ngram) , separator='+')
    
    
    print(' Loading dictionary')
    #  Load pickled dictionary
    #  NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
    with open(datapath+'/Corpora/wiki/simple_20200601/Pickles/simp_b{}_dict.pkl'.format(bb+1), 'rb') as pfile:
        batch_dict = pickle.load(pfile)
        
    #  Swap keys and values while adding -1 to each original key, then drop the
    #  '<unk>' entry. Presumably the pickle maps id -> token with '<unk>' at id 0,
    #  so the remaining ids start at 0 — TODO confirm.
    #  removekey raises KeyError if '<unk>' is absent.
    batch_dict = removekey(invert_dict(batch_dict, shift=-1), '<unk>')
    
    print(' Fitting matrix')
    simp_corp = Corpus(batch_dict)

    # Sent_Seq comes from the local `generators` module; presumably it yields the
    # sentences of `simp` re-tokenised with the MWE tokenizer — TODO confirm.
    sents_mwe = Sent_Seq(simp, batch_token_mwe)
    
    # Ignore missing flag set, as vocab restricted: tokens that are not in
    # batch_dict are skipped rather than raising an error.
    simp_corp.fit( sents_mwe , window = 10, ignore_missing=True)
    
    
    print(' Building GloVe model')
    model = Glove(no_components=300, 
                  alpha = 0.75, 
                  max_count = 100,
                  learning_rate=0.05)
    
    # Train on the co-occurrence matrix; verbose=True prints one line per epoch.
    model.fit(simp_corp.matrix, epochs=25, no_threads=8, verbose=True)
    
    # Attach the token -> id mapping so the model can be queried by word.
    print(' Adding dictionary')
    model.add_dictionary(simp_corp.dictionary)
    
    # Save model: one file per batch, named by the 1-based batch label
    print(' Saving model')
    model.save(datapath+'/Models/2 GloVe/simp_glove_vocab_batch{}.model'.format(bb+1))   



Processing batch 2 of 15
 Loading dictionary
 Fitting matrix
 Building GloVe model
Performing 25 training epochs with 8 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
 Adding dictionary
 Saving model
Processing batch 3 of 15
 Loading dictionary
 Fitting matrix
 Building GloVe model
Performing 25 training epochs with 8 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
 Adding dictionary
 Saving model
Processing batch 4 of 15
 Loading dictionary
 Fitting matrix
 Building GloVe model
Performing 25 training epochs with 8 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoc

In [11]:
# Original COO matrix
# model.most_similar('relationships', number=6)

[('banks', 0.9705410874528305),
 ('Eurovision', 0.9661675535912082),
 ('museums', 0.9645120915366855),
 ('São', 0.9575681776302074),
 ('Bar', 0.9561931570169429)]

In [28]:
# Without 0th row and column
#model.most_similar('relationships', number=6)

[('Chris+Thile', 0.2778770494212326),
 ('was+travelling', 0.23399237828058395),
 ('parliamentary', 0.23374034078911354),
 ('the+Way+of', 0.233284549898124),
 ('Gluttony', 0.22860027521751752)]

In [18]:
# COO matrix self-built, restricted vocab
#model.most_similar('relationships', number=6)

[('proteins', 0.9513887932507497),
 ('spots', 0.9428021658554301),
 ('daughter', 0.9409947349321203),
 ('Wilson', 0.9367265613197017),
 ('services', 0.9348989287751218)]

In [22]:
# COO matrix self-built, restricted vocab, 50 epochs
#model.most_similar('relationships', number=6)

[('macron', 0.9496588178319133),
 ('create', 0.9418371012697576),
 ('stretched', 0.9202446780377442),
 ('practical', 0.9135404567297558),
 ('thinks', 0.9135348377719998)]

In [25]:
# Sanity check of nearest neighbours. Uses the notebook-level `model` left by
# the training loop cell. The output below lists 5 neighbours for number=6.
model.most_similar('relationship', number=6)

[('drugs', 0.9400106111482155),
 ('pool', 0.9391819736329861),
 ('Doctor-patient', 0.9352443219535114),
 ('supporters', 0.9334725339170894),
 ('represent', 0.915237606680276)]

In [26]:
# Look up the id of a single token in the converted (token -> id) vocabulary
# left by the training loop cell.
batch_dict['banana']

257300

In [22]:
# Nearest neighbours of 'large' in the notebook-level `model` left by the training loop cell.
model.most_similar('large', number=6)

[('school', 0.9631078551638277),
 ('area', 0.9620647900788166),
 ('city', 0.9580584573444334),
 ('population', 0.9562267597474459),
 ('number', 0.9552060755612798)]

In [23]:
# Nearest neighbours of 'great' in the notebook-level `model` left by the training loop cell.
model.most_similar('great', number=6)

[('often', 0.9554819809141448),
 ('most', 0.9518745369866642),
 ('special', 0.9518597809125052),
 ('time', 0.9503564894747791),
 ('later', 0.9496059772913714)]