In [1]:
import os

# Project root sits under the logged-in user's Google Drive folder; data is read from a separate E: drive location.
user_name = os.getlogin()
path = ''.join(['C:/Users/', user_name, '/Google Drive/University/Dissertation'])
datapath = 'E:/Dissertation Data'

# Switch the working directory to the project's Code folder and echo it as the cell output.
code_dir = path + '/Code'
os.chdir(code_dir)
os.getcwd()

'C:\\Users\\User\\Google Drive\\University\\Dissertation\\Code'

In [2]:
import pickle

import re
import numpy as np
import pandas as pd

from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import MWETokenizer, WhitespaceTokenizer

from glove import Corpus, Glove
from scipy import sparse
from gensim import corpora

from generators import sent_gen, word_gen, Sent_Seq

In [3]:
# Wrap enwiki_20200520_10pc.txt in an NLTK plaintext corpus reader whose words are split on whitespace only.
wiki_root = datapath + '/Corpora/wiki/enwiki_20200520/'
wiki_file = 'enwiki_20200520_10pc.txt'
w10p = PlaintextCorpusReader(wiki_root, wiki_file, word_tokenizer=WhitespaceTokenizer())

In [4]:
# Neither value is referenced by the visible cells; presumably the thresholds used when the
# n-gram evaluation table was built upstream - TODO confirm.
min_freq, eval_count = 20, 500000

In [5]:

# Load the pickled n-gram evaluation table (read_pickle unpickles the file: only use trusted files).
ngram_eval_path = datapath + '/Corpora/wiki/enwiki_20200520/10pc_ngram_eval.pkl'
ngram_eval = pd.read_pickle(ngram_eval_path)

In [6]:
# Number of rows per batch label; the output lists labels 1..10 plus the negative labels -1 and -2.
ngram_eval['batch'].value_counts()

 1     94011
 2     79538
 3     69221
 4     56154
 5     48934
 6     38288
 7     30374
-1     29076
 8     22495
 9     19272
 10    11300
-2      1337
Name: batch, dtype: int64

In [16]:
# Highest batch label in the table (the negative labels can never be the maximum).
# The training loop uses this as the number of batches to process.
batch_count = ngram_eval['batch'].max()
batch_count

10

In [8]:
def invert_dict(dic, shift=0):
    """Return a new dict mapping each value of `dic` to its key plus `shift`.

    The values of `dic` become keys, so they must be hashable; the keys of
    `dic` must support `+ shift`. When a value occurs more than once, the
    key met last in iteration order wins.
    """
    inverted = {}
    for original_key, original_value in dic.items():
        inverted[original_value] = original_key + shift
    return inverted

def removekey(d, key):
    """Return a shallow copy of `d` without `key`; `d` itself is not modified.

    Raises KeyError if `key` is not present in `d`.
    """
    remaining = dict(d)
    remaining.pop(key)  # no default given, so a missing key still raises KeyError
    return remaining

In [9]:
# Debug override: uncomment to restrict the training loop to the first batch only.
#batch_count = 1

In [17]:
%%time

batch_dfs = {}

for bb in range(batch_count):
    print('Processing batch {} of {}'.format(bb+1,batch_count))
    
    # Subset DataFrame
    batch_dfs[bb] = ngram_eval[ngram_eval.batch == bb+1].reset_index(drop=True)
    
    # Initialise MWETokenizer
    batch_token_mwe = MWETokenizer(list(batch_dfs[bb].ngram) , separator='+')
    
    #  Load pickled vocab set
    with open(datapath+'/Corpora/wiki/enwiki_20200520/10pc_b{}vocab.pkl'.format(bb+1), 'rb') as pfile:
        batch_vocab = pickle.load(pfile)
    
    print(' Loading dictionary')
    #  Load pickled dictionary
    with open(datapath+'/Corpora/wiki/enwiki_20200520/Pickles/10pc_b{}_dict.pkl'.format(bb+1), 'rb') as pfile:
        batch_dict = pickle.load(pfile)
        
    #  Invert, remove '<unk>', shift indices by -1
    batch_dict = removekey(invert_dict(batch_dict, shift=-1), '<unk>')
    
    print(' Fitting matrix')
    w10p_corp = Corpus(batch_dict)

    sents_mwe = Sent_Seq(w10p, batch_token_mwe, vocab=batch_vocab)
    
    # Ignore missing flag set, as vocab restricted
    w10p_corp.fit( sents_mwe , window = 10, ignore_missing=True)
    
    
    print(' Building GloVe model')
    model = Glove(no_components=300, 
                  alpha = 0.75, 
                  max_count = 100,
                  learning_rate=0.05)
    
    #model.fit(simp_corp.matrix, epochs=25, no_threads=8, verbose=True)
    model.fit(w10p_corp.matrix, epochs=25, no_threads=8, verbose=True)
    
    print(' Adding dictionary')
    model.add_dictionary(w10p_corp.dictionary)
    
    # Save model
    print(' Saving model')
    model.save(datapath+'/Models/2 GloVe/w10p_glove_vocab_batch{}.model'.format(bb+1))   



Processing batch 2 of 10
 Loading dictionary
 Fitting matrix
 Building GloVe model
Performing 25 training epochs with 8 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
 Adding dictionary
 Saving model
Processing batch 3 of 10
 Loading dictionary
 Fitting matrix
 Building GloVe model
Performing 25 training epochs with 8 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
 Adding dictionary
 Saving model
Processing batch 4 of 10
 Loading dictionary
 Fitting matrix
 Building GloVe model
Performing 25 training epochs with 8 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoc

In [12]:
# Manual repeat of the training loop's last two steps for the model currently held in `model`.
# The batch number is taken from the loop variable `bb` instead of a hard-coded 1: when this cell
# runs after a multi-batch loop, `model` is the last batch's model, and a fixed "batch1" file name
# would overwrite the first batch's saved model with it. With a single batch, bb+1 is still 1.
model.add_dictionary(w10p_corp.dictionary)

# Save model
print(' Saving model')
model.save(datapath+'/Models/2 GloVe/w10p_glove_vocab_batch{}.model'.format(bb+1))


 Saving model


In [11]:
# Original COO matrix
# model.most_similar('relationships', number=6)

[('banks', 0.9705410874528305),
 ('Eurovision', 0.9661675535912082),
 ('museums', 0.9645120915366855),
 ('São', 0.9575681776302074),
 ('Bar', 0.9561931570169429)]

In [28]:
# Without 0th row and column
#model.most_similar('relationships', number=6)

[('Chris+Thile', 0.2778770494212326),
 ('was+travelling', 0.23399237828058395),
 ('parliamentary', 0.23374034078911354),
 ('the+Way+of', 0.233284549898124),
 ('Gluttony', 0.22860027521751752)]

In [18]:
# COO matrix self-built, restricted vocab
#model.most_similar('relationships', number=6)

[('proteins', 0.9513887932507497),
 ('spots', 0.9428021658554301),
 ('daughter', 0.9409947349321203),
 ('Wilson', 0.9367265613197017),
 ('services', 0.9348989287751218)]

In [22]:
# COO matrix self-built, restricted vocab, 50 epochs
#model.most_similar('relationships', number=6)

[('macron', 0.9496588178319133),
 ('create', 0.9418371012697576),
 ('stretched', 0.9202446780377442),
 ('practical', 0.9135404567297558),
 ('thinks', 0.9135348377719998)]

In [13]:
# On w10p
# Nearest neighbours of 'relationships' in the model currently bound to `model`;
# number=6 yields the five (token, similarity) pairs shown in the output.
model.most_similar('relationships', number=6)

[('hosted', 0.9178624078243787),
 ('feature+film', 0.9014592357080624),
 ('volume', 0.900332952816841),
 ('is+named', 0.8998926599437359),
 ('levels+of', 0.896896701434123)]

In [18]:
# Nearest neighbours of 'large' in the model currently bound to `model`;
# number=6 yields the five (token, similarity) pairs shown in the output.
model.most_similar('large', number=6)

[('small', 0.9648814957316012),
 ('structure', 0.9326739554956462),
 ('distinct', 0.9306723332790285),
 ('complex', 0.9303893377218648),
 ('used', 0.9270431667358916)]

In [19]:
# Nearest neighbours of 'great' in the model currently bound to `model`;
# number=6 yields the five (token, similarity) pairs shown in the output.
model.most_similar('great', number=6)

[('situation', 0.9312773860232666),
 ('much', 0.9310067969362218),
 ('resulting', 0.9255208933924305),
 ('own', 0.9235949948033259),
 ('real', 0.9225923293095144)]