In [1]:
import os

# Per-user project root on Google Drive, and the separate drive holding the data
user_home = 'C:/Users/'+os.getlogin()
path = user_home+'/Google Drive/University/Dissertation'
datapath = 'E:/Dissertation Data'

# Work from the project's Code folder and echo the resulting working directory
code_dir = path+'/Code'
os.chdir(code_dir)
os.getcwd()

'C:\\Users\\User\\Google Drive\\University\\Dissertation\\Code'

In [2]:
import pandas as pd
import re
import numpy as np

import pickle
import nltk
from nltk.tokenize import MWETokenizer

from glove import Corpus, Glove
from gensim import corpora

In [3]:
# Read back the pickled n-gram evaluation table produced by an earlier step

ngram_eval_path = datapath+'/Corpora/wiki/enwiki_20200520/10pc_ngram_eval.pkl'
ngram_eval = pd.read_pickle(ngram_eval_path)

In [4]:
# Unpickle the stopword collection that mwe_score() consults as `stop`
stop_path = datapath+'/Corpora/wiki/enwiki_20200520/10pc_stop.pkl'
with open(stop_path, 'rb') as stop_file:
    stop = pickle.load(stop_file)

In [5]:
# Display the loaded stopword collection for inspection
stop

{'0',
 '1',
 '2',
 '3',
 'A',
 'He',
 'In',
 'It',
 'New',
 'The',
 'a',
 'also',
 'an',
 'and',
 'are',
 'as',
 'at',
 'be',
 'been',
 'but',
 'by',
 'first',
 'for',
 'from',
 'had',
 'has',
 'have',
 'he',
 'her',
 'his',
 'in',
 'is',
 'it',
 'its',
 'not',
 'of',
 'on',
 'one',
 'or',
 'that',
 'the',
 'their',
 'this',
 'to',
 'two',
 'was',
 'were',
 'which',
 'who',
 'with'}

In [6]:
# NOTE(review): presumably a minimum n-gram frequency cut-off — it is not
# referenced by any other cell visible in this notebook; confirm it is still needed.
min_freq = 20

In [7]:
# Display the n-gram table (columns: ngram, poisson, len, batch)
ngram_eval

Unnamed: 0,ngram,poisson,len,batch
0,"(of, the)",3.874652e+06,2,-2
1,"(References, External, links)",2.566994e+06,3,1
2,"(External, links)",2.229096e+06,2,2
3,"(in, the)",2.094387e+06,2,-2
4,"(0, 0, 0)",1.798530e+06,3,-2
...,...,...,...,...
499995,"(Work, started, on)",3.883693e+02,3,8
499996,"(of, these, techniques)",3.883679e+02,3,8
499997,"(Stadio, Flaminio)",3.883678e+02,2,1
499998,"(IRE, 3, C)",3.883678e+02,3,1


In [8]:
# Highest batch number in the table. The batch loop below iterates
# range(batch_count) and selects rows with batch == bb+1, so this doubles as the
# number of positive batches; rows with batch < 0 are handled separately at export.
batch_count = max(ngram_eval.batch)

In [9]:
# Number of n-grams assigned to each batch value
ngram_eval.batch.value_counts()

 1     94011
 2     79538
 3     69221
 4     56154
 5     48934
 6     38288
 7     30374
-1     29076
 8     22495
 9     19272
 10    11300
-2      1337
Name: batch, dtype: int64

In [10]:
from nltk.tokenize import MWETokenizer
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Cosine similarity of two vectors, collapsed from sklearn's 1x1 matrix to a scalar
def cosim(x,y):
    """Return the cosine similarity between x and y as a single number.

    Each input is reshaped into one row before being passed to sklearn's
    cosine_similarity; the sole entry of the resulting matrix is returned.
    """
    row_x = x.reshape(1, -1)
    row_y = y.reshape(1, -1)
    sim_matrix = cosine_similarity(row_x, row_y)
    return sim_matrix[0][0]

In [11]:
def mwe_score(exp, model, stats_frame):
    """Score one multi-word expression (MWE) against its component words.

    The components are joined with '+' to form the MWE token. If that token is
    in ``model.dictionary``, its vector is compared (via ``cosim``) with the
    vector of every component that is in the vocabulary and is not a stopword.

    Parameters
    ----------
    exp : sequence of str
        Component words of the n-gram, in order.
    model : object
        Must expose ``dictionary`` (token -> row index) and ``word_vectors``
        (indexable by that row index).
    stats_frame : pandas.DataFrame
        Frame the result row is added to. It is not modified in place.

    Returns
    -------
    pandas.DataFrame
        ``stats_frame`` plus one row with the keys ``ngram``, ``stopwords``,
        ``mwe_vector``, ``component_vectors``, ``component_cosims`` and
        ``cosine_sim``.

        ``stopwords`` holds one code per component: 1 = stopword, 0 = present
        non-stopword, -1 = component missing from the vocabulary. It is ``[-2]``
        when the MWE token itself is missing.

        ``cosine_sim`` is the mean of the non-NaN component similarities, or NaN
        when the MWE token or any component is missing, or when every component
        is a stopword.

    Notes
    -----
    Reads the module-level names ``stop`` (stopword collection) and ``cosim``.
    """
    # Combined token for MWE
    mwetoken = '+'.join(exp)

    # Per-component stopword codes, vectors and similarities to the MWE vector
    sws = []
    cvs = []
    css = []

    # Default result. Assigned up front so the missing-token path below has a
    # value to return (previously `cs` was unbound there -> UnboundLocalError).
    cs = np.nan

    # Check that combined token exists in the vocab. This protects against inflation of n-gram counts caused by repeats
    #  of the same token (e.g. in lists like https://simple.wikipedia.org/wiki/List_of_cities,_towns_and_villages_in_Fars_Province)
    if mwetoken in model.dictionary:

        mwv = model.word_vectors[model.dictionary[mwetoken]]

        for w in exp:
            if w in model.dictionary:
                wv = model.word_vectors[model.dictionary[w]]
                cvs.append(wv)

                if w in stop:
                    # Stopwords are recorded but excluded from the mean
                    sws.append(1)
                    css.append(np.nan)
                else:
                    sws.append(0)
                    css.append(cosim(wv, mwv))

            # If component is absent from vocab
            else:
                sws.append(-1)
                # NaN placeholder with the same shape as a real vector from this
                # model (previously a fixed length of 1000 regardless of model size)
                cvs.append(np.full(np.shape(mwv), np.nan))
                css.append(np.nan)

        # Mean cosim, only when every component is in the vocab. Skipping the
        # all-NaN case gives the same NaN result without numpy's
        # "Mean of empty slice" RuntimeWarning.
        if sws and min(sws) >= 0 and not np.all(np.isnan(css)):
            cs = np.nanmean(css)

    # Combined token missing from vocab - mark with defaults
    else:
        sws = [-2]
        # NOTE(review): 400 is hard-coded; presumably the embedding size of the
        # models in use — confirm, or derive it from the model.
        mwv = np.full(400, np.nan)

    row = {
        'ngram'  : exp,
        'stopwords' : sws,
        'mwe_vector' : mwv,
        'component_vectors' : cvs,
        'component_cosims'  : css,
        'cosine_sim'  : cs,
    }

    # DataFrame.append was removed in pandas 2.0; concat with a one-row frame is
    # the equivalent. A list of dicts keeps each value as a single cell.
    return pd.concat([stats_frame, pd.DataFrame([row])], ignore_index=True)

In [12]:
def invert_dict(dic):
    """Return a new dict that maps each value of `dic` back to its key.

    If several keys share a value, the key that comes last in iteration
    order wins.
    """
    inverted = {}
    for key, value in dic.items():
        inverted[value] = key
    return inverted

In [17]:
%%time

# Per-batch results, keyed by zero-based batch index (batch number - 1)
batch_dfs = {}

# Whether to invert each model's dictionary after loading.
# Deliberately NOT named `invert_dict`: that would rebind the invert_dict()
# helper to a bool, so enabling the flag would raise TypeError and the helper
# would be lost for the rest of the session.
invert_model_dict = False

# Columns returned by mwe_score for each MWE
stats_columns = ['ngram', 'stopwords', 'mwe_vector', 'component_vectors', 'component_cosims',
                 'cosine_sim']  #, 'base_nearest', 'mwe_nearest'

for bb in range(batch_count):
    print('Processing batch {} of {}'.format(bb+1,batch_count))

    # Subset DataFrame
    batch_dfs[bb] = ngram_eval[ngram_eval.batch == bb+1].reset_index(drop=True)

    # Initialise MWETokenizer (not used further in this cell)
    batch_token_mwe = MWETokenizer(list(batch_dfs[bb].ngram) , separator='+')

    # Load model
    print('Loading GloVe model')

    batch_model = Glove.load(datapath+'/Models/2 GloVe/w10p_glove_vocab_batch{}.model'.format(bb+1))

    # Invert dictionary (due to error in model execution code, now fixed)
    if invert_model_dict:
        batch_dict = invert_dict(batch_model.dictionary)
        batch_model.add_dictionary(batch_dict)

    print('Gathering MWE stats')

    # Progress is printed every `printer` expressions
    batch_len = len(batch_dfs[bb].ngram)
    printer = 2500 if batch_len >= 5000 else 200

    # For each MWE, evaluate stats. Record vectors (in case we want to calculate different metrics later).
    # Each call receives an empty frame and returns a one-row frame; the rows are
    # concatenated once at the end, because growing a single frame row by row
    # copies it on every call (quadratic in the batch size).
    empty_stats = pd.DataFrame(columns=stats_columns)
    stat_rows = []

    for _i, exp in enumerate(batch_dfs[bb].ngram):
        if _i % printer == 0:
            print(' MWE '+str(_i)+'/'+str(batch_len)+': '+'+'.join(exp))

        stat_rows.append(mwe_score(exp, batch_model, empty_stats))

    # pd.concat raises on an empty list, so fall back to the empty frame
    statsf = pd.concat(stat_rows, ignore_index=True) if stat_rows else empty_stats

    # Join back onto DataFrame
    batch_dfs[bb] = batch_dfs[bb].merge(statsf, on='ngram')
    

Processing batch 1 of 10
Loading GloVe model
Gathering MWE stats
 MWE 0/94011: References+External+links
 MWE 2500/94011: noted+for
 MWE 5000/94011: 2016–17+season
 MWE 7500/94011: D'Oyly+Carte
 MWE 10000/94011: said+she
 MWE 12500/94011: Buddy+Holly
 MWE 15000/94011: Nuevo+Laredo
 MWE 17500/94011: an+organic+compound
 MWE 20000/94011: Campeonato+de+Portugal
 MWE 22500/94011: joined+together
 MWE 25000/94011: going+down
 MWE 27500/94011: describes+it
 MWE 30000/94011: was+convicted+in
 MWE 32500/94011: 2nd+Runner-up
 MWE 35000/94011: begin+work+on
 MWE 37500/94011: Bryce+Canyon
 MWE 40000/94011: Julien+Duvivier
 MWE 42500/94011: 23%+below+basic
 MWE 45000/94011: municipal+unit+has
 MWE 47500/94011: is+released+on
 MWE 50000/94011: Bobby+Hutcherson
 MWE 52500/94011: Boeing+777-300ER
 MWE 55000/94011: Little+Feat
 MWE 57500/94011: Oficial+911
 MWE 60000/94011: Multiple+Launch+Rocket
 MWE 62500/94011: and+in+spite
 MWE 65000/94011: Institutional+Investor
 MWE 67500/94011: A+Catalogue+of
 

In [14]:
# Removed
    #print('Gathering MWE stats')
    # For each MWE, evaluate stats. Record vectors (in case we want to calculate different metrics later).
    #statsf = pd.DataFrame(columns=['ngram', 'stopwords', 'mwe_vector', 'component_vectors', 'component_cosims', 
    #                               'cosine_sim', 'base_nearest', 'mwe_nearest'])

    #for exp in batch_dfs[bb].ngram:
    #    statsf = mwe_score(exp,batch_model,statsf)

    #  Join back onto DataFrame
    #batch_dfs[bb] = batch_dfs[bb].merge(statsf, on='ngram')

In [21]:
# Merge dataframes, sort by compositionality metric, export

# Also want the default batches with batch no < 0. These were never scored, so
# reindex them to the scored batches' columns (the stats columns are filled with NaN).
default_batches = ngram_eval[ngram_eval.batch < 0].reindex(columns = batch_dfs[0].columns.tolist())

# Single concat instead of DataFrame.append in a loop: append was removed in
# pandas 2.0 and re-copied the accumulated frame on every iteration.
all_batches = pd.concat([default_batches] + [batch_dfs[d] for d in range(batch_count)])

# Ascending by mean cosine similarity; rows with NaN cosine_sim sort last
all_batches = all_batches.sort_values('cosine_sim').reset_index(drop=True)

all_batches.to_csv(datapath+'/Models/2 GloVe/Results/w10p_vocab_output_001.csv', index=False)

In [22]:
# Display the combined table, sorted by cosine_sim
all_batches

Unnamed: 0,ngram,poisson,len,batch,stopwords,mwe_vector,component_vectors,component_cosims,cosine_sim
0,"(at, the, FIL)",389.297974,3,4,"[1, 1, 0]","[-3.304669400496006, 3.2918497977969414, 3.289...","[[0.29045682541149237, -0.08268427520868048, 0...","[nan, nan, -0.9999266969605956]",-0.999927
1,"(brunt, of)",764.399850,2,3,"[0, 1]","[-6.011641172649969, -5.913132775700688, 5.929...","[[7.2772876167131235, 7.0261345448651396, -7.1...","[-0.9999125564125098, nan]",-0.999913
2,"(New, Zealand-born)",999.388134,2,1,"[1, 0]","[-5.285540019627866, 5.32779199475822, -5.4064...","[[-0.01064217782706564, 0.01147192576760911, -...","[nan, -0.9998874696606046]",-0.999887
3,"(of, Mecklenburg-Schwerin)",472.114015,2,6,"[1, 0]","[1.227232183250606, 1.3143013409835398, -1.235...","[[0.16670074192461656, -0.11178727846883854, -...","[nan, -0.999882841396625]",-0.999883
4,"(a, spinoff, of)",403.374558,3,1,"[1, 0, 1]","[0.6472155313911789, -0.6442973932991041, 0.62...","[[0.025192777637270344, 0.22246980769008778, -...","[nan, -0.9998615809834557, nan]",-0.999862
...,...,...,...,...,...,...,...,...,...
499995,"(Party, Comments/Suppleant, representatives)",514.988873,3,5,"[0, -1, 0]","[-0.001532781228780539, -0.0013564962182825934...","[[0.14885982741054793, 0.07894516720668883, -0...","[-0.06401353437456728, nan, -0.04209184800298231]",
499996,"(bottom:50px, top:10px)",454.154168,2,5,"[-1, 0]","[-4.248468422089702e-05, 0.0004327387824826704...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[nan, -0.14174566466780864]",
499997,"(color:eocene, text:Eocene)",420.145374,2,5,"[0, -1]","[0.000839802598036845, -0.0002937641294715939,...","[[0.0032681698690965907, -0.001653449966643513...","[0.010569120806779533, nan]",
499998,"(Nova, Scotia, TUNS)",587.673502,3,6,"[0, 0, -1]","[0.0006182197742030652, -0.000467253574371338,...","[[0.12940666025848813, 0.01258788292999845, -0...","[-0.007134790689687712, -0.02787587841584928, ...",


In [23]:
# Lighter copy of the results without the two vector-valued columns
all_batches_light = all_batches.drop(columns=['mwe_vector', 'component_vectors'])

In [24]:
# Export the light table to CSV, without the index column
all_batches_light.to_csv(datapath+'/Models/2 GloVe/Results/w10p_vocab_light_001.csv', index=False)