In [1]:
import os

# Corpus data and models are read from a separate drive; the project root
# sits under the logged-in user's Google Drive folder.
datapath = 'E:/Dissertation Data'
path = ''.join(['C:/Users/', os.getlogin(), '/Google Drive/University/Dissertation'])

# Switch into the project's Code sub-directory and echo the working directory.
os.chdir(''.join([path, '/Code']))
os.getcwd()

'C:\\Users\\User\\Google Drive\\University\\Dissertation\\Code'

We collect lexical co-occurrence statistics on all words in
the English Wikipedia, using the WikiExtractor tool to retrieve
plain text from the April 2015 dump (ca. 2.8B words),
and using simple regular expressions to segment sentences
and words, and remove URLs and punctuation. We perform
no POS tagging, lemmatisation, case normalisation,
or removal of numbers or symbols.

In [2]:
import pickle
import pandas as pd
import re
import numpy as np

import nltk
from nltk.tokenize import MWETokenizer

In [3]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
    
from nltk.tokenize import WhitespaceTokenizer

In [4]:
# Plain-text corpus reader over a single Wikipedia file, with words split on
# whitespace only (file name suggests a 10% sample of the 2020-05-20 dump).
w10p = PlaintextCorpusReader(
    datapath + '/Corpora/wiki/enwiki_20200520/',
    'enwiki_20200520_10pc.txt',
    word_tokenizer=WhitespaceTokenizer(),
)

In [5]:
# Import word and sentence generators

from generators import sent_gen, word_gen, Sent_Seq

We collect word frequency information with the
SRILM language modelling toolkit (Stolcke, 2002), counting
n-grams ($n \leq 3$, treating MWEs as contiguous bigrams
and trigrams), and identify MWE candidates by computing
the Poisson collocation measure (Quasthoff and Wolff,
2002) for all bigrams and trigrams (ca. 23M n-grams).
This method should be readily extensible to include longer
n-grams.

In [4]:
from redis import Redis
from rediscollections import RedisOrderedDict, key_val_tuples

In [7]:
from nltk.metrics.spearman import (
    spearman_correlation,
    ranks_from_scores,
)

In [5]:
# How many n-grams to pull from the Poisson-ranked list for evaluation (500 thousand)
eval_count = 500 * 1000

In [6]:
# Redis client created with the library's default connection settings
r = Redis()

# Redis-backed ordered dict stored under the key 'w10p:w10p_poisson';
# later cells read n-gram Poisson scores from it.
od = RedisOrderedDict(r, 'w10p:w10p_poisson')

In [7]:
# Total number of n-gram candidates held in the Redis ordered dict
# (i.e. those above the minimum frequency)
len(od)

2002318

In [13]:
# The Redis ordered dict holds the Poisson score of every n-gram. Fetch the
# entries up to index eval_count-1 into a two-column frame for evaluation.
ngram_eval = pd.DataFrame.from_records(
    key_val_tuples(od, end=eval_count - 1),
    columns=['ngram', 'poisson'],
)

In [14]:
# Number of words in each n-gram tuple (2 for bigrams, 3 for trigrams)
ngram_eval['len'] = ngram_eval['ngram'].map(len)

In [15]:
# Inspect the evaluation frame (pandas shows only the first and last rows)
ngram_eval

Unnamed: 0,ngram,poisson,len
0,"(of, the)",3.874652e+06,2
1,"(References, External, links)",2.566994e+06,3
2,"(External, links)",2.229096e+06,2
3,"(in, the)",2.094387e+06,2
4,"(0, 0, 0)",1.798530e+06,3
...,...,...,...
499995,"(Work, started, on)",3.883693e+02,3
499996,"(of, these, techniques)",3.883679e+02,3
499997,"(Stadio, Flaminio)",3.883678e+02,2
499998,"(IRE, 3, C)",3.883678e+02,3


In [5]:
# Total number of tokens in the corpus. Left commented out; the figure shown
# below was presumably produced by an earlier run of this line.
#len(w10p.words())

305657697

We then automatically score the million most strongly associated
n-grams (i.e., roughly the top 5% of the Poisson-ranked
list) for compositionality.

Using word2vec (Mikolov et al., 2013) with the parameters
found to be most effective by Baroni et al. (2014), we
build a word embedding vector for every simplex word in
the vocabulary (ca. 1M types), as well as for each MWE candidate.

* Continuous bag of words model with 400-dimensional vectors, window size 5, subsampling with $t = 10^{-5}$, negative sampling with 10 samples. We build vectors only for tokens observed 20 times or more in the corpus.

We then compute the cosine similarity of the vector
representation for a MWE candidate with the vectors of its
constituent words, and take the arithmetic mean. 
In scoring
the compositionality of a candidate, we do not measure the
cosine similarity of the MWE with any stop words it may
contain, as stop words may be assumed to be semantically
uninformative.
* Stop words are taken here to be the 50 most frequent words in the vocabulary.

In [16]:
# Derive the stop list from the corpus itself: its 50 most frequent words.
from nltk import FreqDist

# Frequency distribution over the word stream produced by word_gen
fdist = FreqDist(word_gen(w10p, sent_mark=''))

# most_common() yields (word, count) pairs; only the words are kept
stop = {token for token, _count in fdist.most_common(50)}

# TODO: reading these from the Redis hashdict might be quicker than counting
# again here, although that structure is not ordered.

In [17]:
# Inspect the resulting stop list
stop

{'0',
 '1',
 '2',
 '3',
 'A',
 'He',
 'In',
 'It',
 'New',
 'The',
 'a',
 'also',
 'an',
 'and',
 'are',
 'as',
 'at',
 'be',
 'been',
 'but',
 'by',
 'first',
 'for',
 'from',
 'had',
 'has',
 'have',
 'he',
 'her',
 'his',
 'in',
 'is',
 'it',
 'its',
 'not',
 'of',
 'on',
 'one',
 'or',
 'that',
 'the',
 'their',
 'this',
 'to',
 'two',
 'was',
 'were',
 'which',
 'who',
 'with'}

In [18]:
# Persist the stop list so the frequency count does not have to be repeated later
with open(datapath + '/Corpora/wiki/enwiki_20200520/10pc_stop.pkl', mode='wb') as pfile:
    pfile.write(pickle.dumps(stop))

In [19]:
# The stop list has been extracted, so the full frequency distribution is no
# longer needed (presumably removed to free memory)
del fdist

In [20]:
from batcher import batcher  # Custom module with logic for assigning n-grams to batches, avoiding overlap

In [21]:
# Assign the candidate n-grams to at most 10 batches with the custom `batcher`
# module, passing the corpus stop list. `batches` maps each n-gram tuple to its
# batch label; `batch_count` is presumably the number of batches used.
# NOTE(review): negative labels (-1, -2) also occur in the mapping; their
# meaning is defined inside `batcher` - confirm there.
batches, batch_count = batcher(ngram_eval.ngram, stopwords=stop, max_batches = 10)

# Preview only the first 40 assignments: `batches` has one entry per n-gram,
# so echoing the whole dict floods the notebook and gets cut off mid-entry.
dict(list(batches.items())[:40])

{('References', 'External', 'links'): 1,
 ('United', 'States'): 1,
 ('New', 'York'): 1,
 ('as', 'well', 'as'): 1,
 ('See', 'also'): 1,
 ('part', 'of', 'the'): 1,
 ('can', 'be'): 1,
 ('World', 'War', 'II'): 1,
 ('did', 'not'): 1,
 ('member', 'of', 'the'): 1,
 ('Los', 'Angeles'): 1,
 ('Kitt', 'Peak', 'Spacewatch'): 1,
 ('due', 'to'): 1,
 ('more', 'than'): 1,
 ('High', 'School'): 1,
 ('was', 'born', 'in'): 1,
 ('based', 'on'): 1,
 ('Apps', 'Goals', 'Apps'): 1,
 ('where', 'he'): 1,
 ('New', 'Zealand'): 1,
 ('during', 'the'): 1,
 ('a', 'number', 'of'): 1,
 ('would', 'be'): 1,
 ('List', 'of'): 1,
 ('University', 'of'): 1,
 ('end', 'of', 'the'): 1,
 ('According', 'to'): 1,
 ('Air', 'Force'): 1,
 ('United', 'Kingdom'): 1,
 ('along', 'with'): 1,
 ('known', 'as', 'the'): 1,
 ('does', 'not'): 1,
 ('Hong', 'Kong'): 1,
 ('New', 'South', 'Wales'): 1,
 ('There', 'are'): 1,
 ('Further', 'reading'): 1,
 ('was', 'released'): 1,
 ('Register', 'of', 'Historic'): 1,
 ('Prime', 'Minister'): 1,
 ('may', 'be'

In [22]:
# Number of n-grams that received a batch label (one entry per n-gram)
len(batches)

500000

In [23]:
# Build a frame pairing every n-gram with its batch label.
# Series.map(batches) is deliberately not used: the original author hit
# indexing errors with it (presumably because pandas converts a dict mapper
# into a Series, where tuple keys are read as MultiIndex levels - TODO confirm).
# Each n-gram is instead looked up directly in `batches` (a missing n-gram still
# raises KeyError), and the two columns are built in one go rather than through
# one temporary dict per row.
ngb_cols = ["ngram", "batch"]

ng_batch = pd.DataFrame(
    {
        # reset_index gives the same 0..n-1 index a list of row dicts would produce
        "ngram": ngram_eval["ngram"].reset_index(drop=True),
        "batch": [batches[ng] for ng in ngram_eval["ngram"]],
    },
    columns=ngb_cols,
)

ng_batch

Unnamed: 0,ngram,batch
0,"(of, the)",-2
1,"(References, External, links)",1
2,"(External, links)",2
3,"(in, the)",-2
4,"(0, 0, 0)",-2
...,...,...
499995,"(Work, started, on)",8
499996,"(of, these, techniques)",8
499997,"(Stadio, Flaminio)",1
499998,"(IRE, 3, C)",1


In [24]:
# Add the batch label to the evaluation frame by joining on the n-gram tuple.
# A 'batch' column left over from a previous run of this cell is dropped first,
# so re-running the cell does not produce 'batch_x' / 'batch_y' duplicates.
ngram_eval = (
    ngram_eval
    .drop(columns='batch', errors='ignore')
    .merge(ng_batch, on='ngram')
)

ngram_eval

Unnamed: 0,ngram,poisson,len,batch
0,"(of, the)",3.874652e+06,2,-2
1,"(References, External, links)",2.566994e+06,3,1
2,"(External, links)",2.229096e+06,2,2
3,"(in, the)",2.094387e+06,2,-2
4,"(0, 0, 0)",1.798530e+06,3,-2
...,...,...,...,...
499995,"(Work, started, on)",3.883693e+02,3,8
499996,"(of, these, techniques)",3.883679e+02,3,8
499997,"(Stadio, Flaminio)",3.883678e+02,2,1
499998,"(IRE, 3, C)",3.883678e+02,3,1


In [25]:
# How many n-grams carry each batch label
ngram_eval['batch'].value_counts()

 1     94011
 2     79538
 3     69221
 4     56154
 5     48934
 6     38288
 7     30374
-1     29076
 8     22495
 9     19272
 10    11300
-2      1337
Name: batch, dtype: int64

In [26]:
# Save the evaluation frame (scores, lengths and batch labels) to disk
ngram_eval.to_pickle(datapath+'/Corpora/wiki/enwiki_20200520/10pc_ngram_eval.pkl')

In [12]:
# Reload the saved evaluation frame from disk.
# NOTE: pickle files should only be loaded when they come from a trusted source.
ngram_eval = pd.read_pickle(datapath+'/Corpora/wiki/enwiki_20200520/10pc_ngram_eval.pkl')

In [8]:
# Load every (n-gram, Poisson score) pair from the Redis ordered dict,
# not just the evaluation subset.
ngram_all = pd.DataFrame.from_records(
    key_val_tuples(od),
    columns=['ngram', 'poisson'],
)

In [9]:
# Inspect the full candidate list (pandas shows only the first and last rows)
ngram_all

Unnamed: 0,ngram,poisson
0,"(of, the)",3.874652e+06
1,"(References, External, links)",2.566994e+06
2,"(External, links)",2.229096e+06
3,"(in, the)",2.094387e+06
4,"(0, 0, 0)",1.798530e+06
...,...,...
2002313,"(and, to)",-9.971941e+04
2002314,"(and, a)",-1.003225e+05
2002315,"(was, the)",-1.339807e+05
2002316,"(and, in)",-1.616960e+05


In [14]:
# Example trigram to trace through the pipeline
search = ('Clatsop', 'County', 'Oregon')

# Look up its row (rank and Poisson score) in the full candidate list
ngram_all.loc[ngram_all['ngram'] == search]

Unnamed: 0,ngram,poisson
337618,"(Clatsop, County, Oregon)",558.754543


In [28]:
# Same lookup in the evaluation subset, which also carries length and batch label
ngram_eval.loc[ngram_eval['ngram'] == search]

Unnamed: 0,ngram,poisson,len,batch
337618,"(Clatsop, County, Oregon)",558.754543,3,1


In [22]:
# Redis-backed frequency distribution stored under 'w10p:w10p_tg'
# (presumably the trigram counts for the corpus - verify against the code that fills it)
from redisprob import RedisHashFreqDist
tg = RedisHashFreqDist(r, 'w10p:w10p_tg')

In [23]:
# Stored frequency of the example trigram
tg[search]

26

In [24]:
from gensim.models import Word2Vec

In [25]:
# Load the saved word2vec model for batch 1
batch_model = Word2Vec.load(datapath+'/Models/1 w2v/wiki10pc_batch1.model')

In [27]:
# Check whether the example n-gram has its own vector in the batch model; the
# n-gram is looked up in '+'-joined form (here 'Clatsop+County+Oregon').
# Membership is tested on the KeyedVectors object itself, which gives the same
# answer as `wv.vocab` in gensim 3.x and still works in gensim 4.x, where the
# `.vocab` attribute was removed.
'+'.join(search) in batch_model.wv

False