In [1]:
import os

path = 'C:/Users/'+os.getlogin()+'/Google Drive/University/Dissertation'
datapath = 'C:/Users/'+os.getlogin()+'/Dissertation Data'

os.chdir(path+'/Code')
os.getcwd()

'C:\\Users\\tom\\Google Drive\\University\\Dissertation\\Code'

We collect lexical co-occurrence statistics on all words in
the English Wikipedia, using the WikiExtractor tool2 to retrieve
plain text from the April 2015 dump (ca. 2.8B words),
and using simple regular expressions to segment sentences
and words, and remove URLs and punctuation. We perform
no POS tagging, lemmatisation, case normalisation,
or removal of numbers or symbols.

In [2]:
import pandas as pd
import re
import numpy as np

import nltk
from nltk.tokenize import MWETokenizer

from glove import Corpus, Glove

from gensim.models import Word2Vec

In [3]:
# On Simple English wiki

sf = open(datapath+'/Corpora/wiki/simple_20200601/simple_sample.txt', 'r', encoding='utf-8')

for lines in range(5):
    print(sf.readline())

﻿April is the fourth month of the year, and comes between March and May. 

 It is one of four months to have 30 days. 

 April always begins on the same day of week as July, and additionally, January in leap years. 

 April always ends on the same day of the week as December. 

 April's flowers are the Sweet Pea and Daisy. 



In [4]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

from nltk.tokenize import WhitespaceTokenizer

In [5]:
simp = PlaintextCorpusReader(datapath+'/Corpora/wiki/simple_20200601/','simple_sample.txt',
                            word_tokenizer = WhitespaceTokenizer()
                            )

In [6]:
# Import word and sentence generators

from generators import sent_gen, word_gen, Sent_Seq

We collect word frequency information with the
SRILM language modelling toolkit (Stolcke, 2002), counting
n-grams (n <= 3), treating MWEs as contiguous bigrams
and trigrams), and identify MWE candidates by computing
the Poisson collocation measure (Quasthoff and Wolff,
2002) for all bigrams and trigrams (ca. 23M n-grams).
This method should be readily extensible to include longer
n-grams.

In [7]:
# Collate n-grams

from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder

from nltk.metrics import (
    BigramAssocMeasures,
    TrigramAssocMeasures,
    NgramAssocMeasures,
)

from nltk.metrics.spearman import (
    spearman_correlation,
    ranks_from_scores,
)

In [8]:
scorer = NgramAssocMeasures.poisson_stirling

tri_cf = TrigramCollocationFinder.from_words(word_gen(simp))
tri_cf.apply_word_filter(lambda w: w in ('|^|'))  # Filter out associations with sentence boundary marker

In [9]:
bi_cf = tri_cf.bigram_finder()                   # Make bigram finder from trigram, don't need to count again
bi_cf.apply_word_filter(lambda w: w in ('|^|'))  # Filter out associations with sentence boundary marker

In [10]:
bi_dict = {}
tri_dict = {}

for bigram in bi_cf.score_ngrams(scorer):    
    bi_dict[bigram[0]] = [bigram[0], bi_cf.ngram_fd[bigram[0]], bigram[-1]]
    
for trigram in tri_cf.score_ngrams(scorer):    
    tri_dict[trigram[0]] = [trigram[0], tri_cf.ngram_fd[trigram[0]], trigram[-1]]
    
tri_dict

{('0050d0',
  'Administrative',
  'subdivisions'): [('0050d0',
   'Administrative',
   'subdivisions'), 1, -17.030667136246944],
 ('1,319', 'Teppo', 'Numminen'): [('1,319', 'Teppo', 'Numminen'),
  1,
  -17.030667136246944],
 ('1125', '1219', '1.08'): [('1125', '1219', '1.08'), 1, -17.030667136246944],
 ('1273', '1599', '1.26'): [('1273', '1599', '1.26'), 1, -17.030667136246944],
 ('1282', '1590', '1.24'): [('1282', '1590', '1.24'), 1, -17.030667136246944],
 ('1294', '1327', '1.03'): [('1294', '1327', '1.03'), 1, -17.030667136246944],
 ('1432', '1335', '0.93'): [('1432', '1335', '0.93'), 1, -17.030667136246944],
 ('1487', '2857', '1.92'): [('1487', '2857', '1.92'), 1, -17.030667136246944],
 ('1502', 'Pedro', 'Alvares'): [('1502', 'Pedro', 'Alvares'),
  1,
  -17.030667136246944],
 ('1514', '1755', '1.16'): [('1514', '1755', '1.16'), 1, -17.030667136246944],
 ('1615', '1216', '0.75'): [('1615', '1216', '0.75'), 1, -17.030667136246944],
 ('1731', '1798', '1.04'): [('1731', '1798', '1.04'),

In [11]:
bigram_df = pd.DataFrame.from_dict(bi_dict, orient='index',
                       columns=['ngram', 'freq', 'poisson'])

trigram_df = pd.DataFrame.from_dict(tri_dict, orient='index',
                       columns=['ngram', 'freq', 'poisson'])

bigram_df

Unnamed: 0,ngram,freq,poisson
"($5.84, billion)","($5.84, billion)",1,-17.030667
"(0050d0, Administrative)","(0050d0, Administrative)",1,-17.030667
"(1,319, Teppo)","(1,319, Teppo)",1,-17.030667
"(100th, anniversary)","(100th, anniversary)",1,-17.030667
"(1125, 1219)","(1125, 1219)",1,-17.030667
...,...,...,...
"(New, York)","(New, York)",132,-3273.884672
"(from, the)","(from, the)",114,-3356.503611
"(is, a)","(is, a)",150,-4364.314853
"(in, the)","(in, the)",285,-8634.385922


In [12]:
ngram_df = bigram_df.append(trigram_df).sort_values('poisson', ascending=False).reset_index(drop=True)
ngram_df['len'] = ngram_df.ngram.apply(len)
ngram_df

Unnamed: 0,ngram,freq,poisson,len
0,"($5.84, billion)",1,-17.030667,2
1,"(fructose-sweetened, drinks)",1,-17.030667,2
2,"(gradual, deforestation)",1,-17.030667,2
3,"(grabs, Samuel's)",1,-17.030667,2
4,"(glittery, bracelets)",1,-17.030667,2
...,...,...,...,...
87590,"(New, York)",132,-3273.884672,2
87591,"(from, the)",114,-3356.503611,2
87592,"(is, a)",150,-4364.314853,2
87593,"(in, the)",285,-8634.385922,2


We then automatically score the million most strongly associated
n-grams (i.e., roughly the top 5% of the Poisson-ranked
list) for compositionality.

Using word2vec (Mikolov et al., 2013) with the parameters
found to be most effective by Baroni et al. (2014), we
build a word embedding vector for every simplex word in
the vocabulary (ca. 1M types), as well as for each MWE candidate.

* Continuous bag of words model with 400-dimensional vectors, window size 5, subsampling with t = 10^-5, negative sampling with 10 samples. We build vectors only for tokens observed 20 times or more in the corpus.

We then compute the cosine similarity of the vector
representation for a MWE candidate with the vectors of its
constituent words, and take the arithmetic mean. 
In scoring
the compositionality of a candidate, we do not measure the
cosine similarity of the MWE with any stop words it may
contain, as stop words may be assumed to be semantically
uninformative.
* Stop words are taken here to be the 50 most frequent words in the vocabulary.

In [13]:
# Stopwords from corpus - 50 most frequent
from nltk import FreqDist

fdist = FreqDist(word_gen(simp, sent_mark=''))

stop = set( word for word, f in fdist.most_common(20))

In [14]:
stop

{'April',
 'August',
 'New',
 'The',
 'a',
 'and',
 'are',
 'as',
 'by',
 'for',
 'from',
 'in',
 'is',
 'of',
 'on',
 'that',
 'the',
 'to',
 'was',
 'with'}

In [15]:
from batcher import batcher  # Custom module with logic for assigning n-grams to batches, avoiding overlap

In [16]:
min_freq = 10

In [17]:
# Duplicate entries appearing for some reason. Removing here
ngram_df2 = ngram_df[ngram_df.freq >= min_freq].drop_duplicates().reset_index(drop=True)

ngram_df2

Unnamed: 0,ngram,freq,poisson,len
0,"(Jari, Kurri)",10,-203.525952,2
1,"(W, L)",10,-209.941413,2
2,"(L, T)",10,-211.010565,2
3,"(Martin, Brodeur)",10,-214.900988,2
4,"(GP, W)",10,-214.966416,2
...,...,...,...,...
428,"(New, York)",132,-3273.884672,2
429,"(from, the)",114,-3356.503611,2
430,"(is, a)",150,-4364.314853,2
431,"(in, the)",285,-8634.385922,2


In [18]:
eval_count = 150000

ngram_eval = ngram_df2[0:eval_count]

ngram_eval

Unnamed: 0,ngram,freq,poisson,len
0,"(Jari, Kurri)",10,-203.525952,2
1,"(W, L)",10,-209.941413,2
2,"(L, T)",10,-211.010565,2
3,"(Martin, Brodeur)",10,-214.900988,2
4,"(GP, W)",10,-214.966416,2
...,...,...,...,...
428,"(New, York)",132,-3273.884672,2
429,"(from, the)",114,-3356.503611,2
430,"(is, a)",150,-4364.314853,2
431,"(in, the)",285,-8634.385922,2


In [19]:
# Clean up
del ngram_df, ngram_df2

In [20]:
batches, batch_count = batcher(ngram_eval.ngram, stopwords=stop, max_batches = 15)

batches

{('Jari', 'Kurri'): 1,
 ('W', 'L'): 1,
 ('Martin', 'Brodeur'): 1,
 ('Chris', 'Chelios'): 1,
 ('Mexican', 'Americans'): 1,
 ('1992–93', '1992–1993'): 1,
 ('Additional', 'voice'): 1,
 ('Mike', 'Bossy'): 1,
 ('Mike', 'Modano'): 1,
 ('acid', 'levels'): 1,
 ('Minnesota', 'Wild'): 1,
 ('Hammerhead', 'sharks'): 1,
 ('von', 'Braun'): 1,
 ('Toronto', 'Ontario'): 1,
 ('Camp', 'Rock'): 1,
 ('Mughal', 'Empire'): 1,
 ('Smythe', 'Division'): 1,
 ('Southeast', 'Division'): 1,
 ('Brendan', 'Shanahan'): 1,
 ('King', 'Saul'): 1,
 ('109', 'pts'): 1,
 ('television', 'series'): 1,
 ('North', 'America'): 1,
 ('Angeles', 'California'): 1,
 ('n/a', 'n/a'): 1,
 ('years', 'immediately'): 1,
 ('American', 'Bash'): 1,
 ('Mark', 'Thomas'): 1,
 ('Joe', 'Sakic'): 1,
 ('August', '27'): 1,
 ('April', '25'): 1,
 ('he', 'would'): 1,
 ('winners', 'produced'): 1,
 ('a', 'fictional'): 1,
 ('able', 'to'): 1,
 ('July', '2006'): 1,
 ('Packers', 'Quarterback'): 1,
 ('Gretzky', 'Edmonton'): 1,
 ('who', 'have'): 1,
 ('was', 'bor

In [21]:
len(batches)

433

In [22]:
# Should be able to add batch information using df.map() but am encountering errors apparently relating
#  to indexing - workaround (though slower).

ngb_cols = ["ngram", "batch"]
rows = []

for ng in ngram_eval['ngram']:
    rows.append({"ngram" : ng,
                "batch" : batches[ng]})
    
ng_batch = pd.DataFrame(rows, columns = ngb_cols)

ng_batch

Unnamed: 0,ngram,batch
0,"(Jari, Kurri)",1
1,"(W, L)",1
2,"(L, T)",2
3,"(Martin, Brodeur)",1
4,"(GP, W)",2
...,...,...
428,"(New, York)",5
429,"(from, the)",-2
430,"(is, a)",-2
431,"(in, the)",-2


In [23]:
ngram_eval = ngram_eval.merge(ng_batch, on='ngram')

ngram_eval

Unnamed: 0,ngram,freq,poisson,len,batch
0,"(Jari, Kurri)",10,-203.525952,2,1
1,"(W, L)",10,-209.941413,2,1
2,"(L, T)",10,-211.010565,2,2
3,"(Martin, Brodeur)",10,-214.900988,2,1
4,"(GP, W)",10,-214.966416,2,2
...,...,...,...,...,...
428,"(New, York)",132,-3273.884672,2,5
429,"(from, the)",114,-3356.503611,2,-2
430,"(is, a)",150,-4364.314853,2,-2
431,"(in, the)",285,-8634.385922,2,-2


In [24]:
ngram_eval.batch.value_counts()

 1    198
 2    104
 3     40
-2     32
 4     26
 5     19
 6     10
 7      4
Name: batch, dtype: int64

In [25]:
from nltk.tokenize import MWETokenizer
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Flatten down to a single number
def cosim(x,y):
    return cosine_similarity(x.reshape(1,-1), y.reshape(1,-1))[0][0]

In [26]:
def mwe_score(exp, model, stats_frame):
    # Combined token for MWE
    mwetoken = '+'.join(exp)

    # Stopwords - 1 if component is a stopword, 0 if present, -1 if simplex word missing from vocab, -2 if MWE missing
    sws = []
    # Component vectors
    cvs = []

    #  Neighbours in original & MWE-aware space
    oldn = []
    newn = []

    # List of individual word similarities (where present in the vocab)
    css = []

    # Empty array
    earr = np.empty(1000)
    earr[:] = np.nan

    # Check that combined token exists in the vocab. This protects against inflation of n-gram counts caused by repeats
    #  of the same token (e.g. in lists like https://simple.wikipedia.org/wiki/List_of_cities,_towns_and_villages_in_Fars_Province)
    if mwetoken in model.dictionary:

        mwv = model.word_vectors[model.dictionary[mwetoken]]

        for w in exp:
            if w in model.dictionary:
                cvs.append(model.word_vectors[model.dictionary[w]])

                oldn.append(model.most_similar(w, number=5))

                if w in stop:
                    sws.append(1)
                    css.append(np.nan)
                else:
                    sws.append(0)
                    css.append(cosim(model.word_vectors[model.dictionary[w]], mwv ))

            # If component is absent from vocab
            else:
                sws.append(-1)
                cvs.append(earr)
                css.append(np.nan)

                oldn.append([])

        #  Mean cosim
        if min(sws) >= 0:
            cs = np.nanmean(css)
        else:
            cs = np.nan

        newn = model.most_similar(mwetoken, number=5)

    # Combined token missing from vocab - mark with defaults
    else:
        sws = [-2]
        mwv = np.empty(400)
        mwv[:] = np.nan


    # Append to stats df
    return stats_frame.append({
        'ngram'  : exp,
        'stopwords' : sws,
        'mwe_vector' : mwv,
        'component_vectors' : cvs,
        'component_cosims'  : css,
        'cosine_sim'  : cs,
        'base_nearest': oldn,
        'mwe_nearest' : newn,
    }, ignore_index=True)

In [36]:
def mwe_score_par(args, np=np, pd=pd):
    exp, model = args
    
    # Combined token for MWE
    mwetoken = '+'.join(exp)

    # Stopwords - 1 if component is a stopword, 0 if present, -1 if simplex word missing from vocab, -2 if MWE missing
    sws = []
    # Component vectors
    cvs = []

    #  Neighbours in original & MWE-aware space
    oldn = []
    newn = []

    # List of individual word similarities (where present in the vocab)
    css = []

    # Empty array
    earr = np.empty(1000)
    earr[:] = np.nan

    # Check that combined token exists in the vocab. This protects against inflation of n-gram counts caused by repeats
    #  of the same token (e.g. in lists like https://simple.wikipedia.org/wiki/List_of_cities,_towns_and_villages_in_Fars_Province)
    if mwetoken in model.dictionary:

        mwv = model.word_vectors[model.dictionary[mwetoken]]

        for w in exp:
            if w in model.dictionary:
                cvs.append(model.word_vectors[model.dictionary[w]])

                oldn.append(model.most_similar(w, number=5))

                if w in stop:
                    sws.append(1)
                    css.append(np.nan)
                else:
                    sws.append(0)
                    css.append(cosim(model.word_vectors[model.dictionary[w]], mwv ))

            # If component is absent from vocab
            else:
                sws.append(-1)
                cvs.append(earr)
                css.append(np.nan)

                oldn.append([])

        #  Mean cosim
        if min(sws) >= 0:
            cs = np.nanmean(css)
        else:
            cs = np.nan

        newn = model.most_similar(mwetoken, number=5)

    # Combined token missing from vocab - mark with defaults
    else:
        sws = [-2]
        mwv = np.empty(400)
        mwv[:] = np.nan


    # Return stats df
    return pd.DataFrame.from_dict({
        'ngram'  : [exp],
        'stopwords' : [sws],
        'mwe_vector' : [mwv],
        'component_vectors' : [cvs],
        'component_cosims'  : [css],
        'cosine_sim'  : [cs],
        'base_nearest': [oldn],
        'mwe_nearest' : [newn],
    })

In [28]:
%%time

batch_dfs = {}

for bb in range(batch_count):
    print('Processing batch {} of {}'.format(bb+1,batch_count))
    # Subset DataFrame
    batch_dfs[bb] = ngram_eval[ngram_eval.batch == bb+1].reset_index(drop=True)
    
    # Initialise MWETokenizer
    batch_token_mwe = MWETokenizer(list(batch_dfs[bb].ngram) , separator='+')
    
    # Build model
    simp_corp = Corpus()

    sents_mwe = Sent_Seq(simp, batch_token_mwe)
    simp_corp.fit( sents_mwe , window = 10)
    
    batch_model = Glove(no_components = 300, 
             learning_rate = 0.05)
    
    batch_model.fit(simp_corp.matrix, 
          epochs=50,
          no_threads=16,
          verbose=False)

    batch_model.add_dictionary(simp_corp.dictionary)

    # Save model
    batch_model.save(datapath+'/Models/2 GloVe/simple_batch{}.model'.format(bb+1))
    # Reload looks like    new_model = Glove.load('glove.model')
    
    # For each MWE, evaluate stats. Record vectors (in case we want to calculate different metrics later).
    statsf = pd.DataFrame(columns=['ngram', 'stopwords', 'mwe_vector', 'component_vectors', 'component_cosims', 
                                   'cosine_sim', 'base_nearest', 'mwe_nearest'])

    for exp in batch_dfs[bb].ngram:
        statsf = mwe_score(exp,batch_model,statsf)

    #  Join back onto DataFrame
    batch_dfs[bb] = batch_dfs[bb].merge(statsf, on='ngram')
    
    
batch_dfs

Processing batch 1 of 7
Processing batch 2 of 7
Processing batch 3 of 7
Processing batch 4 of 7
Processing batch 5 of 7
Processing batch 6 of 7
Processing batch 7 of 7
Wall time: 1min 50s


{0:                     ngram  freq      poisson  len  batch  stopwords  \
 0           (Jari, Kurri)    10  -203.525952    2      1   [-1, -1]   
 1                  (W, L)    10  -209.941413    2      1     [0, 0]   
 2       (Martin, Brodeur)    10  -214.900988    2      1    [0, -1]   
 3        (Chris, Chelios)    10  -217.855546    2      1    [0, -1]   
 4    (Mexican, Americans)    10  -218.635572    2      1     [0, 0]   
 ..                    ...   ...          ...  ...    ...        ...   
 193              (is, an)    34  -976.359183    2      1     [1, 0]   
 194        (inducted, in)    38 -1035.764197    2      1     [0, 1]   
 195             (0, 0, 0)    33 -1072.003121    3      1  [0, 0, 0]   
 196      (United, States)    50 -1157.334956    2      1     [0, 0]   
 197          (the, first)    50 -1471.806387    2      1     [1, 0]   
 
                                             mwe_vector  \
 0    [-0.005231714711596772, 0.004657420091877666, ...   
 1    [0.0371

In [29]:
import multiprocessing as mp1
import multiprocess as mp

In [30]:
%%time

if __name__ == '__main__':
    batch_dfs = {}

    for bb in range(batch_count):
        print('Processing batch {} of {}'.format(bb+1,batch_count))
        # Subset DataFrame
        batch_dfs[bb] = ngram_eval[ngram_eval.batch == bb+1].reset_index(drop=True)

        # Initialise MWETokenizer
        batch_token_mwe = MWETokenizer(list(batch_dfs[bb].ngram) , separator='+')

        # Build model
        simp_corp = Corpus()

        sents_mwe = Sent_Seq(simp, batch_token_mwe)
        simp_corp.fit( sents_mwe , window = 10)

        batch_model = Glove(no_components = 300, 
                 learning_rate = 0.05)

        batch_model.fit(simp_corp.matrix, 
              epochs=50,
              no_threads=16,
              verbose=False)

        batch_model.add_dictionary(simp_corp.dictionary)

        # Save model
        batch_model.save(datapath+'/Models/2 GloVe/simple_batch{}.model'.format(bb+1))
        # Reload looks like    new_model = Glove.load('glove.model')

        # For each MWE, evaluate stats. Record vectors (in case we want to calculate different metrics later).
        # Parallelized version
        #with mp.Pool() as pool:
        #    statslist = pool.imap_unordered( mwe_score_par , [(ng, batch_model) for ng in batch_dfs[bb].ngram] )
         
        statslist = [mwe_score_par((ng, batch_model)) for ng in batch_dfs[bb].ngram]
        
        statsf = pd.concat(statslist)

        #  Join back onto DataFrame
        batch_dfs[bb] = batch_dfs[bb].merge(statsf, on='ngram')
    
    
batch_dfs

Processing batch 1 of 7
Processing batch 2 of 7
Processing batch 3 of 7
Processing batch 4 of 7
Processing batch 5 of 7
Processing batch 6 of 7
Processing batch 7 of 7
Wall time: 1min 48s


{0:                     ngram  freq      poisson  len  batch  stopwords  \
 0           (Jari, Kurri)    10  -203.525952    2      1   [-1, -1]   
 1                  (W, L)    10  -209.941413    2      1     [0, 0]   
 2       (Martin, Brodeur)    10  -214.900988    2      1    [0, -1]   
 3        (Chris, Chelios)    10  -217.855546    2      1    [0, -1]   
 4    (Mexican, Americans)    10  -218.635572    2      1     [0, 0]   
 ..                    ...   ...          ...  ...    ...        ...   
 193              (is, an)    34  -976.359183    2      1     [1, 0]   
 194        (inducted, in)    38 -1035.764197    2      1     [0, 1]   
 195             (0, 0, 0)    33 -1072.003121    3      1  [0, 0, 0]   
 196      (United, States)    50 -1157.334956    2      1     [0, 0]   
 197          (the, first)    50 -1471.806387    2      1     [1, 0]   
 
                                             mwe_vector  \
 0    [-0.00268128307697482, 0.0013802484932867673, ...   
 1    [-0.035

In [37]:
%%time



batch_dfs = {}

for bb in range(batch_count):
    print('Processing batch {} of {}'.format(bb+1,batch_count))
    # Subset DataFrame
    batch_dfs[bb] = ngram_eval[ngram_eval.batch == bb+1].reset_index(drop=True)

    # Initialise MWETokenizer
    batch_token_mwe = MWETokenizer(list(batch_dfs[bb].ngram) , separator='+')

    # Build model
    simp_corp = Corpus()

    sents_mwe = Sent_Seq(simp, batch_token_mwe)
    simp_corp.fit( sents_mwe , window = 10)

    batch_model = Glove(no_components = 300, 
             learning_rate = 0.05)

    batch_model.fit(simp_corp.matrix, 
          epochs=50,
          no_threads=16,
          verbose=False)

    batch_model.add_dictionary(simp_corp.dictionary)

    # Save model
    batch_model.save(datapath+'/Models/2 GloVe/simple_batch{}.model'.format(bb+1))
    # Reload looks like    new_model = Glove.load('glove.model')

    # For each MWE, evaluate stats. Record vectors (in case we want to calculate different metrics later).
    # Parallelized version
    with mp.Pool() as pool:
        statslist = pool.map( mwe_score_par , [(ng, batch_model) for ng in batch_dfs[bb].ngram] )

    # statslist = [mwe_score_par((ng, batch_model)) for ng in batch_dfs[bb].ngram]

    statsf = pd.concat(statslist)

    #  Join back onto DataFrame
    batch_dfs[bb] = batch_dfs[bb].merge(statsf, on='ngram')


batch_dfs

Processing batch 1 of 7


NameError: name 'stop' is not defined

In [None]:
# Model inspection
batch_model.most_similar('relationships', number=5)

In [None]:
# Merge dataframes, sort by compositionality metric, export

# Also want the default batches with batch no < 0
all_batches = ngram_eval[ngram_eval.batch < 0]

for d in range(batch_count):
    all_batches = all_batches.append(batch_dfs[d])
    
all_batches = all_batches.sort_values('cosine_sim')
all_batches = all_batches.reset_index(drop=True)

all_batches.to_csv(datapath+'/Models/2 GloVe/Results/simple_output_001.csv', index=False)

In [None]:
all_batches