"Gold standard" reference lists of MWEs, ranked for compositionality.
As used in Roberts & Egg (2018).

* __F_ENC__: Farahmand et al, 2015. __[farahmand_comp_data]__
 * 1042 noun compounds
 * 5 binary judgements -> Likert scale
* __R_ENC__: Reddy et al, 2011. __[reddy_comp_data]__
 * 90 noun compounds
 * Mechanical Turk, 6 binary judgements -> Likert
* __MC_VPC__: McCarthy et al 2003. __[mccarthy_comp_vpc]__
 * 117 verb-particle pairs
 * 3 judges, avergaed scores on 11-point scale.
* __D_ADJN__: Biemann and Giesbrecht. __[disco_2011]__
 * 68 Adj-NN compounds from (training and validation) for Disco 2011.
 * Mechanical Turk, 11-point scale.
* __MC_VN__: McCarthy et al 2007.  __[mccarthy_comp_vn]__
 * 638 verb-object pairs
 * 2 judges, 6-point scale.

In [4]:
import os

path = 'C:/Users/'+os.getlogin()+'/Google Drive/University/Dissertation'
datapath = 'C:/Users/'+os.getlogin()+'/Dissertation Data'
#datapath = 'E:/Dissertation Data'

os.chdir(path)
os.getcwd()

'C:\\Users\\tom\\Google Drive\\University\\Dissertation'

In [5]:
import pandas as pd
import numpy as np

In [6]:
def exp_to_tuple(exp,sep=' '):
    return tuple(exp.split(sep))

In [7]:
# Import Farahmand et al's list of compound nouns - 1042

F_ENC = pd.read_csv(path+'/Data/Gold Standards/en_ncs_noncompositional_conventionalized/instances_judgments/comb-judgements.csv')

F_ENC['noncomp_score'] = F_ENC[['noncomp_1', 'noncomp_2', 'noncomp_3', 'noncomp_4']].sum(axis=1)

F_ENC = F_ENC[['compound', 'noncomp_score']].sort_values('noncomp_score', ascending=False).rename(columns={'compound' : 'mwe', 'noncomp_score' : 'gs_score'}).reset_index(drop=True)

F_ENC['mwe'] = F_ENC.mwe.apply(exp_to_tuple)

F_ENC

Unnamed: 0,mwe,gs_score
0,"(academy, award)",4
1,"(hard, disk)",4
2,"(goose, bumps)",4
3,"(grass, roots)",4
4,"(greenhouse, gas)",4
...,...,...
1037,"(graduation, day)",0
1038,"(grain, requirements)",0
1039,"(grape, leaves)",0
1040,"(belief, system)",0


In [8]:
# Import Reddy et al's noun compounds

R_ENC = pd.read_csv(path+'/Data/Gold Standards/ijcnlp_compositionality_data/MeanAndDeviations.clean.txt',
                   sep = "\s+").reset_index()

def subber(s):
    return s[:-2]

R_ENC['mwe'] = R_ENC['index'].apply(subber) + ' ' + R_ENC['#word'].apply(subber)
R_ENC['gs_score'] = 5 - R_ENC.Cpd_mean

R_ENC = R_ENC[['mwe' , 'gs_score']].sort_values('gs_score', ascending = False).reset_index(drop=True)

R_ENC['mwe'] = R_ENC.mwe.apply(exp_to_tuple)

R_ENC

Unnamed: 0,mwe,gs_score
0,"(gravy, train)",4.689655
1,"(cloud, nine)",4.666667
2,"(ivory, tower)",4.535714
3,"(melting, pot)",4.461538
4,"(silver, bullet)",4.333333
...,...,...
85,"(speed, limit)",0.172414
86,"(swimming, pool)",0.133333
87,"(graduate, student)",0.100000
88,"(engine, room)",0.068966


In [9]:
# MC_VPC
MC_VPC1 = pd.read_csv(path+'/Data/Gold Standards/vpc_comp_mccarthy/Judge1',
                   sep = '\s*\:\s*', header = None, index_col=0,
                      na_values = '?', engine='python'
                     ).reset_index(drop=True).drop_duplicates()   # Duplicate entry for "look+up" - removed

MC_VPC1.columns= ['mwe', 'freq', 'comp1']
#MC_VPC1['comp1'] = MC_VPC1.comp1.astype(float)

MC_VPC2 = pd.read_csv(path+'/Data/Gold Standards/vpc_comp_mccarthy/Judge2',
                   sep = '\s*\:\s*', header = None, index_col=0,
                      na_values = '?', engine='python'
                     ).reset_index(drop=True).drop_duplicates()

MC_VPC2.columns= ['mwe', 'freq', 'comp2']

MC_VPC3 = pd.read_csv(path+'/Data/Gold Standards/vpc_comp_mccarthy/Judge3',
                   sep = '\s*\:\s*', header = None, index_col=0,
                      na_values = '?', engine='python'
                     ).reset_index(drop=True).drop_duplicates()

MC_VPC3.columns= ['mwe', 'freq', 'comp3']

MC_VPC = MC_VPC1.merge(MC_VPC2,on=['mwe', 'freq']).merge(MC_VPC3,on=['mwe', 'freq'])

MC_VPC['gs_score'] = 10 - MC_VPC[['comp1', 'comp2', 'comp3']].mean(axis=1, skipna=True)

MC_VPC = MC_VPC.sort_values('gs_score', ascending = False)[['mwe','gs_score']].reset_index(drop=True)

MC_VPC['mwe'] = MC_VPC.mwe.apply(exp_to_tuple, sep='+')

MC_VPC

Unnamed: 0,mwe,gs_score
0,"(cock, up)",9.333333
1,"(whip, off)",8.500000
2,"(space, out)",8.333333
3,"(stave, off)",8.333333
4,"(write, off)",8.333333
...,...,...
111,"(step, out)",1.333333
112,"(lie, down)",1.000000
113,"(pull, down)",0.333333
114,"(see, down)",0.000000


In [10]:
D_ADJN_test = pd.read_csv(path+'/Data/Gold Standards/DISCo 2011/english/num_scores/DISCo_num_EN_test.tsv',
                   sep = '\t', header = None
                     ).reset_index(drop=True)
D_ADJN_test.columns = ['class', 'mwe', 'gs_score']

# Only interested in ADJ_NN compounds
D_ADJN_test = D_ADJN_test[D_ADJN_test['class'] == 'EN_ADJ_NN'].drop('class', axis=1)



D_ADJN_train = pd.read_csv(path+'/Data/Gold Standards/DISCo 2011/english/num_scores/DISCo_num_EN_train.tsv',
                   sep = '\t', header = None
                     ).reset_index(drop=True)
D_ADJN_train.columns = ['class', 'mwe', 'gs_score']

# Only interested in ADJ_NN compounds
D_ADJN_train = D_ADJN_train[D_ADJN_train['class'] == 'EN_ADJ_NN'].drop('class', axis=1)


D_ADJN = D_ADJN_train.append(D_ADJN_test)

D_ADJN['gs_score'] = 100 - D_ADJN.gs_score

D_ADJN = D_ADJN.sort_values('gs_score', ascending = False).reset_index(drop=True)

D_ADJN['mwe'] = D_ADJN.mwe.apply(exp_to_tuple)

D_ADJN

Unnamed: 0,mwe,gs_score
0,"(blue, chip)",89
1,"(red, tape)",89
2,"(second, hand)",86
3,"(right, wing)",84
4,"(smart, card)",82
...,...,...
130,"(short, distance)",3
131,"(early, version)",2
132,"(small, island)",2
133,"(olive, oil)",1


In [11]:
MC_VN = pd.read_csv(path+'/Data/Gold Standards/emnlp2007data.txt',
                   sep = '\t', header = None
                     ).reset_index(drop=True)
MC_VN.columns = ['mwe', 'judge1', 'judge2']

MC_VN['gs_score'] = 6- MC_VN[['judge1', 'judge2']].mean(axis=1)

MC_VN = MC_VN[['mwe','gs_score']].sort_values('gs_score', ascending=False).reset_index(drop=True)

MC_VN['mwe'] = MC_VN.mwe.apply(exp_to_tuple)

MC_VN

Unnamed: 0,mwe,gs_score
0,"(have, heart)",5.0
1,"(catch, eye)",5.0
2,"(take, step)",5.0
3,"(take, root)",5.0
4,"(leave, mark)",5.0
...,...,...
633,"(eat, food)",0.0
634,"(pay, amount)",0.0
635,"(buy, house)",0.0
636,"(discuss, problem)",0.0


In [12]:
# R&E's own outputs

robegg = open(path+'/Data/RobertsEgg/MWE_en-filtered.utf8.txt', 'r', encoding='utf-8')

for lines in range(10):
    print(robegg.readline())

compo	assoc	mwe	words

-2	36.4506646641	")− 1

-2	28.826415177	"— weiler

-2	22.6054361674	"—&# X200B

-2	22.8166041032	$11.6 million

-2	27.2593711387	$23.5 million

-2	24.1214802321	$24.5 million

-2	28.2280390416	$25.5 million

-2	24.394108659	$28.5 million

-2	22.9419298938	$297.00. Poços



In [13]:
with open(path+'/Data/RobertsEgg/MWE_en-unfiltered.utf8.txt', 'r', encoding='utf-8') as robegg:
    robegg_df = pd.read_csv(robegg, names=['compo', 'assoc', 'mwe', 'w1', 'w2', 'w3'], 
                              delimiter='\t', na_values = ['--'], quoting=3, skiprows=1, skipinitialspace=True)

# Remove negative compositionality scores - R&E's filter
#robegg_df = robegg_df[robegg_df.compo >= -1].reset_index(drop=True)

robegg_df['mwe'] = robegg_df.mwe.apply(exp_to_tuple)

robegg_df

Unnamed: 0,compo,assoc,mwe,w1,w2,w3
0,-0.216884,452.575930,"(this, individual, as)",,-0.216884,
1,-0.177191,1291.822316,"(that, in, humans)",,,-0.177191
2,-0.161933,191.902424,"(individual, as, the)",-0.161933,,
3,-0.160100,49.378076,"(It, returned, one)",,-0.160100,
4,-0.144707,48.891020,"(had, two, more)",,,-0.144707
...,...,...,...,...,...,...
917642,0.941358,407.051275,"(Avenue, and)",0.941358,,
917643,0.944386,3105.318760,"(whether, or, not)",0.944386,,
917644,0.946745,143.658118,"(Records, and)",0.946745,,
917645,0.946882,67.013677,"(މ, ަ)",0.936615,0.957148,


In [14]:
from scipy.stats import spearmanr

def spmr(a,b):
    return spearmanr(a,b,nan_policy='omit')[0]

In [15]:
def lowtup(intup):
    return tuple(str.lower(t) for t in intup)

def corr_with_gs(inframe, scorecol, gsframe, refscore='gs_score', tiebreak = 'assoc', lowcase = False, mult=-1):
    '''
    When duplicates occur, retain record with largest value of tiebreak
    '''
    
    if str(inframe) == str(gsframe) and scorecol == refscore:
        return (len(inframe), 1.0, 1.0)
    
    if lowcase:
            _intersection = inframe[['mwe', scorecol, tiebreak]].merge(gsframe[['mwe', refscore]], 
                                             left_on=inframe.mwe.apply(lowtup), 
                                             right_on = gsframe.mwe.apply(lowtup), 
                                             how='inner')
    else:
        _intersection = inframe[['mwe', scorecol, tiebreak]].merge(gsframe[['mwe', refscore]], 
                                             left_on=inframe.mwe, 
                                             right_on = gsframe.mwe, 
                                             how='inner')
    
    _intersection = _intersection.sort_values(['key_0', tiebreak], ascending = [True, False]).drop_duplicates(subset=['key_0']).reset_index(drop=True)
    
    _overlap = len(_intersection)
    if _overlap:
        c_pears = _intersection[scorecol].corr(_intersection[refscore], method="pearson")
        c_spear = _intersection[scorecol].corr(_intersection[refscore], method="spearman")
    else:
        c_pears = np.nan
        c_spear = np.nan
        
    return (_overlap, mult*c_spear, mult*c_pears)

In [16]:
def corr_df(inframe, scorecol, gsframe, refscore='gs_score', tiebreak = 'assoc', lowcase = False):
    '''
    When duplicates occur, retain record with largest value of tiebreak
    '''
    
    if str(inframe) == str(gsframe) and scorecol == refscore:
        return (len(inframe), 1.0, 1.0)
    
    if lowcase:
            _intersection = inframe[['mwe', scorecol, tiebreak]].merge(gsframe[['mwe', refscore]], 
                                             left_on=inframe.mwe.apply(lowtup), 
                                             right_on = gsframe.mwe.apply(lowtup), 
                                             how='inner')
    else:
        _intersection = inframe[['mwe', scorecol, tiebreak]].merge(gsframe[['mwe', refscore]], 
                                             left_on=inframe.mwe, 
                                             right_on = gsframe.mwe, 
                                             how='inner')
    
    _intersection = _intersection.sort_values(['key_0', tiebreak], ascending = [True, False]).drop_duplicates(subset=['key_0'])
           
    return _intersection

In [17]:
corr_with_gs(robegg_df,'compo', F_ENC)

(631, 0.45766406237623103, 0.47314644528078603)

In [71]:
gs_dict = {'F_ENC' : (F_ENC,'gs_score',-1), 
           'R_ENC' : (R_ENC,'gs_score',-1), 
           'MC_VPC': (MC_VPC,'gs_score',-1), 
           'D_ADJN': (D_ADJN,'gs_score',-1), 
           'MC_VN' : (MC_VN,'gs_score',-1),
           'RE_WIKI15' : (robegg_df,'compo',1)
          }

In [72]:
RE_results = {}

for gs in gs_dict.keys():
    RE_results[gs] = corr_with_gs(robegg_df,'compo', gs_dict[gs][0], refscore = gs_dict[gs][1], mult = gs_dict[gs][-1])
    
RE_results

{'F_ENC': (631, 0.45766406237623103, 0.47314644528078603),
 'R_ENC': (61, 0.6148618326919655, 0.6030799131312391),
 'MC_VPC': (47, 0.4244468935228092, 0.3720121808366054),
 'D_ADJN': (118, 0.5282258091736578, 0.6049064597349988),
 'MC_VN': (132, 0.39246363536118933, 0.39533955699893253),
 'RE_WIKI15': (917647, 1.0, 1.0)}

In [18]:
# Import my results. Converter interprets the strings in 'ngram' as tuples.

converter = {'ngram' : eval,
             #'stopwords' : eval
             #'mwe_vector' : eval,
             #'component_vectors' : eval,
             #'base_nearest' : eval,
             #'mwe_nearest' : eval,
            }

simp_res = pd.read_csv(datapath+'/Models/1 w2v/Tagged/Results/simp_tagged_light_001.csv', 
                       converters= converter
                      )

In [19]:
indexer = 14300
simp_res[indexer:indexer+20]

Unnamed: 0,ngram,freq,poisson,len,batch,stopwords,component_cosims,cosine_sim
14300,"(of|IN, English|NNP, football|NN)",31,-2133.134193,3,5,"[1, 0, 0]","[nan, 0.12516817, 0.35590157]",0.240535
14301,"(radio|NN, voice|NN)",29,-1310.676359,2,2,"[0, 0]","[0.17999308, 0.30109388]",0.240543
14302,"(have|VBP, the|DT, name|NN)",30,-2132.885827,3,1,"[0, 1, 0]","[0.31752214, nan, 0.16357005]",0.240546
14303,"(who|WP, became|VBD, famous|JJ)",21,-1357.951288,3,6,"[0, 0, 0]","[0.24747948, 0.1341637, 0.3400316]",0.240558
14304,"(of|IN, the|DT, 1994|CD)",20,-1505.049214,3,1,"[1, 1, 0]","[nan, nan, 0.2405645]",0.240564
14305,"(no|DT, one|NN, was|VBD)",25,-1609.34762,3,2,"[0, 0, 1]","[0.23164734, 0.24948514, nan]",0.240566
14306,"(82|CD, American|JJ)",38,-1758.990197,2,5,"[0, 0]","[0.12862773, 0.3525061]",0.240567
14307,"(part|NN, of|IN, any|DT)",37,-2533.766297,3,6,"[0, 1, 0]","[0.22932333, nan, 0.25184056]",0.240582
14308,"(in|IN, France|NNP, at|IN)",21,-1498.941415,3,4,"[1, 0, 1]","[nan, 0.24059233, nan]",0.240592
14309,"(Some|DT, areas|NNS)",24,-1136.364848,2,1,"[0, 0]","[0.23022379, 0.2509852]",0.240604


In [20]:
# Remove POS tags from tuples for matching

import re

pipematcher = re.compile("(.*)\|")
pipematch2 = re.compile("\|([A-Z]{2,4}\$?|[\$\:,\.\"]|``|\-LRB\-?|\-RRB\-?)")

def g1(matchobj):
    return matchobj.group(1)

def tup_matcher(tupstr, pattern=pipematcher):
    ot = []
    for w in tupstr:
        if w == '|HYPH': w = '-|HYPH'
        if re.match(pattern, w):
            ot.append(g1(pattern.match(w)))
    return tuple(ot)


def tup_rem(tupstr, pattern=pipematch2):
    ot = []
    for w in tupstr:
        if w == '|HYPH': w = '-|HYPH'
        ot.append(re.sub(pattern,'',w))
    return tuple(ot)

testtup = simp_res.ngram[149878]

tup_matcher(testtup)

('-', 'Normandie')

In [21]:
# Pipe, followed by 2-4 capitals and maybe a $ OR by specific punctuation marks OR by the tags for parentheses
pipematch2 = re.compile("\|([A-Z]{2,4}\$?|[\$\:,\.\"]|``|\-LRB\-?|\-RRB\-?)")

testtup = simp_res.ngram[14319]

tup_rem(testtup)

('then', '-', 'President')

In [22]:
gs_dict = {'F_ENC' : (F_ENC,'gs_score',-1), 
           'R_ENC' : (R_ENC,'gs_score',-1), 
           'MC_VPC': (MC_VPC,'gs_score',-1), 
           'D_ADJN': (D_ADJN,'gs_score',-1), 
           'MC_VN' : (MC_VN,'gs_score',-1),
           'RE_WIKI15' : (robegg_df,'compo',1)
          }

In [23]:
simp_res['ngram_tag'] = simp_res.ngram
simp_res['ngram'] = simp_res.ngram_tag.apply(tup_rem)

simp_res

Unnamed: 0,ngram,freq,poisson,len,batch,stopwords,component_cosims,cosine_sim,ngram_tag
0,"(was, born, here)",32,-2119.509496,3,8,"[1, 1, 0]","[nan, nan, -0.21604778]",-0.216048,"(was|VBD, born|VBN, here|RB)"
1,"(s, then, -)",20,-1419.198013,3,4,"[1, 0, 1]","[nan, -0.15213177, nan]",-0.152132,"(s|POS, then|RB, |HYPH)"
2,"(this, time, for)",20,-1389.097772,3,9,"[0, 0, 1]","[-0.15749776, -0.13361101, nan]",-0.145554,"(this|DT, time|NN, for|IN)"
3,"(other, two, being)",28,-1831.298757,3,10,"[0, 0, 0]","[-0.13134842, -0.09783523, -0.19934192]",-0.142842,"(other|JJ, two|CD, being|VBG)"
4,"(In, This)",26,-1380.814735,2,7,"[1, 0]","[nan, -0.13744499]",-0.137445,"(In|IN, This|DT)"
...,...,...,...,...,...,...,...,...,...
149995,"(%, 7, %)",20,-1332.973488,3,13,[-2],[],,"(%|NN, 7|CD, %|NN)"
149996,"(the, Were, -)",22,-1422.855539,3,14,"[1, -1, 1]","[nan, nan, nan]",,"(the|DT, Were|VBN, |HYPH)"
149997,"(of, Schwarzburg, -)",24,-1551.293123,3,14,"[1, -1, 1]","[nan, nan, nan]",,"(of|IN, Schwarzburg|NNP, |HYPH)"
149998,"(the, Nanboku, -)",33,-2145.863245,3,14,"[1, -1, 1]","[nan, nan, nan]",,"(the|DT, Nanboku|NNP, |HYPH)"


In [24]:
simp_res[simp_res.ngram == ('Ving', 'Rhames')]

Unnamed: 0,ngram,freq,poisson,len,batch,stopwords,component_cosims,cosine_sim,ngram_tag
144663,"(Ving, Rhames)",20,-606.668931,2,1,"[-1, -1]","[nan, nan]",,"(Ving|NNP, Rhames|NNP)"


In [25]:
simp_res = simp_res[~np.isnan(simp_res.cosine_sim)].sort_values('cosine_sim').rename(columns={'ngram' : 'mwe'}).reset_index(drop=True)

simp_res

Unnamed: 0,mwe,freq,poisson,len,batch,stopwords,component_cosims,cosine_sim,ngram_tag
0,"(was, born, here)",32,-2119.509496,3,8,"[1, 1, 0]","[nan, nan, -0.21604778]",-0.216048,"(was|VBD, born|VBN, here|RB)"
1,"(s, then, -)",20,-1419.198013,3,4,"[1, 0, 1]","[nan, -0.15213177, nan]",-0.152132,"(s|POS, then|RB, |HYPH)"
2,"(this, time, for)",20,-1389.097772,3,9,"[0, 0, 1]","[-0.15749776, -0.13361101, nan]",-0.145554,"(this|DT, time|NN, for|IN)"
3,"(other, two, being)",28,-1831.298757,3,10,"[0, 0, 0]","[-0.13134842, -0.09783523, -0.19934192]",-0.142842,"(other|JJ, two|CD, being|VBG)"
4,"(In, This)",26,-1380.814735,2,7,"[1, 0]","[nan, -0.13744499]",-0.137445,"(In|IN, This|DT)"
...,...,...,...,...,...,...,...,...,...
143642,"(SSE4, 1)",32,-1397.623791,2,1,"[0, 1]","[0.980065, nan]",0.980065,"(SSE4|NN, 1|CD)"
143643,"(-, Lapu)",22,-994.898375,2,1,"[1, 0]","[nan, 0.98205227]",0.982052,"(|HYPH, Lapu|NNP)"
143644,"(-, Marnand)",24,-1085.279502,2,1,"[1, 0]","[nan, 0.98449343]",0.984493,"(|HYPH, Marnand|NNP)"
143645,"(WIN, WIN)",70,-2402.440072,2,6,"[0, 0]","[0.9861782, 0.9861782]",0.986178,"(WIN|VB, WIN|VB)"


In [26]:
_intersection = simp_res[['mwe', 'cosine_sim']].merge(F_ENC[['mwe', 'gs_score']], on='mwe', how='inner')

_intersection

Unnamed: 0,mwe,cosine_sim,gs_score
0,"(business, man)",0.093663,0
1,"(zip, code)",0.102839,4
2,"(death, row)",0.115153,4
3,"(death, notice)",0.127828,0
4,"(command, line)",0.150381,3
...,...,...,...
173,"(touch, screen)",0.678733,1
174,"(tape, recorder)",0.682743,0
175,"(bony, fish)",0.688756,0
176,"(particle, physics)",0.699622,0


In [27]:
simp_results = {}

for gs in gs_dict.keys():
    simp_results[gs] = corr_with_gs(simp_res,'cosine_sim', gs_dict[gs][0],  tiebreak = 'poisson', refscore = gs_dict[gs][1], mult = gs_dict[gs][-1])
    
simp_results

{'F_ENC': (175, 0.22902421605697812, 0.2423163614162586),
 'R_ENC': (11, 0.4100238428203847, 0.396349198913055),
 'MC_VPC': (14, 0.11245927007520762, 0.10926147730010077),
 'D_ADJN': (33, 0.26876149464489874, 0.30397591337092333),
 'MC_VN': (35, 0.3390869220985702, 0.3611062679818502),
 'RE_WIKI15': (74710, 0.5334823992028059, 0.5492710552900145)}

In [28]:
simp_low_results = {}

for gs in gs_dict.keys():
    simp_low_results[gs] = corr_with_gs(simp_res,'cosine_sim', gs_dict[gs][0],  tiebreak = 'poisson', refscore = gs_dict[gs][1], mult = gs_dict[gs][-1], lowcase = True)
    
simp_low_results

{'F_ENC': (222, 0.13521972409019778, 0.11963828494872672),
 'R_ENC': (16, 0.25184121568510415, 0.2043730132921946),
 'MC_VPC': (15, 0.16337759414053082, 0.06825241384104669),
 'D_ADJN': (48, 0.1943313383568205, 0.22798529053510802),
 'MC_VN': (37, 0.352470735408605, 0.37163346288397714),
 'RE_WIKI15': (75734, 0.50623876520901, 0.5227199971813515)}

In [29]:
# 10% Wiki results - w2v

In [30]:
w10p_w2v = pd.read_csv(datapath+'/Models/1 w2v/Tagged/Results/w10p_tagged_light_001.csv', 
                       converters= converter
                      )

In [31]:
w10p_w2v['ngram_tag'] = w10p_w2v.ngram
w10p_w2v['ngram'] = w10p_w2v.ngram_tag.apply(tup_rem)

w10p_w2v

Unnamed: 0,ngram,freq,poisson,len,batch,stopwords,component_cosims,cosine_sim,ngram_tag
0,"(delivered, from)",28,-1567.428918,2,5,"[0, 1]","[-0.2535712, nan]",-0.253571,"(delivered|VBD, from|IN)"
1,"(then, had, two)",20,-1586.812073,3,3,"[0, 1, 1]","[-0.24625674, nan, nan]",-0.246257,"(then|RB, had|VBD, two|CD)"
2,"(and, finally, for)",20,-1647.756550,3,4,"[1, 0, 1]","[nan, -0.22293654, nan]",-0.222937,"(and|CC, finally|RB, for|IN)"
3,"(one, before, it)",20,-1587.727730,3,7,"[1, 0, 1]","[nan, -0.21940775, nan]",-0.219408,"(one|CD, before|IN, it|PRP)"
4,"(However, in, their)",20,-1642.620348,3,3,"[0, 1, 1]","[-0.2108197, nan, nan]",-0.210820,"(However|RB, in|IN, their|PRP$)"
...,...,...,...,...,...,...,...,...,...
499995,"(the, Barebones, Parliament)",23,-1552.893367,3,10,"[1, -1, 0]","[nan, nan, 0.38508964]",,"(the|DT, Barebones|NNPS, Parliament|NNP)"
499996,"(2nd, 2nd, 2nd)",24,-1598.434969,3,10,[-2],[],,"(2nd|JJ, 2nd|JJ, 2nd|JJ)"
499997,"(was, 49,445, in)",22,-1600.048879,3,10,"[1, -1, 1]","[nan, nan, nan]",,"(was|VBD, 49,445|CD, in|IN)"
499998,"(a, modicum, of)",22,-1637.924770,3,10,"[1, -1, 1]","[nan, nan, nan]",,"(a|DT, modicum|NN, of|IN)"


In [32]:
w10p_w2v = w10p_w2v[~np.isnan(w10p_w2v.cosine_sim)].sort_values('cosine_sim').rename(columns={'ngram' : 'mwe', 'cosine_sim' : 'cosim_10w'}).reset_index(drop=True)

In [33]:
w10p_w2v

Unnamed: 0,mwe,freq,poisson,len,batch,stopwords,component_cosims,cosim_10w,ngram_tag
0,"(delivered, from)",28,-1567.428918,2,5,"[0, 1]","[-0.2535712, nan]",-0.253571,"(delivered|VBD, from|IN)"
1,"(then, had, two)",20,-1586.812073,3,3,"[0, 1, 1]","[-0.24625674, nan, nan]",-0.246257,"(then|RB, had|VBD, two|CD)"
2,"(and, finally, for)",20,-1647.756550,3,4,"[1, 0, 1]","[nan, -0.22293654, nan]",-0.222937,"(and|CC, finally|RB, for|IN)"
3,"(one, before, it)",20,-1587.727730,3,7,"[1, 0, 1]","[nan, -0.21940775, nan]",-0.219408,"(one|CD, before|IN, it|PRP)"
4,"(However, in, their)",20,-1642.620348,3,3,"[0, 1, 1]","[-0.2108197, nan, nan]",-0.210820,"(However|RB, in|IN, their|PRP$)"
...,...,...,...,...,...,...,...,...,...
482425,"(środek.svgFile, Wygaszony, środek.svgFile)",36,-1588.358559,3,1,"[0, 0, 0]","[0.9717464, 0.9375481, 0.9717464]",0.960347,"(środek.svgFile|NNP, Wygaszony|NNP, środek.svg..."
482426,"(Recurvirostridae, is)",25,-1276.332579,2,4,"[0, 1]","[0.96257687, nan]",0.962577,"(Recurvirostridae|NNP, is|VBZ)"
482427,"(OUL3, CRO1, CRO2)",36,-1515.104871,3,2,"[0, 0, 0]","[0.94491136, 0.97224206, 0.97286904]",0.963341,"(OUL3|NN, CRO1|NN, CRO2|NN)"
482428,"(Lapu, -)",22,-1127.048317,2,2,"[0, 1]","[0.9648907, nan]",0.964891,"(Lapu|NNP, |HYPH)"


In [34]:
gs_dict = {'F_ENC' : (F_ENC,'gs_score',-1), 
           'R_ENC' : (R_ENC,'gs_score',-1), 
           'MC_VPC': (MC_VPC,'gs_score',-1), 
           'D_ADJN': (D_ADJN,'gs_score',-1), 
           'MC_VN' : (MC_VN,'gs_score',-1),
           'RE_WIKI15' : (robegg_df,'compo',1),
           'W2V_SIMP20': (simp_res,'cosine_sim',1),
          }

In [35]:
w10p_w2v_results = {}

for gs in gs_dict.keys():
    w10p_w2v_results[gs] = corr_with_gs(w10p_w2v,'cosim_10w', gs_dict[gs][0], tiebreak = 'poisson', refscore = gs_dict[gs][1], mult = gs_dict[gs][-1])
    
w10p_w2v_results

{'F_ENC': (77, 0.3338230676249727, 0.34934247926095696),
 'R_ENC': (10, 0.29696969696969694, 0.07717666068699167),
 'MC_VPC': (32, 0.09080921630368811, 0.10787230663222218),
 'D_ADJN': (9, 0.3025316904537665, 0.4184102581236103),
 'MC_VN': (49, 0.22921891575582556, 0.23310807082891202),
 'RE_WIKI15': (111526, 0.5975461349977123, 0.6042986100879869),
 'W2V_SIMP20': (7230, 0.6237011383627981, 0.6350345458432904)}

In [36]:
w10p_w2v_low_results = {}

for gs in gs_dict.keys():
    w10p_w2v_low_results[gs] = corr_with_gs(w10p_w2v,'cosim_10w', gs_dict[gs][0], tiebreak = 'poisson', refscore = gs_dict[gs][1], mult = gs_dict[gs][-1], lowcase = True)
    
w10p_w2v_low_results

{'F_ENC': (225, 0.10189399329703026, 0.10292981690017042),
 'R_ENC': (33, 0.08777797459331552, 0.10858430253728699),
 'MC_VPC': (40, 0.009219227683774676, 0.04929705380217415),
 'D_ADJN': (47, 0.15577280705314026, 0.18893560425713435),
 'MC_VN': (59, 0.0825885665715718, 0.07898790495746937),
 'RE_WIKI15': (124553, 0.54478305802168, 0.5447835857143559),
 'W2V_SIMP20': (16185, 0.5090069998411715, 0.5208077498480876)}

## Results with cleansed tagged corpora
Some tokens (e.g. hyphens) removed / recombined

Also filtered candidate list to remove some low-quality items

In [37]:
converter = {'ngram' : eval,
             #'stopwords' : eval
             #'mwe_vector' : eval,
             #'component_vectors' : eval,
             #'base_nearest' : eval,
             #'mwe_nearest' : eval,
            }

simp_cln_res = pd.read_csv(datapath+'/Models/1 w2v/Tagged/Results/simp_tagged_clean_light_001.csv', 
                       converters= converter
                      )

In [38]:
simp_cln_res['ngram_tag'] = simp_cln_res.ngram
simp_cln_res['ngram'] = simp_cln_res.ngram_tag.apply(tup_rem)

simp_cln_res

Unnamed: 0,ngram,freq,poisson,len,batch,stopwords,component_cosims,cosine_sim,ngram_tag
0,"(was, born, here)",32,-2115.954533,3,7,"[1, 1, 0]","[nan, nan, -0.26227063]",-0.262271,"(was|VBD, born|VBN, here|RB)"
1,"(to, found)",54,-2767.761057,2,7,"[1, 0]","[nan, -0.16069299]",-0.160693,"(to|IN, found|VBN)"
2,"(:, end)",23,-1078.890289,2,2,"[1, 0]","[nan, -0.15938467]",-0.159385,"(:|:, end|VB)"
3,"(born, in, then)",26,-1826.089851,3,3,"[1, 1, 0]","[nan, nan, -0.15382023]",-0.153820,"(born|VBN, in|IN, then|RB)"
4,"(is, both, the)",35,-2521.230138,3,11,"[1, 0, 1]","[nan, -0.14895083, nan]",-0.148951,"(is|VBZ, both|CC, the|DT)"
...,...,...,...,...,...,...,...,...,...
149995,"(of, Chaffin's, Farm)",58,-3167.096834,3,9,"[1, -1, 0]","[nan, nan, 0.23580538]",,"(of|IN, Chaffin|NNP's, Farm|NNP)"
149996,"(NHL, Plus/Minus, Award)",38,-1973.585423,3,10,"[0, -1, 0]","[0.14594129, nan, 0.28706568]",,"(NHL|NNP, Plus|NNP/Minus|NNP, Award|NNP)"
149997,"(the, Goblet, of)",40,-2656.494897,3,10,"[1, -1, 1]","[nan, nan, nan]",,"(the|DT, Goblet|NNP, of|IN)"
149998,"(state, state, state)",48,-2965.054834,3,10,[-2],[],,"(state|NN, state|NN, state|NN)"


In [39]:
simp_cln_res = simp_cln_res[~np.isnan(simp_cln_res.cosine_sim)].sort_values('cosine_sim').rename(columns={'ngram' : 'mwe'}).reset_index(drop=True)

simp_cln_res

Unnamed: 0,mwe,freq,poisson,len,batch,stopwords,component_cosims,cosine_sim,ngram_tag
0,"(was, born, here)",32,-2115.954533,3,7,"[1, 1, 0]","[nan, nan, -0.26227063]",-0.262271,"(was|VBD, born|VBN, here|RB)"
1,"(to, found)",54,-2767.761057,2,7,"[1, 0]","[nan, -0.16069299]",-0.160693,"(to|IN, found|VBN)"
2,"(:, end)",23,-1078.890289,2,2,"[1, 0]","[nan, -0.15938467]",-0.159385,"(:|:, end|VB)"
3,"(born, in, then)",26,-1826.089851,3,3,"[1, 1, 0]","[nan, nan, -0.15382023]",-0.153820,"(born|VBN, in|IN, then|RB)"
4,"(is, both, the)",35,-2521.230138,3,11,"[1, 0, 1]","[nan, -0.14895083, nan]",-0.148951,"(is|VBZ, both|CC, the|DT)"
...,...,...,...,...,...,...,...,...,...
143746,"(GBR, GER)",31,-1020.735228,2,4,"[0, 0]","[0.9553108, 0.9801254]",0.967718,"(GBR|NN, GER|NN)"
143747,"(3, -0)",61,-2648.449939,2,2,"[1, 0]","[nan, 0.96958727]",0.969587,"(3|CD, -0|SYM)"
143748,"(Maerz, and)",49,-2302.220234,2,3,"[0, 1]","[0.97895634, nan]",0.978956,"(Maerz|NNP, and|CC)"
143749,"(WIN, WIN)",54,-1893.075074,2,3,"[0, 0]","[0.97907865, 0.97907865]",0.979079,"(WIN|NNP, WIN|NNP)"


In [40]:
gs_dict = {'F_ENC' : (F_ENC,'gs_score',-1), 
           'R_ENC' : (R_ENC,'gs_score',-1), 
           'MC_VPC': (MC_VPC,'gs_score',-1), 
           'D_ADJN': (D_ADJN,'gs_score',-1), 
           'MC_VN' : (MC_VN,'gs_score',-1),
           'RE_WIKI15' : (robegg_df,'compo',1)
          }

In [41]:
simp_cln_results = {}

for gs in gs_dict.keys():
    simp_cln_results[gs] = corr_with_gs(simp_cln_res,'cosine_sim', gs_dict[gs][0],  tiebreak = 'poisson', refscore = gs_dict[gs][1], mult = gs_dict[gs][-1])
    
simp_cln_results

{'F_ENC': (185, 0.19221739015340925, 0.2152835608183971),
 'R_ENC': (13, 0.31129594706090835, 0.43709219321464243),
 'MC_VPC': (14, 0.1477406097066453, 0.1137617851495493),
 'D_ADJN': (37, 0.36341001733191636, 0.33764109902067596),
 'MC_VN': (37, 0.35083809092223195, 0.4521265564587202),
 'RE_WIKI15': (79326, 0.5555308640833292, 0.5688857603476392)}

In [42]:
simp_cln_low_results = {}

for gs in gs_dict.keys():
    simp_cln_low_results[gs] = corr_with_gs(simp_cln_res,'cosine_sim', gs_dict[gs][0],  tiebreak = 'poisson', refscore = gs_dict[gs][1], mult = gs_dict[gs][-1], lowcase = True)
    
simp_cln_low_results

{'F_ENC': (233, 0.10955715751243494, 0.10195783539750472),
 'R_ENC': (17, 0.23190201505672198, 0.2169065227134563),
 'MC_VPC': (15, 0.19210332497842636, 0.06129537111630709),
 'D_ADJN': (50, 0.2617509476946104, 0.2584818638036634),
 'MC_VN': (39, 0.3507440147572101, 0.44099631239336184),
 'RE_WIKI15': (79818, 0.5245347427379836, 0.5399180885611622)}

Wiki (10%) results

In [43]:
converter = {'ngram' : eval,
             #'stopwords' : eval
             #'mwe_vector' : eval,
             #'component_vectors' : eval,
             #'base_nearest' : eval,
             #'mwe_nearest' : eval,
            }

w10p_cln_res = pd.read_csv(datapath+'/Models/1 w2v/Tagged/Results/w10p_tagged_clean_light_001.csv', 
                       converters= converter
                      )

In [44]:
w10p_cln_res['ngram_tag'] = w10p_cln_res.ngram
w10p_cln_res['ngram'] = w10p_cln_res.ngram_tag.apply(tup_rem)

w10p_cln_res

Unnamed: 0,ngram,freq,poisson,len,batch,stopwords,component_cosims,cosine_sim,ngram_tag
0,"(However, since, then)",20,-1501.356357,3,4,"[0, 0, 0]","[-0.22124107, -0.1947155, -0.2786541]",-0.231537,"(However|RB, since|IN, then|RB)"
1,"(has, been, following)",20,-1563.921228,3,4,"[1, 1, 0]","[nan, nan, -0.22919384]",-0.229194,"(has|VBZ, been|VBN, following|VBG)"
2,"(both, times, by)",21,-1608.202878,3,8,"[0, 0, 1]","[-0.26926857, -0.18902278, nan]",-0.229146,"(both|CC, times|NNS, by|IN)"
3,"(after, both, the)",20,-1643.568467,3,3,"[1, 0, 1]","[nan, -0.22798495, nan]",-0.227985,"(after|IN, both|CC, the|DT)"
4,"(However, even, after)",20,-1499.074445,3,2,"[0, 0, 1]","[-0.17959411, -0.27630407, nan]",-0.227949,"(However|RB, even|RB, after|IN)"
...,...,...,...,...,...,...,...,...,...
499995,"(:, Paleogene, from)",24,-1669.762522,3,10,[-2],[],,"(:|:, Paleogene|NNP, from|IN)"
499996,"(of, $, 3,000)",22,-1681.886235,3,10,[-2],[],,"(of|IN, $|$, 3,000|CD)"
499997,"(la, la, la)",25,-1687.517191,3,10,[-2],[],,"(la|NNP, la|NNP, la|NNP)"
499998,"(2015, -2016, school)",25,-1696.555611,3,10,[-2],[],,"(2015|CD, -2016|CD, school|NN)"


In [45]:
w10p_cln_res = w10p_cln_res[~np.isnan(w10p_cln_res.cosine_sim)].sort_values('cosine_sim').rename(columns={'ngram' : 'mwe', 'cosine_sim' : 'cosim_w'}).reset_index(drop=True)

w10p_cln_res

Unnamed: 0,mwe,freq,poisson,len,batch,stopwords,component_cosims,cosim_w,ngram_tag
0,"(However, since, then)",20,-1501.356357,3,4,"[0, 0, 0]","[-0.22124107, -0.1947155, -0.2786541]",-0.231537,"(However|RB, since|IN, then|RB)"
1,"(has, been, following)",20,-1563.921228,3,4,"[1, 1, 0]","[nan, nan, -0.22919384]",-0.229194,"(has|VBZ, been|VBN, following|VBG)"
2,"(both, times, by)",21,-1608.202878,3,8,"[0, 0, 1]","[-0.26926857, -0.18902278, nan]",-0.229146,"(both|CC, times|NNS, by|IN)"
3,"(after, both, the)",20,-1643.568467,3,3,"[1, 0, 1]","[nan, -0.22798495, nan]",-0.227985,"(after|IN, both|CC, the|DT)"
4,"(However, even, after)",20,-1499.074445,3,2,"[0, 0, 1]","[-0.17959411, -0.27630407, nan]",-0.227949,"(However|RB, even|RB, after|IN)"
...,...,...,...,...,...,...,...,...,...
473593,"(OUL2, OUL3, CRO1)",36,-1537.197964,3,3,"[0, 0, 0]","[0.9592465, 0.95547867, 0.97482795]",0.963184,"(OUL2|NN, OUL3|NN, CRO1|NN)"
473594,"(Recurvirostridae, is)",25,-1275.232759,2,4,"[0, 1]","[0.9634324, nan]",0.963432,"(Recurvirostridae|NNP, is|VBZ)"
473595,"(Laridae, is)",29,-1486.804386,2,4,"[0, 1]","[0.965927, nan]",0.965927,"(Laridae|NNP, is|VBZ)"
473596,"(OUL3, CRO1, CRO2)",36,-1513.524083,3,2,"[0, 0, 0]","[0.9469416, 0.97366333, 0.98004997]",0.966885,"(OUL3|NN, CRO1|NN, CRO2|NN)"


In [46]:
gs_dict = {'F_ENC' : (F_ENC,'gs_score',-1), 
           'R_ENC' : (R_ENC,'gs_score',-1), 
           'MC_VPC': (MC_VPC,'gs_score',-1), 
           'D_ADJN': (D_ADJN,'gs_score',-1), 
           'MC_VN' : (MC_VN,'gs_score',-1),
           'RE_WIKI15' : (robegg_df,'compo',1),
           'SIMP_CLN' : (simp_cln_res,'cosine_sim',1)
          }

In [47]:
w10p_cln_results = {}

for gs in gs_dict.keys():
    w10p_cln_results[gs] = corr_with_gs(w10p_cln_res,'cosim_w', gs_dict[gs][0],  tiebreak = 'poisson', refscore = gs_dict[gs][1], mult = gs_dict[gs][-1])
    
w10p_cln_results

{'F_ENC': (85, 0.40210269557435935, 0.43961139646416136),
 'R_ENC': (10, 0.43030303030303024, 0.2957728492858003),
 'MC_VPC': (33, 0.10338509004332604, 0.09433848777417968),
 'D_ADJN': (9, 0.4117792453398488, 0.4680561982240944),
 'MC_VN': (48, 0.09221705102476564, 0.13106046365452512),
 'RE_WIKI15': (118713, 0.6059935849286735, 0.614088089608807),
 'SIMP_CLN': (7195, 0.637405758213973, 0.6523061701448993)}

In [48]:
w10p_cln_low_results = {}

for gs in gs_dict.keys():
    w10p_cln_low_results[gs] = corr_with_gs(w10p_cln_res,'cosim_w', gs_dict[gs][0],  tiebreak = 'poisson', refscore = gs_dict[gs][1], mult = gs_dict[gs][-1], lowcase = True)
    
w10p_cln_low_results

{'F_ENC': (240, 0.16582141912531387, 0.178266045426923),
 'R_ENC': (33, 0.2740344768732269, 0.24561427171704078),
 'MC_VPC': (41, 0.023144184244394916, 0.049946724373688765),
 'D_ADJN': (46, 0.13771828666123542, 0.19613921364612638),
 'MC_VN': (56, 0.0198032488170276, 0.05403398343492424),
 'RE_WIKI15': (131840, 0.5599330982060865, 0.56116313483496),
 'SIMP_CLN': (16789, 0.5257183831443918, 0.53523878133824)}

## Results with proper noun filtering
Removed n-grams with all tokens tagged as NNP/NNPS (including possessive versions)

In [49]:
converter = {'ngram' : eval,
             #'stopwords' : eval
             #'mwe_vector' : eval,
             #'component_vectors' : eval,
             #'base_nearest' : eval,
             #'mwe_nearest' : eval,
            }

simp_nopn_res = pd.read_csv(datapath+'/Models/1 w2v/Tagged/Results/simp_tagged_nopn_light_001.csv', 
                       converters= converter
                      )

In [50]:
simp_nopn_res['ngram_tag'] = simp_nopn_res.ngram
simp_nopn_res['ngram'] = simp_nopn_res.ngram_tag.apply(tup_rem)

simp_nopn_res

Unnamed: 0,ngram,freq,poisson,len,batch,stopwords,component_cosims,cosine_sim,ngram_tag
0,"(was, born, here)",32,-2115.954533,3,7,"[1, 1, 0]","[nan, nan, -0.17546517]",-0.175465,"(was|VBD, born|VBN, here|RB)"
1,"(and, finally, the)",33,-2348.089994,3,7,"[1, 0, 1]","[nan, -0.17055225, nan]",-0.170552,"(and|CC, finally|RB, the|DT)"
2,"(born, here)",41,-1946.682114,2,6,"[1, 0]","[nan, -0.16723849]",-0.167238,"(born|VBN, here|RB)"
3,"(is, both, the)",35,-2521.230138,3,7,"[1, 0, 1]","[nan, -0.15586732, nan]",-0.155867,"(is|VBZ, both|CC, the|DT)"
4,"(to, found, the)",25,-1819.659553,3,2,"[1, 0, 1]","[nan, -0.14139682, nan]",-0.141397,"(to|IN, found|VBN, the|DT)"
...,...,...,...,...,...,...,...,...,...
149995,"(1, Chemung, County)",52,-2990.167422,3,10,"[1, -1, 0]","[nan, nan, 0.6229615]",,"(1|CD, Chemung|NNP, County|NNP)"
149996,"(1, Tioga, County)",60,-3442.616347,3,10,"[1, -1, 0]","[nan, nan, 0.63835496]",,"(1|CD, Tioga|NNP, County|NNP)"
149997,"(state, state, state)",48,-2965.054834,3,13,[-2],[],,"(state|NN, state|NN, state|NN)"
149998,"(I, Kissed, a)",35,-2085.513275,3,14,"[0, -1, 1]","[0.1323022, nan, nan]",,"(I|PRP, Kissed|VBD, a|DT)"


In [51]:
simp_nopn_res = simp_nopn_res[~np.isnan(simp_nopn_res.cosine_sim)].sort_values('cosine_sim').rename(columns={'ngram' : 'mwe'}).reset_index(drop=True)

simp_nopn_res

Unnamed: 0,mwe,freq,poisson,len,batch,stopwords,component_cosims,cosine_sim,ngram_tag
0,"(was, born, here)",32,-2115.954533,3,7,"[1, 1, 0]","[nan, nan, -0.17546517]",-0.175465,"(was|VBD, born|VBN, here|RB)"
1,"(and, finally, the)",33,-2348.089994,3,7,"[1, 0, 1]","[nan, -0.17055225, nan]",-0.170552,"(and|CC, finally|RB, the|DT)"
2,"(born, here)",41,-1946.682114,2,6,"[1, 0]","[nan, -0.16723849]",-0.167238,"(born|VBN, here|RB)"
3,"(is, both, the)",35,-2521.230138,3,7,"[1, 0, 1]","[nan, -0.15586732, nan]",-0.155867,"(is|VBZ, both|CC, the|DT)"
4,"(to, found, the)",25,-1819.659553,3,2,"[1, 0, 1]","[nan, -0.14139682, nan]",-0.141397,"(to|IN, found|VBN, the|DT)"
...,...,...,...,...,...,...,...,...,...
145601,"(DNQ, DNQ)",113,-4174.165390,2,4,"[0, 0]","[0.96868503, 0.96868503]",0.968685,"(DNQ|NN, DNQ|NN)"
145602,"(3, -0)",61,-2648.449939,2,2,"[1, 0]","[nan, 0.97882754]",0.978828,"(3|CD, -0|SYM)"
145603,"(Maerz, and)",49,-2302.220234,2,2,"[0, 1]","[0.97951883, nan]",0.979519,"(Maerz|NNP, and|CC)"
145604,"(GBR, GER)",31,-1020.735228,2,4,"[0, 0]","[0.97934204, 0.9822897]",0.980816,"(GBR|NN, GER|NN)"


In [52]:
gs_dict = {'F_ENC' : (F_ENC,'gs_score',-1), 
           'R_ENC' : (R_ENC,'gs_score',-1), 
           'MC_VPC': (MC_VPC,'gs_score',-1), 
           'D_ADJN': (D_ADJN,'gs_score',-1), 
           'MC_VN' : (MC_VN,'gs_score',-1),
           'RE_WIKI15' : (robegg_df,'compo',1)
          }

In [53]:
simp_nopn_results = {}

for gs in gs_dict.keys():
    simp_nopn_results[gs] = corr_with_gs(simp_nopn_res,'cosine_sim', gs_dict[gs][0],  tiebreak = 'poisson', refscore = gs_dict[gs][1], mult = gs_dict[gs][-1])
    
simp_nopn_results

{'F_ENC': (210, 0.17061638984666755, 0.19954712832600852),
 'R_ENC': (17, 0.312232574580384, 0.4469984941590157),
 'MC_VPC': (15, 0.13978584413286751, 0.09076594620784136),
 'D_ADJN': (44, 0.3234060648948002, 0.2998132230982493),
 'MC_VN': (39, 0.3585551167197633, 0.43022294223059876),
 'RE_WIKI15': (79047, 0.6221155445728576, 0.6284548120728505)}

In [54]:
simp_nopn_low_results = {}

for gs in gs_dict.keys():
    simp_nopn_low_results[gs] = corr_with_gs(simp_nopn_res,'cosine_sim', gs_dict[gs][0],  tiebreak = 'poisson', refscore = gs_dict[gs][1], mult = gs_dict[gs][-1], lowcase = True)
    
simp_nopn_low_results

{'F_ENC': (228, 0.13013537276805118, 0.15275661698423235),
 'R_ENC': (20, 0.2711869215727743, 0.39437120109987356),
 'MC_VPC': (16, 0.19068991353923495, 0.03790067321279876),
 'D_ADJN': (52, 0.26111278334538285, 0.2451324903283014),
 'MC_VN': (40, 0.38331455079214344, 0.4417636968180419),
 'RE_WIKI15': (79099, 0.5791301007276244, 0.5873246499013689)}

In [55]:
converter = {'ngram' : eval,
             #'stopwords' : eval
             #'mwe_vector' : eval,
             #'component_vectors' : eval,
             #'base_nearest' : eval,
             #'mwe_nearest' : eval,
            }

w10p_nopn_res = pd.read_csv(datapath+'/Models/1 w2v/Tagged/Results/w10p_tagged_nopn_light_001.csv', 
                       converters= converter
                      )

In [56]:
w10p_nopn_res['ngram_tag'] = w10p_nopn_res.ngram
w10p_nopn_res['ngram'] = w10p_nopn_res.ngram_tag.apply(tup_rem)

w10p_nopn_res

Unnamed: 0,ngram,freq,poisson,len,batch,stopwords,component_cosims,cosine_sim,ngram_tag
0,"(one, on, both)",22,-1743.834796,3,4,"[1, 1, 0]","[nan, nan, -0.24756436]",-0.247564,"(one|CD, on|IN, both|CC)"
1,"(were, once, the)",20,-1599.272787,3,7,"[1, 0, 1]","[nan, -0.23794496, nan]",-0.237945,"(were|VBD, once|IN, the|DT)"
2,"(The, four, were)",21,-1718.246421,3,1,"[1, 0, 1]","[nan, -0.23560974, nan]",-0.235610,"(The|DT, four|CD, were|VBD)"
3,"(In, addition, their)",20,-1565.757225,3,8,"[1, 0, 1]","[nan, -0.227985, nan]",-0.227985,"(In|IN, addition|NN, their|PRP$)"
4,"(was, subsequently, the)",21,-1759.767362,3,7,"[1, 0, 1]","[nan, -0.22302704, nan]",-0.223027,"(was|VBD, subsequently|RB, the|DT)"
...,...,...,...,...,...,...,...,...,...
499995,"(of, I, 've)",23,-1786.381671,3,10,[-2],[],,"(of|IN, I|PRP, 've|VBP)"
499996,"((, BN, –)",28,-1788.082423,3,10,[-2],[],,"((|-LRB-, BN|NN, –|SYM)"
499997,"(/, 1,000, live)",27,-1810.125560,3,10,[-2],[],,"(/|SYM, 1,000|CD, live|JJ)"
499998,"(/, vocals)",34,-1810.642550,2,10,[-2],[],,"(/|,, vocals|NNS)"


In [60]:
w10p_nopn_res = w10p_nopn_res[~np.isnan(w10p_nopn_res.cosine_sim)].sort_values('cosine_sim').rename(columns={'ngram' : 'mwe', 'cosine_sim' : 'cosim_wnpn'}).reset_index(drop=True)

w10p_nopn_res

Unnamed: 0,mwe,freq,poisson,len,batch,stopwords,component_cosims,cosim_wnpn,ngram_tag
0,"(one, on, both)",22,-1743.834796,3,4,"[1, 1, 0]","[nan, nan, -0.24756436]",-0.247564,"(one|CD, on|IN, both|CC)"
1,"(were, once, the)",20,-1599.272787,3,7,"[1, 0, 1]","[nan, -0.23794496, nan]",-0.237945,"(were|VBD, once|IN, the|DT)"
2,"(The, four, were)",21,-1718.246421,3,1,"[1, 0, 1]","[nan, -0.23560974, nan]",-0.235610,"(The|DT, four|CD, were|VBD)"
3,"(In, addition, their)",20,-1565.757225,3,8,"[1, 0, 1]","[nan, -0.227985, nan]",-0.227985,"(In|IN, addition|NN, their|PRP$)"
4,"(was, subsequently, the)",21,-1759.767362,3,7,"[1, 0, 1]","[nan, -0.22302704, nan]",-0.223027,"(was|VBD, subsequently|RB, the|DT)"
...,...,...,...,...,...,...,...,...,...
479576,"(Laridae, is)",29,-1486.804386,2,2,"[0, 1]","[0.9633833, nan]",0.963383,"(Laridae|NNP, is|VBZ)"
479577,"(OUL3, CRO1, CRO2)",36,-1513.524083,3,2,"[0, 0, 0]","[0.9517838, 0.9673035, 0.97172594]",0.963604,"(OUL3|NN, CRO1|NN, CRO2|NN)"
479578,"(Recurvirostridae, is, a)",25,-1825.463688,3,5,"[0, 1, 1]","[0.9652813, nan, nan]",0.965281,"(Recurvirostridae|NNP, is|VBZ, a|DT)"
479579,"(Recurvirostridae, is)",25,-1275.232759,2,2,"[0, 1]","[0.9660529, nan]",0.966053,"(Recurvirostridae|NNP, is|VBZ)"


In [59]:
gs_dict = {'F_ENC' : (F_ENC,'gs_score',-1), 
           'R_ENC' : (R_ENC,'gs_score',-1), 
           'MC_VPC': (MC_VPC,'gs_score',-1), 
           'D_ADJN': (D_ADJN,'gs_score',-1), 
           'MC_VN' : (MC_VN,'gs_score',-1),
           'RE_WIKI15' : (robegg_df,'compo',1),
           'SIMP_NOPN' : (simp_nopn_res,'cosine_sim',1)
          }

In [61]:
w10p_nopn_results = {}

for gs in gs_dict.keys():
    w10p_nopn_results[gs] = corr_with_gs(w10p_nopn_res,'cosim_wnpn', gs_dict[gs][0],  tiebreak = 'poisson', refscore = gs_dict[gs][1], mult = gs_dict[gs][-1])
    
w10p_nopn_results

{'F_ENC': (95, 0.4266721721506415, 0.45486730180075524),
 'R_ENC': (10, 0.3939393939393939, 0.4190778237989436),
 'MC_VPC': (35, 0.06583859874083127, 0.07077462858090816),
 'D_ADJN': (10, 0.38415348299103563, 0.49724820103614964),
 'MC_VN': (52, 0.16808933397001719, 0.21351519046457657),
 'RE_WIKI15': (108741, 0.6715535041774463, 0.67720591179744),
 'SIMP_NOPN': (7698, 0.6267120295488908, 0.6406797435046593)}

In [62]:
w10p_nopn_low_results = {}

for gs in gs_dict.keys():
    w10p_nopn_low_results[gs] = corr_with_gs(w10p_nopn_res,'cosim_wnpn', gs_dict[gs][0],  tiebreak = 'poisson', refscore = gs_dict[gs][1], mult = gs_dict[gs][-1], lowcase = True)
    
w10p_nopn_low_results

{'F_ENC': (204, 0.1436148238189435, 0.15352605797589713),
 'R_ENC': (21, 0.36115623859964363, 0.3314016954284363),
 'MC_VPC': (43, 0.11005127666866041, 0.13655974090083092),
 'D_ADJN': (33, 0.30239181003440924, 0.3432912398301981),
 'MC_VN': (59, 0.08932539939212138, 0.10266876389855156),
 'RE_WIKI15': (123372, 0.6172770425319167, 0.6165229150438205),
 'SIMP_NOPN': (18950, 0.5191598243497096, 0.5288681376884276)}

# Investigations

In [33]:
import pickle

In [34]:
# Investigating lowered number of matches to reference sets - load long lists

with open(datapath+'/Corpora/wiki/enwiki_20200520/Tagged/stop.pkl', 'rb') as pfile:
    stop = pickle.load(pfile)
    
stop

{'%|NN',
 '0|CD',
 '1|CD',
 '2|CD',
 '3|CD',
 'He|PRP',
 'In|IN',
 'It|PRP',
 'The|DT',
 'also|RB',
 'and|CC',
 'an|DT',
 'are|VBP',
 'as|IN',
 'at|IN',
 'a|DT',
 'be|VB',
 'but|CC',
 'by|IN',
 'first|JJ',
 'for|IN',
 'from|IN',
 'had|VBD',
 'has|VBZ',
 'he|PRP',
 'his|PRP$',
 'in|IN',
 'is|VBZ',
 'its|PRP$',
 'it|PRP',
 'not|RB',
 'of|IN',
 'one|CD',
 'on|IN',
 'or|CC',
 's|POS',
 'that|IN',
 'that|WDT',
 'their|PRP$',
 'the|DT',
 'to|IN',
 'to|TO',
 'two|CD',
 'was|VBD',
 'were|VBD',
 'which|WDT',
 'who|WP',
 'with|IN',
 '|HYPH',
 '–|SYM'}

In [35]:
ngram_eval = pd.read_pickle(datapath+'/Corpora/wiki/enwiki_20200520/Tagged/ngram_eval.pkl')

ngram_eval

Unnamed: 0,ngram,freq,poisson,len,batch
0,"(311.22|CD, 500.86|CD)",20,-667.084219,2,1
1,"(Makhaya|NNP, Ntini|NNP)",20,-667.084219,2,1
2,"(resting_place_coordinates|NNS, burial_place|VBP)",20,-667.084219,2,1
3,"(Dechawat|NNP, Poomjaeng|NNP)",20,-667.084219,2,1
4,"(MSC1|NN, MSC2|NN)",20,-667.084219,2,1
...,...,...,...,...,...
499995,"(US|NNP, Steel|NNP)",32,-1679.033919,2,3
499996,"(Harry|NNP, Wilson|NNP)",32,-1679.034986,2,3
499997,"(a|DT, solo|JJ, piano|NN)",23,-1679.035464,3,2
499998,"(1962|CD, album|NN)",30,-1679.036008,2,6


In [36]:
ngram = pd.read_pickle(datapath+'/Corpora/wiki/enwiki_20200520/Tagged/ngram_df.pkl')

ngram

Unnamed: 0,ngram,freq,poisson,len
0,"(311.22|CD, 500.86|CD)",20,-6.670842e+02,2
1,"(Makhaya|NNP, Ntini|NNP)",20,-6.670842e+02,2
2,"(resting_place_coordinates|NNS, burial_place|VBP)",20,-6.670842e+02,2
3,"(Dechawat|NNP, Poomjaeng|NNP)",20,-6.670842e+02,2
4,"(MSC1|NN, MSC2|NN)",20,-6.670842e+02,2
...,...,...,...,...
1859185,"(on|IN, the|DT)",473024,-2.576335e+07,2
1859186,"(and|CC, the|DT)",492576,-2.778679e+07,2
1859187,"(to|IN, the|DT)",714100,-3.884106e+07,2
1859188,"(in|IN, the|DT)",1388181,-7.588717e+07,2


In [37]:
ngram['ngram_tag'] = ngram.ngram
ngram['ngram'] = ngram.ngram_tag.apply(tup_rem)

ngram

Unnamed: 0,ngram,freq,poisson,len,ngram_tag
0,"(311.22, 500.86)",20,-6.670842e+02,2,"(311.22|CD, 500.86|CD)"
1,"(Makhaya, Ntini)",20,-6.670842e+02,2,"(Makhaya|NNP, Ntini|NNP)"
2,"(resting_place_coordinates, burial_place)",20,-6.670842e+02,2,"(resting_place_coordinates|NNS, burial_place|VBP)"
3,"(Dechawat, Poomjaeng)",20,-6.670842e+02,2,"(Dechawat|NNP, Poomjaeng|NNP)"
4,"(MSC1, MSC2)",20,-6.670842e+02,2,"(MSC1|NN, MSC2|NN)"
...,...,...,...,...,...
1859185,"(on, the)",473024,-2.576335e+07,2,"(on|IN, the|DT)"
1859186,"(and, the)",492576,-2.778679e+07,2,"(and|CC, the|DT)"
1859187,"(to, the)",714100,-3.884106e+07,2,"(to|IN, the|DT)"
1859188,"(in, the)",1388181,-7.588717e+07,2,"(in|IN, the|DT)"


In [38]:
ngram = ngram.rename(columns={'ngram' : 'mwe'})

In [93]:
corr_with_gs(ngram, 'poisson', F_ENC, refscore='gs_score', tiebreak = 'freq', lowcase = False, mult=-1)

(683, 0.1248773442571316, 0.09286634145590356)

In [39]:
ngram_eval['ngram_tag'] = ngram_eval.ngram
ngram_eval['ngram'] = ngram_eval.ngram_tag.apply(tup_rem)

ngram_eval = ngram_eval.rename(columns={'ngram' : 'mwe'})

In [95]:
corr_with_gs(ngram, 'poisson', F_ENC, refscore='gs_score', tiebreak = 'freq', lowcase = False, mult=-1)

(683, 0.1248773442571316, 0.09286634145590356)

In [40]:
ngram2 = ngram[:1650000]

In [97]:
corr_with_gs(ngram2, 'poisson', F_ENC, refscore='gs_score', tiebreak = 'freq', lowcase = False, mult=-1)

(457, 0.046601180634674895, 0.015429356936201467)

In [41]:
checky = ngram[['mwe', 'freq', 'poisson']].merge(F_ENC, 
                          left_on=ngram.mwe, 
                          right_on = F_ENC.mwe, 
                          how='left')

In [42]:
checky[~np.isnan(checky.gs_score)]

Unnamed: 0,key_0,mwe_x,freq,poisson,mwe_y,gs_score
13106,"(lecture, notes)","(lecture, notes)",22,-960.544055,"(lecture, notes)",0.0
13499,"(mass, media)","(mass, media)",20,-962.678709,"(mass, media)",2.0
13818,"(hype, man)","(hype, man)",20,-964.457053,"(hype, man)",3.0
14502,"(cheek, teeth)","(cheek, teeth)",21,-968.324445,"(cheek, teeth)",3.0
19993,"(media, commentators)","(media, commentators)",20,-995.117244,"(media, commentators)",0.0
...,...,...,...,...,...,...
1853863,"(science, fiction)","(science, fiction)",4510,-202383.170972,"(science, fiction)",0.0
1855615,"(debut, album)","(debut, album)",5623,-274006.258700,"(debut, album)",0.0
1855669,"(radio, station)","(radio, station)",5747,-276729.519270,"(radio, station)",0.0
1856709,"(video, game)","(video, game)",7158,-347508.952280,"(video, game)",1.0


In [43]:
checky = checky[~np.isnan(checky.gs_score)].reset_index()
checky

Unnamed: 0,index,key_0,mwe_x,freq,poisson,mwe_y,gs_score
0,13106,"(lecture, notes)","(lecture, notes)",22,-960.544055,"(lecture, notes)",0.0
1,13499,"(mass, media)","(mass, media)",20,-962.678709,"(mass, media)",2.0
2,13818,"(hype, man)","(hype, man)",20,-964.457053,"(hype, man)",3.0
3,14502,"(cheek, teeth)","(cheek, teeth)",21,-968.324445,"(cheek, teeth)",3.0
4,19993,"(media, commentators)","(media, commentators)",20,-995.117244,"(media, commentators)",0.0
...,...,...,...,...,...,...,...
732,1853863,"(science, fiction)","(science, fiction)",4510,-202383.170972,"(science, fiction)",0.0
733,1855615,"(debut, album)","(debut, album)",5623,-274006.258700,"(debut, album)",0.0
734,1855669,"(radio, station)","(radio, station)",5747,-276729.519270,"(radio, station)",0.0
735,1856709,"(video, game)","(video, game)",7158,-347508.952280,"(video, game)",1.0


In [142]:
indexer = 80
checky[indexer:indexer+20]

Unnamed: 0,index,key_0,mwe_x,freq,poisson,mwe_y,gs_score
80,502236,"(tax, haven)","(tax, haven)",36,-1681.658395,"(tax, haven)",4.0
81,508598,"(target, cell)","(target, cell)",33,-1689.424204,"(target, cell)",1.0
82,513550,"(freelance, work)","(freelance, work)",34,-1695.311851,"(freelance, work)",0.0
83,517032,"(back, pocket)","(back, pocket)",35,-1699.481858,"(back, pocket)",0.0
84,520599,"(death, notice)","(death, notice)",33,-1703.946015,"(death, notice)",0.0
85,526275,"(back, door)","(back, door)",32,-1710.707878,"(back, door)",0.0
86,543686,"(film, world)","(film, world)",30,-1731.910769,"(film, world)",0.0
87,548489,"(space, race)","(space, race)",32,-1737.756404,"(space, race)",4.0
88,554056,"(language, contact)","(language, contact)",33,-1744.859747,"(language, contact)",1.0
89,576594,"(government, work)","(government, work)",31,-1774.170539,"(government, work)",0.0


In [190]:
indexer = 482500
step= 40

ngram_eval[indexer:indexer+step]

Unnamed: 0,mwe,freq,poisson,len,batch,ngram_tag
482500,"(band, also, appeared)",22,-1658.68096,3,3,"(band|NN, also|RB, appeared|VBD)"
482501,"(only, an, hour)",22,-1658.681735,3,3,"(only|RB, an|DT, hour|NN)"
482502,"(varsity, soccer)",35,-1658.683447,2,2,"(varsity|NN, soccer|NN)"
482503,"(p, 50)",31,-1658.684325,2,2,"(p|NN, 50|CD)"
482504,"(makes, the, claim)",22,-1658.686033,3,1,"(makes|VBZ, the|DT, claim|NN)"
482505,"(worked, on, by)",21,-1658.686716,3,6,"(worked|VBN, on|IN, by|IN)"
482506,"(of, illness, or)",21,-1658.687431,3,3,"(of|IN, illness|NN, or|CC)"
482507,"(the, ballot, or)",21,-1658.688718,3,5,"(the|DT, ballot|NN, or|CC)"
482508,"(en, , Yvelines)",25,-1658.689364,3,3,"(en|IN, |HYPH, Yvelines|NNP)"
482509,"(1997, South)",29,-1658.694543,2,5,"(1997|CD, South|NNP)"


In [44]:
import re

In [45]:
# Filtering patterns

_tagnums = re.compile("(\d+(\.\d+)?$)|(.+\|CD$)")  # All numeric, with optional decimal point OR anything tagged |CD
_nans    = re.compile("nan", re.I)                 # Occasional lists like (NaN, NaN, NaN)
_timerad = re.compile("[apf]\.?m.?", re.I)         # a.m. P.M, FM - time and radio station indicators

def tupallmatch(tupin, pattern):
    return all(pattern.match(w) for w in tupin)

def tupmatchlist(tupin, patterns):
    for w,p in zip(tupin, patterns):
        if p.match(w): pass
        else: return False
    return True

In [266]:
teststr = 'a.M'
_timerad.match(teststr)

<_sre.SRE_Match object; span=(0, 3), match='a.M'>

In [252]:
testtup = ('311.22|CD', 'males|NNS')

tupmatchlist(testtup, [_tagnums, re.compile('males\|NNS')])

True

In [267]:
# initially;    1,859,190 
ngrams_filt = ngram[~ngram.ngram_tag.apply(tupallmatch , args= [_tagnums])]
# all numeric;  1,834,686
ngrams_filt = ngrams_filt[~ngrams_filt.ngram_tag.apply(tupmatchlist, args = [[_tagnums, re.compile('males\|NNS')]])]
# x|CD males;   1,834,542
ngrams_filt = ngrams_filt[~ngrams_filt.ngram_tag.apply(tupallmatch , args= [_nans])]
# NaN NaN;      1,834,540
ngrams_filt = ngrams_filt[~ngrams_filt.ngram_tag.apply(tupmatchlist, args = [[_tagnums, _timerad]])]
# Times, radio; 1,833,811


In [268]:
ngrams_filt

Unnamed: 0,mwe,freq,poisson,len,ngram_tag
1,"(Makhaya, Ntini)",20,-6.670842e+02,2,"(Makhaya|NNP, Ntini|NNP)"
2,"(resting_place_coordinates, burial_place)",20,-6.670842e+02,2,"(resting_place_coordinates|NNS, burial_place|VBP)"
3,"(Dechawat, Poomjaeng)",20,-6.670842e+02,2,"(Dechawat|NNP, Poomjaeng|NNP)"
4,"(MSC1, MSC2)",20,-6.670842e+02,2,"(MSC1|NN, MSC2|NN)"
5,"(Deshin, Shekpa)",20,-6.684920e+02,2,"(Deshin|NNP, Shekpa|NNP)"
...,...,...,...,...,...
1859185,"(on, the)",473024,-2.576335e+07,2,"(on|IN, the|DT)"
1859186,"(and, the)",492576,-2.778679e+07,2,"(and|CC, the|DT)"
1859187,"(to, the)",714100,-3.884106e+07,2,"(to|IN, the|DT)"
1859188,"(in, the)",1388181,-7.588717e+07,2,"(in|IN, the|DT)"


Combined reference set

In [46]:
all_refs = MC_VN.rename(columns={'gs_score' : 'MC_VN'}).merge(F_ENC.rename(columns={'gs_score' : 'F_ENC'}),how='outer')

all_refs = all_refs.merge(R_ENC.rename(columns={'gs_score' : 'R_ENC'}),how='outer')

all_refs = all_refs.merge(MC_VPC.rename(columns={'gs_score' : 'MC_VPC'}),how='outer')

all_refs = all_refs.merge(D_ADJN.rename(columns={'gs_score' : 'D_ADJN'}),how='outer')

In [47]:
# 2,021 if there is no overlap between the sets, get 1,996

all_refs

Unnamed: 0,mwe,MC_VN,F_ENC,R_ENC,MC_VPC,D_ADJN
0,"(have, heart)",5.0,,,,
1,"(catch, eye)",5.0,,,,
2,"(take, step)",5.0,,,,
3,"(take, root)",5.0,,,,
4,"(leave, mark)",5.0,,,,
...,...,...,...,...,...,...
1991,"(short, distance)",,,,,3.0
1992,"(early, version)",,,,,2.0
1993,"(small, island)",,,,,2.0
1994,"(olive, oil)",,,,,1.0


In [48]:
# Scale each column to 0-1000

def rescaler(x, col, scale = 1):
    mn = col.min()
    mx = col.max()
    
    return scale * (x - mn) / (mx - mn)

In [49]:
all_refs['MC_VN'] = all_refs.MC_VN.apply(rescaler, col = all_refs.MC_VN, scale = 1000)
all_refs['F_ENC'] = all_refs.F_ENC.apply(rescaler, col = all_refs.F_ENC, scale = 1000)
all_refs['R_ENC'] = all_refs.R_ENC.apply(rescaler, col = all_refs.R_ENC, scale = 1000)
all_refs['MC_VPC'] = all_refs.MC_VPC.apply(rescaler, col = all_refs.MC_VPC, scale = 1000)
all_refs['D_ADJN'] = all_refs.D_ADJN.apply(rescaler, col = all_refs.D_ADJN, scale = 1000)

In [50]:
all_refs

Unnamed: 0,mwe,MC_VN,F_ENC,R_ENC,MC_VPC,D_ADJN
0,"(have, heart)",1000.0,,,,
1,"(catch, eye)",1000.0,,,,
2,"(take, step)",1000.0,,,,
3,"(take, root)",1000.0,,,,
4,"(leave, mark)",1000.0,,,,
...,...,...,...,...,...,...
1991,"(short, distance)",,,,,33.707865
1992,"(early, version)",,,,,22.471910
1993,"(small, island)",,,,,22.471910
1994,"(olive, oil)",,,,,11.235955


In [51]:
all_refs['REF'] = all_refs.drop('mwe', axis=1).apply(np.nanmean, axis=1)

In [52]:
all_refs

Unnamed: 0,mwe,MC_VN,F_ENC,R_ENC,MC_VPC,D_ADJN,REF
0,"(have, heart)",1000.0,,,,,1000.000000
1,"(catch, eye)",1000.0,,,,,1000.000000
2,"(take, step)",1000.0,,,,,1000.000000
3,"(take, root)",1000.0,,,,,1000.000000
4,"(leave, mark)",1000.0,,,,,1000.000000
...,...,...,...,...,...,...,...
1991,"(short, distance)",,,,,33.707865,33.707865
1992,"(early, version)",,,,,22.471910,22.471910
1993,"(small, island)",,,,,22.471910,22.471910
1994,"(olive, oil)",,,,,11.235955,11.235955


Looking for lost items between original and POS-tagged experiments on 10% corpus

In [54]:
w10p_w2v

Unnamed: 0,mwe,freq,poisson,len,batch,stopwords,component_cosims,cosim_10w,ngram_tag
0,"(delivered, from)",28,-1567.428918,2,5,"[0, 1]","[-0.2535712, nan]",-0.253571,"(delivered|VBD, from|IN)"
1,"(then, had, two)",20,-1586.812073,3,3,"[0, 1, 1]","[-0.24625674, nan, nan]",-0.246257,"(then|RB, had|VBD, two|CD)"
2,"(and, finally, for)",20,-1647.756550,3,4,"[1, 0, 1]","[nan, -0.22293654, nan]",-0.222937,"(and|CC, finally|RB, for|IN)"
3,"(one, before, it)",20,-1587.727730,3,7,"[1, 0, 1]","[nan, -0.21940775, nan]",-0.219408,"(one|CD, before|IN, it|PRP)"
4,"(However, in, their)",20,-1642.620348,3,3,"[0, 1, 1]","[-0.2108197, nan, nan]",-0.210820,"(However|RB, in|IN, their|PRP$)"
...,...,...,...,...,...,...,...,...,...
482425,"(środek.svgFile, Wygaszony, środek.svgFile)",36,-1588.358559,3,1,"[0, 0, 0]","[0.9717464, 0.9375481, 0.9717464]",0.960347,"(środek.svgFile|NNP, Wygaszony|NNP, środek.svg..."
482426,"(Recurvirostridae, is)",25,-1276.332579,2,4,"[0, 1]","[0.96257687, nan]",0.962577,"(Recurvirostridae|NNP, is|VBZ)"
482427,"(OUL3, CRO1, CRO2)",36,-1515.104871,3,2,"[0, 0, 0]","[0.94491136, 0.97224206, 0.97286904]",0.963341,"(OUL3|NN, CRO1|NN, CRO2|NN)"
482428,"(Lapu, -)",22,-1127.048317,2,2,"[0, 1]","[0.9648907, nan]",0.964891,"(Lapu|NNP, |HYPH)"


In [55]:
w10p_tag = all_refs.merge(w10p_w2v[['mwe', 'cosim_10w']], how='inner', on='mwe').drop_duplicates(subset=['mwe'])

In [58]:
w10p_tag = all_refs.merge(w10p_w2v[['mwe', 'cosim_10w', 'ngram_tag']], how='inner', on='mwe')

In [60]:
ind = 0
w10p_tag[ind:ind+30]

Unnamed: 0,mwe,MC_VN,F_ENC,R_ENC,MC_VPC,D_ADJN,REF,cosim_10w,ngram_tag
0,"(shed, light)",1000.0,,,,,1000.0,0.025012,"(shed|VBD, light|NN)"
1,"(shed, light)",1000.0,,,,,1000.0,0.109381,"(shed|VBN, light|NN)"
2,"(see, fit)",900.0,,,,,900.0,0.164587,"(see|VBP, fit|JJ)"
3,"(bear, witness)",900.0,,,,,900.0,0.196912,"(bear|VBP, witness|NN)"
4,"(bear, witness)",900.0,,,,,900.0,0.208045,"(bear|VB, witness|NN)"
5,"(take, leave)",800.0,,,,,800.0,0.313061,"(take|VB, leave|NN)"
6,"(cause, damage)",800.0,,,,,800.0,0.382958,"(cause|VBP, damage|NN)"
7,"(have, control)",800.0,,,,,800.0,0.102348,"(have|VBP, control|NN)"
8,"(have, faith)",800.0,,,,,800.0,0.206265,"(have|VBP, faith|NN)"
9,"(have, faith)",800.0,,,,,800.0,0.230265,"(have|VB, faith|NN)"


In [116]:
w10p_w2v_orig = pd.read_csv(datapath+'/Models/1 w2v/Results/wiki10pc_light_001.csv', 
                       converters= converter
                      )

In [117]:
w10p_w2v_orig = w10p_w2v_orig[~np.isnan(w10p_w2v_orig.cosine_sim)].sort_values('cosine_sim').rename(columns={'ngram' : 'mwe', 'cosine_sim' : 'cosim_10w'}).reset_index(drop=True)

In [118]:
w10p_w2v_orig

Unnamed: 0,mwe,poisson,len,batch,stopwords,component_cosims,cosim_10w
0,"(1980s, 1970s, 1960s)",504.762099,3,8,"[0, 0, 0]","[-0.23996054, -0.23782371, -0.2340202]",-0.237268
1,"(amongst, many, others)",712.565055,3,1,"[0, 0, 0]","[-0.15940933, -0.21269974, -0.19618905]",-0.189433
2,"(European, register, of)",588.560764,3,1,"[0, 0, 1]","[-0.1621425, -0.21278596, nan]",-0.187464
3,"(1950s, 1940s, 1930s)",576.491489,3,10,"[0, 0, 0]","[-0.18906975, -0.1606813, -0.19921347]",-0.182988
4,"(As, far, back)",449.645748,3,3,"[0, 0, 0]","[-0.09595686, -0.2087354, -0.24125503]",-0.181982
...,...,...,...,...,...,...,...
453464,"(宿, け/犬)",416.659769,2,2,"[0, 0]","[0.9859951, 0.9874592]",0.986727
453465,"(ㄴ, ㄹ)",4110.216943,2,5,"[0, 0]","[0.98334867, 0.99101293]",0.987181
453466,"(ㄹ, ㅁ)",3227.287802,2,6,"[0, 0]","[0.99340194, 0.98948646]",0.991444
453467,"(ㄹ, ㅁ, ㅂ)",5594.044824,3,4,"[0, 0, 0]","[0.9960094, 0.99351, 0.9925519]",0.994024


In [124]:
w10p_orig = all_refs.merge(w10p_w2v_orig[['mwe', 'cosim_10w']], how='inner', on='mwe').drop_duplicates()

In [125]:
w10p_orig

Unnamed: 0,mwe,MC_VN,F_ENC,R_ENC,MC_VPC,D_ADJN,REF,cosim_10w
0,"(set, foot)",1000.0,,,,,1000.000000,0.008091
1,"(shake, hands)",1000.0,,,,,1000.000000,0.336349
2,"(shed, light)",1000.0,,,,,1000.000000,0.097372
3,"(give, birth)",900.0,,,,,900.000000,0.372657
4,"(take, part)",900.0,,,,,900.000000,0.328196
...,...,...,...,...,...,...,...,...
691,"(short, distance)",,,,,33.707865,33.707865,0.319870
692,"(early, version)",,,,,22.471910,22.471910,0.344406
693,"(small, island)",,,,,22.471910,22.471910,0.527137
694,"(olive, oil)",,,,,11.235955,11.235955,0.521160


In [131]:
kept = w10p_orig.merge(w10p_tag, on='mwe', how='inner').drop_duplicates()
kept

Unnamed: 0,mwe,MC_VN_x,F_ENC_x,R_ENC_x,MC_VPC_x,D_ADJN_x,REF_x,cosim_10w_x,MC_VN_y,F_ENC_y,R_ENC_y,MC_VPC_y,D_ADJN_y,REF_y,cosim_10w_y
0,"(shed, light)",1000.0,,,,,1000.0,0.097372,1000.0,,,,,1000.0,0.025012
1,"(bear, witness)",900.0,,,,,900.0,0.235828,900.0,,,,,900.0,0.196912
2,"(cause, damage)",800.0,,,,,800.0,0.529138,800.0,,,,,800.0,0.382958
3,"(put, pressure)",800.0,,,,,800.0,0.331696,800.0,,,,,800.0,0.114471
4,"(raise, money)",700.0,,,,,700.0,0.402098,700.0,,,,,700.0,0.205812
5,"(take, refuge)",700.0,,,,,700.0,0.45295,700.0,,,,,700.0,0.228891
6,"(pay, tribute)",700.0,,,,,700.0,0.413777,700.0,,,,,700.0,0.218673
7,"(return, home)",600.0,,,,,600.0,0.475525,600.0,,,,,600.0,0.258523
8,"(make, sense)",600.0,,,,,600.0,0.459291,600.0,,,,,600.0,0.204494
9,"(gain, experience)",500.0,,,,,500.0,0.415918,500.0,,,,,500.0,0.04564


In [132]:
lost = w10p_orig.merge(w10p_tag, on='mwe', how='left').drop_duplicates()
lost = lost[np.isnan(lost.cosim_10w_y)]
lost

Unnamed: 0,mwe,MC_VN_x,F_ENC_x,R_ENC_x,MC_VPC_x,D_ADJN_x,REF_x,cosim_10w_x,MC_VN_y,F_ENC_y,R_ENC_y,MC_VPC_y,D_ADJN_y,REF_y,cosim_10w_y
0,"(set, foot)",1000.0,,,,,1000.000000,0.008091,,,,,,,
1,"(shake, hands)",1000.0,,,,,1000.000000,0.336349,,,,,,,
3,"(give, birth)",900.0,,,,,900.000000,0.372657,,,,,,,
4,"(take, part)",900.0,,,,,900.000000,0.328196,,,,,,,
5,"(turn, back)",900.0,,,,,900.000000,0.428909,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
691,"(short, distance)",,,,,33.707865,33.707865,0.319870,,,,,,,
692,"(early, version)",,,,,22.471910,22.471910,0.344406,,,,,,,
693,"(small, island)",,,,,22.471910,22.471910,0.527137,,,,,,,
694,"(olive, oil)",,,,,11.235955,11.235955,0.521160,,,,,,,


In [133]:
new = w10p_orig.merge(w10p_tag, on='mwe', how='right').drop_duplicates()
new = new[np.isnan(new.cosim_10w_x)]
new

Unnamed: 0,mwe,MC_VN_x,F_ENC_x,R_ENC_x,MC_VPC_x,D_ADJN_x,REF_x,cosim_10w_x,MC_VN_y,F_ENC_y,R_ENC_y,MC_VPC_y,D_ADJN_y,REF_y,cosim_10w_y
40,"(see, fit)",,,,,,,,900.0,,,,,900.000000,0.164587
41,"(take, leave)",,,,,,,,800.0,,,,,800.000000,0.313061
42,"(have, control)",,,,,,,,800.0,,,,,800.000000,0.102348
43,"(have, faith)",,,,,,,,800.0,,,,,800.000000,0.206265
44,"(cause, injury)",,,,,,,,800.0,,,,,800.000000,0.366189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172,"(retail, outlet)",,,,,,,,,,,,325.842697,325.842697,0.426785
173,"(big, issue)",,,,,,,,,,,,325.842697,325.842697,0.091107
174,"(full, training)",,,,,,,,,,,,123.595506,123.595506,0.263529
175,"(big, fish)",,,,,,,,,,,,89.887640,89.887640,0.267810


In [134]:
lost_ng = lost.merge(ngram, on='mwe', how='left').drop_duplicates()

In [135]:
lost_ng

Unnamed: 0,mwe,MC_VN_x,F_ENC_x,R_ENC_x,MC_VPC_x,D_ADJN_x,REF_x,cosim_10w_x,MC_VN_y,F_ENC_y,R_ENC_y,MC_VPC_y,D_ADJN_y,REF_y,cosim_10w_y,freq,poisson,len,ngram_tag
0,"(set, foot)",1000.0,,,,,1000.000000,0.008091,,,,,,,,40,-2036.929515,2,"(set|VBD, foot|NN)"
1,"(set, foot)",1000.0,,,,,1000.000000,0.008091,,,,,,,,58,-2987.779195,2,"(set|VBN, foot|NN)"
2,"(set, foot)",1000.0,,,,,1000.000000,0.008091,,,,,,,,95,-4577.321101,2,"(set|VB, foot|NN)"
3,"(shake, hands)",1000.0,,,,,1000.000000,0.336349,,,,,,,,62,-2818.844870,2,"(shake|VB, hands|NNS)"
4,"(give, birth)",900.0,,,,,900.000000,0.372657,,,,,,,,116,-5511.576947,2,"(give|VBP, birth|NN)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
734,"(early, version)",,,,,22.471910,22.471910,0.344406,,,,,,,,299,-15942.867054,2,"(early|JJ, version|NN)"
735,"(small, island)",,,,,22.471910,22.471910,0.527137,,,,,,,,421,-21581.909948,2,"(small|JJ, island|NN)"
736,"(olive, oil)",,,,,11.235955,11.235955,0.521160,,,,,,,,172,-7969.712469,2,"(olive|NN, oil|NN)"
737,"(olive, oil)",,,,,11.235955,11.235955,0.521160,,,,,,,,242,-10817.437945,2,"(olive|JJ, oil|NN)"


In [142]:
cutoff = ngram.iloc[499999].poisson  # Poisson for evaluated ngrams were all this or above

In [185]:
cutoff2 = ngram.iloc[499999+58532].poisson  # Approximation of cutoff without proper nouns


In [186]:
cutoff2

-1750.6127623282243

In [144]:
lost_ng[lost_ng.poisson >= cutoff]

Unnamed: 0,mwe,MC_VN_x,F_ENC_x,R_ENC_x,MC_VPC_x,D_ADJN_x,REF_x,cosim_10w_x,MC_VN_y,F_ENC_y,R_ENC_y,MC_VPC_y,D_ADJN_y,REF_y,cosim_10w_y,freq,poisson,len,ngram_tag
581,"(cutting, edge)",,,798.148167,,,798.148167,0.368593,,,,,,,,27,-1151.063357,2,"(cutting|JJ, edge|NN)"


In [187]:
lost_ng[lost_ng.poisson >= cutoff2]

Unnamed: 0,mwe,MC_VN_x,F_ENC_x,R_ENC_x,MC_VPC_x,D_ADJN_x,REF_x,cosim_10w_x,MC_VN_y,F_ENC_y,R_ENC_y,MC_VPC_y,D_ADJN_y,REF_y,cosim_10w_y,freq,poisson,len,ngram_tag
111,"(tax, haven)",,1000.0,,,,1000.0,0.287075,,,,,,,,36,-1681.658395,2,"(tax|NN, haven|NN)"
513,"(back, door)",,0.0,,,,0.0,0.510744,,,,,,,,32,-1710.707878,2,"(back|RB, door|NN)"
581,"(cutting, edge)",,,798.148167,,,798.148167,0.368593,,,,,,,,27,-1151.063357,2,"(cutting|JJ, edge|NN)"


In [145]:
new_ng = new.merge(ngram, on='mwe', how='left').drop_duplicates()

In [146]:
new_ng

Unnamed: 0,mwe,MC_VN_x,F_ENC_x,R_ENC_x,MC_VPC_x,D_ADJN_x,REF_x,cosim_10w_x,MC_VN_y,F_ENC_y,R_ENC_y,MC_VPC_y,D_ADJN_y,REF_y,cosim_10w_y,freq,poisson,len,ngram_tag
0,"(see, fit)",,,,,,,,900.0,,,,,900.000000,0.164587,34,-1598.636530,2,"(see|VBP, fit|JJ)"
1,"(take, leave)",,,,,,,,800.0,,,,,800.000000,0.313061,27,-1375.915654,2,"(take|VB, leave|NN)"
2,"(have, control)",,,,,,,,800.0,,,,,800.000000,0.102348,20,-1155.062179,2,"(have|VBP, control|NN)"
3,"(have, control)",,,,,,,,800.0,,,,,800.000000,0.102348,57,-3162.753864,2,"(have|VB, control|NN)"
4,"(have, faith)",,,,,,,,800.0,,,,,800.000000,0.206265,28,-1530.354543,2,"(have|VBP, faith|NN)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149,"(retail, outlet)",,,,,,,,,,,,325.842697,325.842697,0.426785,30,-1416.984286,2,"(retail|JJ, outlet|NN)"
150,"(big, issue)",,,,,,,,,,,,325.842697,325.842697,0.091107,23,-1212.001700,2,"(big|JJ, issue|NN)"
151,"(full, training)",,,,,,,,,,,,123.595506,123.595506,0.263529,29,-1589.395661,2,"(full|JJ, training|NN)"
152,"(big, fish)",,,,,,,,,,,,89.887640,89.887640,0.267810,24,-1241.839313,2,"(big|JJ, fish|NN)"


In [188]:
new_ng.poisson.mean()

-1600.413967569273

In [189]:
lost_ng.poisson.mean()

-22361.1718501027

In [147]:
ngram

Unnamed: 0,mwe,freq,poisson,len,ngram_tag
0,"(311.22, 500.86)",20,-6.670842e+02,2,"(311.22|CD, 500.86|CD)"
1,"(Makhaya, Ntini)",20,-6.670842e+02,2,"(Makhaya|NNP, Ntini|NNP)"
2,"(resting_place_coordinates, burial_place)",20,-6.670842e+02,2,"(resting_place_coordinates|NNS, burial_place|VBP)"
3,"(Dechawat, Poomjaeng)",20,-6.670842e+02,2,"(Dechawat|NNP, Poomjaeng|NNP)"
4,"(MSC1, MSC2)",20,-6.670842e+02,2,"(MSC1|NN, MSC2|NN)"
...,...,...,...,...,...
1859185,"(on, the)",473024,-2.576335e+07,2,"(on|IN, the|DT)"
1859186,"(and, the)",492576,-2.778679e+07,2,"(and|CC, the|DT)"
1859187,"(to, the)",714100,-3.884106e+07,2,"(to|IN, the|DT)"
1859188,"(in, the)",1388181,-7.588717e+07,2,"(in|IN, the|DT)"


In [149]:
ngram.freq.corr(ngram.poisson, method = 'pearson')

-0.9988625470563577

In [150]:
# Remove POS tags from tuples for matching

import re

pipematcher = re.compile("(.*)\|")
pipematch2 = re.compile("(\|[A-Z]{2,4}\$?|[\$\:,\.\"]|``|\-LRB\-?|\-RRB\-?)")

def g1(matchobj):
    return matchobj.group(1)

def tup_matcher(tupstr, pattern=pipematcher):
    ot = []
    for w in tupstr:
        if w == '|HYPH': w = '-|HYPH'
        if re.match(pattern, w):
            ot.append(g1(pattern.match(w)))
    return tuple(ot)


def tup_rem(tupstr, pattern=pipematch2):
    ot = []
    for w in tupstr:
        if w == '|HYPH': w = '-|HYPH'
        ot.append(re.sub(pattern,'',w))
    return tuple(ot)



In [179]:

def tup_tags(tupstr, pattern=pipematch2):
    ot = []
    for w in tupstr:
        if w == '|HYPH': w = '-|HYPH'
        if re.search(pattern, w):
            ot.append(g1(pattern.search(w)))
    return tuple(ot)

In [180]:
pipematch2 = re.compile("(\|[A-Z]{2,4}\$?|[\$\:,\.\"]|``|\-LRB\-?|\-RRB\-?)")

testtup = ngram.iloc[2].ngram_tag
print(testtup)
tup_tags(testtup, pattern=pipematch2)

('resting_place_coordinates|NNS', 'burial_place|VBP')


('|NNS', '|VBP')

In [181]:
w10p_w2v['taglist'] = w10p_w2v.ngram_tag.apply(tup_tags)

In [183]:
w10p_w2v.taglist.value_counts()

(|NNP, |NNP)           58532
(|JJ, |NN)             19048
(|NN, |NN)             15187
(|IN, |NNP)            13136
(|NN, |IN)             11329
                       ...  
(|JJ, |WP, |VBP)           1
(|HYPH, |VBG, |NNP)        1
(|PRP, |CC, |VBD)          1
(|IN, |JJS, |DT)           1
(|JJ, |NN, |RBR)           1
Name: taglist, Length: 6218, dtype: int64

In [184]:
w10p_w2v

Unnamed: 0,mwe,freq,poisson,len,batch,stopwords,component_cosims,cosim_10w,ngram_tag,taglist
0,"(delivered, from)",28,-1567.428918,2,5,"[0, 1]","[-0.2535712, nan]",-0.253571,"(delivered|VBD, from|IN)","(|VBD, |IN)"
1,"(then, had, two)",20,-1586.812073,3,3,"[0, 1, 1]","[-0.24625674, nan, nan]",-0.246257,"(then|RB, had|VBD, two|CD)","(|RB, |VBD, |CD)"
2,"(and, finally, for)",20,-1647.756550,3,4,"[1, 0, 1]","[nan, -0.22293654, nan]",-0.222937,"(and|CC, finally|RB, for|IN)","(|CC, |RB, |IN)"
3,"(one, before, it)",20,-1587.727730,3,7,"[1, 0, 1]","[nan, -0.21940775, nan]",-0.219408,"(one|CD, before|IN, it|PRP)","(|CD, |IN, |PRP)"
4,"(However, in, their)",20,-1642.620348,3,3,"[0, 1, 1]","[-0.2108197, nan, nan]",-0.210820,"(However|RB, in|IN, their|PRP$)","(|RB, |IN, |PRP$)"
...,...,...,...,...,...,...,...,...,...,...
482425,"(środek.svgFile, Wygaszony, środek.svgFile)",36,-1588.358559,3,1,"[0, 0, 0]","[0.9717464, 0.9375481, 0.9717464]",0.960347,"(środek.svgFile|NNP, Wygaszony|NNP, środek.svg...","(., |NNP, .)"
482426,"(Recurvirostridae, is)",25,-1276.332579,2,4,"[0, 1]","[0.96257687, nan]",0.962577,"(Recurvirostridae|NNP, is|VBZ)","(|NNP, |VBZ)"
482427,"(OUL3, CRO1, CRO2)",36,-1515.104871,3,2,"[0, 0, 0]","[0.94491136, 0.97224206, 0.97286904]",0.963341,"(OUL3|NN, CRO1|NN, CRO2|NN)","(|NN, |NN, |NN)"
482428,"(Lapu, -)",22,-1127.048317,2,2,"[0, 1]","[0.9648907, nan]",0.964891,"(Lapu|NNP, |HYPH)","(|NNP, |HYPH)"
