# Synopsis

We create word vectors by applying a singular value decomposition to a pointwise mutual information word-word matrix. 

# Configuration

In [8]:
proj = '/Users/rca2t/Dropbox/Courses/DSI/DS5559/UVA_DSI_REPO'
pwd = '{}/play/wordembedding'.format(proj)
db_file = "{}/data/novels.db".format(proj)
lib = "{}/lib".format(proj)

# Word Embedding
window = 3

# Libraries

In [9]:
import pandas as pd
import numpy as np
import sqlite3
import sys; sys.path.append(lib)
from textman import textman as tx

# Pragmas

In [10]:
%matplotlib inline

# Process

## Extract skipgrams from tokens with SQL

We grab a sliding bag of tokens without stopwords and proper nouns.

### Build SQL query from configs

In [11]:
in_clause = ', '.join(['x.token_num + {0}, x.token_num - {0}'.format(i) for i in range(1, window + 1)])
pos_clause = "AND pos NOT LIKE 'NNP%' " # Remove proper nouns

In [12]:
sql = """
WITH mytoken(author, book, chapter, para_num, sent_num,token_num,term_str,term_id) 
AS (
    SELECT author, book, chapter, para_num, sent_num,token_num,term_str,term_id
    FROM token 
    WHERE term_id IN (SELECT term_id FROM vocab WHERE stop = 0) 
        AND term_str is not NULL
        {}       
)

SELECT x.term_str as target, y.term_str as probe, (y.token_num - x.token_num) AS dist
FROM mytoken x 
JOIN mytoken y USING(author, book, chapter, para_num, sent_num)
WHERE y.token_num IN ({})
ORDER BY target, dist, probe
""".format(pos_clause, in_clause)

In [13]:
print(sql)


WITH mytoken(author, book, chapter, para_num, sent_num,token_num,term_str,term_id) 
AS (
    SELECT author, book, chapter, para_num, sent_num,token_num,term_str,term_id
    FROM token 
    WHERE term_id IN (SELECT term_id FROM vocab WHERE stop = 0) 
        AND term_str is not NULL
        AND pos NOT LIKE 'NNP%'        
)
    
SELECT x.term_str as target, y.term_str as probe, (y.token_num - x.token_num) AS dist
FROM mytoken x 
JOIN mytoken y USING(author, book, chapter, para_num, sent_num)
WHERE y.token_num IN (x.token_num + 1, x.token_num - 1, x.token_num + 2, x.token_num - 2, x.token_num + 3, x.token_num - 3)
ORDER BY target, dist, probe



### Pull from DB

In [14]:
skipgrams = tx.get_sql(sql, db_file)

In [42]:
skipgrams.head(10)

Unnamed: 0,target,probe,dist
0,aback,said,-3
1,aback,completely,-2
2,aback,considerably,-2
3,aback,little,-2
4,aback,utterly,-2
5,aback,taken,-1
6,aback,taken,-1
7,aback,taken,-1
8,aback,taken,-1
9,aback,taken,-1


### Add Skigram weights (as GloVe does)

In [16]:
# skipgrams['glove_weight'] = np.round(np.abs(1 / skipgrams['dist']), 2)

In [17]:
# skipgrams.head(10)

## Get Unigram Probabilities

We have already computed these in the vocab table.

### Import vocab table

In [18]:
vocab = tx.get_table('vocab', db_file, index_col=['term_id'])
vocab = vocab[vocab.stop == 0]

In [19]:
vocab.sort_values('p', ascending=False).head()

Unnamed: 0_level_0,term_str,n,p,port_stem,stop,df,idf,tfidf_sum,tfidf_mean,tfidf_max
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
20770,said,6370,0.004245,said,0,308,0.016599,105.737298,0.330429,2.191103
15714,mr,2860,0.001906,mr,0,196,0.212894,608.876574,1.902739,48.326917
24409,time,2615,0.001743,time,0,317,0.004091,10.697223,0.033429,0.224989
14660,man,2383,0.001588,man,0,294,0.036803,87.70071,0.274065,2.355369
13584,know,2302,0.001534,know,0,299,0.029479,67.860175,0.212063,0.9728


### Get $P(x)$

In [23]:
p_x = vocab[['term_str','p']].reset_index().set_index('term_str')['p']

In [24]:
p_x.sort_values(ascending=False).head()

term_str
said    0.004245
mr      0.001906
time    0.001743
man     0.001588
know    0.001534
Name: p, dtype: float64

In [25]:
# skipgrams.groupby('target').target.count() / skipgrams.target.sum()

## Compute Normalized PMI for Skipgrams

### PMI

$log \dfrac{P(x,y)}{P(x)P(y)}$

### NMPI

$\dfrac{log\dfrac{P(x,y)}{P(x)P(y)}}{-log P(x,y)}$

See [G. Bouma 2009, eq. 7](https://pdfs.semanticscholar.org/1521/8d9c029cbb903ae7c729b2c644c24994c201.pdf)

### Create compressed skipgram table

In [26]:
skipgrams2 = skipgrams.groupby(['target','probe']).probe.count()\
    .to_frame().rename(columns={'probe':'n'})\
    .reset_index().set_index(['target','probe'])

In [28]:
# skipgrams2['glove_weight_sum'] = skipgrams.groupby(['target','probe']).glove_weight.sum()\
#     .to_frame().reset_index().set_index(['target','probe'])

In [29]:
skipgrams2.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,n
target,probe,Unnamed: 2_level_1
aback,completely,1
aback,considerably,1
aback,kept,1
aback,little,1
aback,murmured,1
aback,said,1
aback,something,1
aback,taken,8
aback,utterly,1
abaft,aftest,1


### Compute $P(x,y)$

In [30]:
N = skipgrams2.n.sum()

In [31]:
skipgrams2['p_xy'] = skipgrams2.n / N

In [32]:
skipgrams2.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p_xy
target,probe,Unnamed: 2_level_1,Unnamed: 3_level_1
aback,completely,1,1e-06
aback,considerably,1,1e-06
aback,kept,1,1e-06
aback,little,1,1e-06
aback,murmured,1,1e-06
aback,said,1,1e-06
aback,something,1,1e-06
aback,taken,8,8e-06
aback,utterly,1,1e-06
abaft,aftest,1,1e-06


In [33]:
### Compute $P(x,y)$ using GloVe weighting
# Ngw = skipgrams2.glove_weight_sum.sum()
# skipgrams2['p_xy_gw'] = skipgrams2.glove_weight_sum / Ngw
# skipgrams2.sort_values('p_xy_gw', ascending=False).head(10)

In [34]:
### Compute $p(x)$ using skigram data 
# p_x = skipgrams2.reset_index().target.value_counts() / N
# p_x.head()

### Compute $PMI(x;y)$

In [35]:
skipgrams2['pmi_xy'] = skipgrams2.apply(lambda row: np.log(row.p_xy / (p_x.loc[row.name[0]] * p_x.loc[row.name[1]])), 1)

In [36]:
skipgrams2.sort_values('pmi_xy', ascending=False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p_xy,pmi_xy
target,probe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
sonne,twentie,1,1e-06,14.635063
arca,nummos,1,1e-06,14.635063
rearranged,deft,1,1e-06,14.635063
petit,beau,1,1e-06,14.635063
growths,mushroom,1,1e-06,14.635063
touters,skeer,1,1e-06,14.635063
petite,pauvre,1,1e-06,14.635063
unwept,unwatched,1,1e-06,14.635063
unwept,uncared,1,1e-06,14.635063
unwept,bereft,1,1e-06,14.635063


In [37]:
skipgrams2['npmi_xy'] = skipgrams2.pmi_xy / -( np.log(skipgrams2.p_xy) )

In [38]:
skipgrams2.sort_values('npmi_xy', ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p_xy,pmi_xy,npmi_xy
target,probe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
computer,virus,4,4e-06,13.941916,1.122434
virus,computer,4,4e-06,13.941916,1.122434
ihre,unvollkommen,2,2e-06,14.635063,1.115962
fracto,nunc,2,2e-06,14.635063,1.115962
nunc,fracto,2,2e-06,14.635063,1.115962


In [39]:
# skipgrams2['pmi_xy_gw'] = skipgrams2.apply(lambda row: np.log2(row.p_xy_gw / (p_x.loc[row.name[0]] * p_x.loc[row.name[1]])), 1)

In [40]:
# skipgrams2.pmi_xy_gw.sort_values(ascending=False)

## PMI Matrix

In [41]:
SGM = skipgrams2.npmi_xy.unstack().fillna(0)

In [43]:
SGM.head()

probe,aback,abaft,abandon,abandoned,abandoning,abandons,abasement,abashed,abate,abated,...,zigzagged,zone,zoology,zoöphagous,zoöphagy,zum,zusammen,à,ça,émeutes
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aback,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abaft,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abandon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abandoned,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abandoning,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
SGM.loc['intestine'].sort_values(ascending=False).head()

probe
war            0.751217
consequence    0.700003
émeutes        0.000000
favourable     0.000000
faultiness     0.000000
Name: intestine, dtype: float64

In [33]:
skipgrams2.loc['intestine'].sort_values('n', ascending=False)

Unnamed: 0_level_0,n,glove_weight_sum,p_xy,pmi_xy,npmi_xy
probe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
consequence,1,0.5,1e-06,9.66525,0.700003
war,1,1.0,1e-06,10.372383,0.751217


## SVD

In [34]:
import scipy as sp #.sparse.linalg.svds(PMI, k=256)

In [35]:
sparse = sp.sparse.csr_matrix(SGM.values)

In [36]:
SVD = sp.sparse.linalg.svds(sparse, k=256)

In [37]:
U, S, V = SVD

In [38]:
# unorm = U / np.sqrt(np.sum(U*U, axis=1, keepdims=True))
# vnorm = V / np.sqrt(np.sum(V*V, axis=0, keepdims=True))

In [39]:
word_vecs = U + V.T
word_vecs_norm = word_vecs / np.sqrt(np.sum(word_vecs * word_vecs, axis=1, keepdims=True))

In [40]:
WE = pd.DataFrame(word_vecs_norm, index=SGM.index)
WE.index.name = 'word_str'

In [41]:
WE.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
word_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aback,1.845418e-14,0.040194,3.985381e-14,-0.020119,0.075912,-0.052124,1.939879e-14,7.382745e-14,2.859582e-15,-0.071704,...,-6.18288e-16,-0.053758,0.027694,-1.210814e-15,-0.014615,0.045539,-0.072116,-0.047603,0.021114,-0.040583
abaft,4.114266e-14,0.048869,3.830572e-14,-0.02325,0.006422,-0.159226,3.775894e-14,-5.047627e-14,7.763162e-15,0.022019,...,3.179333e-16,0.002207,-0.052773,3.405554e-15,0.003823,-0.00814,0.026981,-0.050486,-0.05601,-0.014483
abandon,-1.564039e-14,-0.077616,-9.029757e-14,0.149301,-0.212866,0.077754,9.504554e-15,1.685816e-14,1.587071e-14,-0.027305,...,-1.018558e-15,-0.050581,0.002287,5.40054e-16,0.039436,-0.014823,0.027164,0.048715,0.090397,-0.06733
abandoned,5.449899e-14,-0.067577,-5.995251e-14,0.06094,-0.247509,-0.028677,2.49076e-14,4.066547e-14,1.119026e-14,0.011379,...,-6.644447e-16,-0.065895,-0.111213,7.494064e-15,-0.024864,-0.006603,-0.021847,0.014058,0.077565,-0.077343
abandoning,-1.19938e-13,-0.033249,-2.200784e-14,-0.068586,-0.051109,0.108353,-4.095903e-14,-6.757927e-14,6.897043e-16,0.052378,...,5.878162000000001e-17,-0.035204,-0.121501,8.033488e-15,0.031339,0.078384,0.024014,0.031338,0.048161,-0.050376


In [42]:
def word_sims(word, n=10):
    try:
        sims = SGM.loc[word].sort_values(ascending=False).head(n).reset_index().values
        return sims
    except KeyError as e:
        print('Word "{}" not in vocabulary.'.format(word))
        return None

In [43]:
print(word_sims('happy'))

[['grovel' 0.6401623154332218]
 ['transit' 0.6401623154332218]
 ['anniversary' 0.6401623154332218]
 ['prosperous' 0.6041280521921256]
 ['consoles' 0.589961338701885]
 ['supremely' 0.589961338701885]
 ['recurring' 0.589961338701885]
 ['jocund' 0.5397603619705483]
 ['flights' 0.5235992571699444]
 ['descendants' 0.5235992571699444]]


In [48]:
def word_sim_report(word):
    sims = word_sims(word)
    for sim_word, score in sims:
        context = ' '.join(skipgrams2.loc[sim_word].index.values.tolist()[:5])
        print("{} ({}) {}".format(sim_word.upper(), score, context))
        print('-'*80)

In [49]:
word_sim_report('taste')

DEPRAVE (0.7020430894024823) corrupt distract mind taste
--------------------------------------------------------------------------------
VITIATE (0.7020430894024823) heart taste
--------------------------------------------------------------------------------
DEADEN (0.7020430894024823) cannot taste
--------------------------------------------------------------------------------
ACCORDANT (0.7020430894024823) taste
--------------------------------------------------------------------------------
CONVERSAZIONI (0.7020430894024823) music taste
--------------------------------------------------------------------------------
CHAMPAGNE (0.6553769887460313) administered ammonia bottle brandy dry
--------------------------------------------------------------------------------
MODERATED (0.6518421126711456) already considered simplicity taste wishes
--------------------------------------------------------------------------------
ENTOMOLOGY (0.6518421126711456) devoted identification south taste

In [50]:
word_sim_report('man')

YOUNG (0.5772167789397364) abandon abject absence absolutely accepted
--------------------------------------------------------------------------------
LEGGED (0.5487605823908175) alone cross friend game high
--------------------------------------------------------------------------------
TALL (0.5348549377696777) admit age along anchor art
--------------------------------------------------------------------------------
PIEBALD (0.5342342897669777) complexion hair man twice
--------------------------------------------------------------------------------
OLD (0.531586346825827) abbey able abruptly accent account
--------------------------------------------------------------------------------
BEARDED (0.5110802281230045) background bargain black chin efficient
--------------------------------------------------------------------------------
TALLIES (0.4967572282682332) idea man
--------------------------------------------------------------------------------
REPELLANT (0.4967572282682332) m

In [51]:
word_sim_report('young')

LADY (0.6516796047854232) abbess abruptly absence accept accommodate
--------------------------------------------------------------------------------
LADIES (0.6175629322254406) absence accommodation addressed advice age
--------------------------------------------------------------------------------
CRATCHITS (0.5929933764406247) became chairs danced fetch got
--------------------------------------------------------------------------------
MAN (0.5772167789397364) abandons abhorrence aboard abominable abruptly
--------------------------------------------------------------------------------
LADYS (0.5703715029747964) advice affections alarm annoyance appeared
--------------------------------------------------------------------------------
SMIRKING (0.5564600455343252) rejoined young
--------------------------------------------------------------------------------
GENTLEMANLIKE (0.5564600455343252) man young
--------------------------------------------------------------------------------

## Define some semantic functions

In [70]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

def get_word_vector(term_str):
    """Get a numpy array from the glove matrix and shape for input into cosine function"""
    wv = SGM.loc[term_str].values.reshape(-1, 1).T
    return wv

def get_sims(term_str, n=10):
    """Get the top n words for a given word based on cosine similarity"""
    wv = get_word_vector(term_str)
    sims = cosine_similarity(SGM.values, wv)
    return pd.DataFrame(sims, index=SGM.index, columns=['score']).sort_values('score',ascending=False).head(n)

def get_nearest_vector(wv, method='cosine'):
    """Get the nearest word vector to a given word vector"""
    if method == 'cosine':
        sims = cosine_similarity(SGM.values, wv)
    elif method == 'euclidean':
        sims = euclidean_distances(SGM.values, wv)
    else:
        print('Invalid method {}; defaulting to cosine.'.format(method))
        sims = cosine_similarity(SGM.values, wv)
    return pd.DataFrame(sims, index=SGM.index, columns=['score']).sort_values('score',ascending=False).head(2).iloc[1]

def get_analogy(a, b, d, method='cosine'):
    """Infer missing analogical term"""
    try:
        A = get_word_vector(a)
        B = get_word_vector(b)
        D = get_word_vector(d)
        C = np.add(np.subtract(A, B), D)
        X = get_nearest_vector(C, method=method)
        return X.name
    except ValueError as e:
        print(e)
        return None

In [71]:
def get_opposite(a, b, method='cosine'):
    A = get_word_vector(a)
    B = get_word_vector(b)
    C = np.subtract(A, B)
    X = get_nearest_vector(C, method=method)
    return X.name

In [73]:
get_opposite('cat','dog', method='euclidean')

'said'

In [74]:
get_analogy('dog', 'male', 'female', method='euclidean')

'said'

# Save

In [52]:
with sqlite3.connect(db_file) as db:
    tx.put_to_db(db, skipgrams, 'skipgrams', index=False, if_exists='replace')
    tx.put_to_db(db, WE, 'wordembeddings', index=True, if_exists='replace')