# Synopsis

We create word vectors by applying a singular value decomposition to a pointwise mutual information word-word matrix. 

# Configuration

In [1]:
proj = '/Users/rca2t/Dropbox/Courses/DSI/DS5559/UVA_DSI_REPO'
pwd = '{}/play/wordembedding'.format(proj)
db_file = "{}/data/novels.db".format(proj)
lib = "{}/lib".format(proj)

# Word Embedding
window = 3

# Libraries

In [82]:
import pandas as pd
import numpy as np
import scipy as sp
import sqlite3
import sys; sys.path.append(lib)
from textman import textman as tx

# Pragmas

In [3]:
%matplotlib inline

# Process

## Extract skipgrams from tokens with SQL

We grab a sliding bag of tokens without stopwords and proper nouns.

### Build SQL query from configs

In [4]:
in_clause = ', '.join(['x.token_num + {0}, x.token_num - {0}'.format(i) for i in range(1, window + 1)])
pos_clause = "AND pos NOT LIKE 'NNP%' " # Remove proper nouns

In [5]:
sql = """
WITH mytoken(author, book, chapter, para_num, sent_num,token_num,term_str,term_id) 
AS (
    SELECT author, book, chapter, para_num, sent_num,token_num,term_str,term_id
    FROM token 
    WHERE term_id IN (SELECT term_id FROM vocab WHERE stop = 0) 
        AND term_str is not NULL
        {}       
)

SELECT x.term_str as target, y.term_str as probe, (y.token_num - x.token_num) AS dist
FROM mytoken x 
JOIN mytoken y USING(author, book, chapter, para_num, sent_num)
WHERE y.token_num IN ({})
ORDER BY target, dist, probe
""".format(pos_clause, in_clause)

In [6]:
print(sql)


WITH mytoken(author, book, chapter, para_num, sent_num,token_num,term_str,term_id) 
AS (
    SELECT author, book, chapter, para_num, sent_num,token_num,term_str,term_id
    FROM token 
    WHERE term_id IN (SELECT term_id FROM vocab WHERE stop = 0) 
        AND term_str is not NULL
        AND pos NOT LIKE 'NNP%'        
)

SELECT x.term_str as target, y.term_str as probe, (y.token_num - x.token_num) AS dist
FROM mytoken x 
JOIN mytoken y USING(author, book, chapter, para_num, sent_num)
WHERE y.token_num IN (x.token_num + 1, x.token_num - 1, x.token_num + 2, x.token_num - 2, x.token_num + 3, x.token_num - 3)
ORDER BY target, dist, probe



### Pull from DB

In [7]:
skipgrams = tx.get_sql(sql, db_file)

In [8]:
skipgrams.head(10)

Unnamed: 0,target,probe,dist
0,aback,said,-3
1,aback,completely,-2
2,aback,considerably,-2
3,aback,little,-2
4,aback,utterly,-2
5,aback,taken,-1
6,aback,taken,-1
7,aback,taken,-1
8,aback,taken,-1
9,aback,taken,-1


### Add Skigram weights (as GloVe does)

In [9]:
# skipgrams['glove_weight'] = np.round(np.abs(1 / skipgrams['dist']), 2)

In [10]:
# skipgrams.head(10)

## Get Unigram Probabilities

We have already computed these in the vocab table.

### Import vocab table

In [11]:
vocab = tx.get_table('vocab', db_file, index_col=['term_id'])
vocab = vocab[vocab.stop == 0]

In [12]:
vocab.sort_values('p', ascending=False).head()

Unnamed: 0_level_0,term_str,n,p,port_stem,stop,df,idf,tfidf_sum,tfidf_mean,tfidf_max
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
20770,said,6370,0.004245,said,0,308,0.016599,105.737298,0.330429,2.191103
15714,mr,2860,0.001906,mr,0,196,0.212894,608.876574,1.902739,48.326917
24409,time,2615,0.001743,time,0,317,0.004091,10.697223,0.033429,0.224989
14660,man,2383,0.001588,man,0,294,0.036803,87.70071,0.274065,2.355369
13584,know,2302,0.001534,know,0,299,0.029479,67.860175,0.212063,0.9728


### Get $P(x)$

In [13]:
p_x = vocab[['term_str','p']].reset_index().set_index('term_str')['p']

In [14]:
p_x.sort_values(ascending=False).head()

term_str
said    0.004245
mr      0.001906
time    0.001743
man     0.001588
know    0.001534
Name: p, dtype: float64

In [15]:
# skipgrams.groupby('target').target.count() / skipgrams.target.sum()

## Compute Normalized PMI for Skipgrams

### PMI

$log \dfrac{P(x,y)}{P(x)P(y)}$

### NMPI

$\dfrac{log\dfrac{P(x,y)}{P(x)P(y)}}{-log P(x,y)}$

See [G. Bouma 2009, eq. 7](https://pdfs.semanticscholar.org/1521/8d9c029cbb903ae7c729b2c644c24994c201.pdf)

### Create compressed skipgram table

In [16]:
skipgrams2 = skipgrams.groupby(['target','probe']).probe.count()\
    .to_frame().rename(columns={'probe':'n'})\
    .reset_index().set_index(['target','probe'])

In [18]:
skipgrams2.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,n
target,probe,Unnamed: 2_level_1
aback,completely,1
aback,considerably,1
aback,kept,1
aback,little,1
aback,murmured,1
aback,said,1
aback,something,1
aback,taken,8
aback,utterly,1
abaft,aftest,1


### Compute $P(x,y)$

In [19]:
N = skipgrams2.n.sum()

In [20]:
skipgrams2['p_xy'] = skipgrams2.n / N

In [21]:
skipgrams2.head(10)

### Compute $PMI(x;y)$

In [22]:
skipgrams2['pmi_xy'] = skipgrams2.apply(lambda row: np.log(row.p_xy / (p_x.loc[row.name[0]] * p_x.loc[row.name[1]])), 1)

In [23]:
skipgrams2.sort_values('pmi_xy', ascending=False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p_xy,pmi_xy
target,probe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
sonne,twentie,1,1e-06,14.635063
arca,nummos,1,1e-06,14.635063
rearranged,deft,1,1e-06,14.635063
petit,beau,1,1e-06,14.635063
growths,mushroom,1,1e-06,14.635063
touters,skeer,1,1e-06,14.635063
petite,pauvre,1,1e-06,14.635063
unwept,unwatched,1,1e-06,14.635063
unwept,uncared,1,1e-06,14.635063
unwept,bereft,1,1e-06,14.635063


In [24]:
skipgrams2['npmi_xy'] = skipgrams2.pmi_xy / -( np.log(skipgrams2.p_xy) )

In [27]:
skipgrams2.sort_values('npmi_xy', ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p_xy,pmi_xy,npmi_xy
target,probe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
computer,virus,4,4e-06,13.941916,1.122434
virus,computer,4,4e-06,13.941916,1.122434
ihre,unvollkommen,2,2e-06,14.635063,1.115962
fracto,nunc,2,2e-06,14.635063,1.115962
nunc,fracto,2,2e-06,14.635063,1.115962


### Keep only positives

Changed since lab.

In [38]:
skipgrams2.loc[skipgrams2.npmi_xy < 0, 'pnpmi_xy'] = 0
skipgrams2.loc[skipgrams2.npmi_xy >= 0, 'pnpmi_xy'] =  skipgrams2.npmi_xy

In [40]:
skipgrams2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p_xy,pmi_xy,npmi_xy,pnpmi_xy
target,probe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
aback,completely,1,1e-06,8.068391,0.584351,0.584351
aback,considerably,1,1e-06,8.800252,0.637356,0.637356
aback,kept,1,1e-06,6.788864,0.491681,0.491681
aback,little,1,1e-06,4.887703,0.35399,0.35399
aback,murmured,1,1e-06,8.248184,0.597372,0.597372


## Create PNPMI Matrix

In [76]:
SGM = skipgrams2.npmi_xy.unstack().fillna(0)

In [77]:
SGM.head()

probe,aback,abaft,abandon,abandoned,abandoning,abandons,abasement,abashed,abate,abated,...,zigzagged,zone,zoology,zoöphagous,zoöphagy,zum,zusammen,à,ça,émeutes
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aback,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abaft,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abandon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abandoned,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abandoning,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
SGM.loc['war'].sort_values(ascending=False).head()

probe
intestine       0.751217
dissipations    0.751217
informal        0.751217
devastations    0.751217
signalize       0.751217
Name: war, dtype: float64

In [80]:
skipgrams2.loc['intestine'].sort_values('n', ascending=False)

Unnamed: 0_level_0,n,p_xy,pmi_xy,npmi_xy,pnpmi_xy
probe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
consequence,1,1e-06,9.66525,0.700003,0.700003
war,1,1e-06,10.372383,0.751217,0.751217


## SVD

In [81]:
sparse = sp.sparse.csr_matrix(SGM.values)

In [83]:
SVD = sp.sparse.linalg.svds(sparse, k=256)

In [84]:
U, S, V = SVD

In [227]:
U.shape, S.shape, V.shape

((24290, 256), (256,), (256, 24290))

In [85]:
word_vecs = U + V.T
word_vecs_norm = word_vecs / np.sqrt(np.sum(word_vecs * word_vecs, axis=1, keepdims=True))

In [86]:
WE = pd.DataFrame(word_vecs_norm, index=SGM.index)
WE.index.name = 'word_str'

In [87]:
WE.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
word_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aback,7.715719e-15,0.040194,-2.137387e-14,-0.020119,0.075912,-0.052124,6.505764e-14,-6.22002e-14,-8.544396e-15,0.071704,...,-5.1524e-16,0.053758,-0.027694,3.907237e-15,-0.014615,0.045539,-0.072116,-0.047603,-0.021114,-0.040583
abaft,3.217065e-14,0.048869,-2.499445e-14,-0.02325,0.006422,-0.159226,2.079755e-13,8.049651e-15,2.891096e-14,-0.022019,...,-1.090057e-15,-0.002207,0.052773,-7.153498e-15,0.003823,-0.00814,0.026981,-0.050486,0.05601,-0.014483
abandon,-3.133372e-14,-0.077616,1.568804e-14,0.149301,-0.212866,0.077754,-1.017711e-13,1.526976e-14,-1.192818e-14,0.027305,...,-6.062371e-16,0.050581,-0.002287,-1.694287e-16,0.039436,-0.014823,0.027164,0.048715,-0.090397,-0.06733
abandoned,6.078035e-15,-0.067577,2.719503e-14,0.06094,-0.247509,-0.028677,5.133289e-14,3.403554e-14,5.954586e-15,-0.011379,...,-1.147347e-15,0.065895,0.111213,-1.437089e-14,-0.024864,-0.006603,-0.021847,0.014058,-0.077565,-0.077343
abandoning,-9.582971e-14,-0.033249,5.514695e-14,-0.068586,-0.051109,0.108353,-1.291942e-13,5.265265e-14,-3.730673e-15,-0.052378,...,-2.821518e-16,0.035204,0.121501,-1.61767e-14,0.031339,0.078384,0.024014,0.031338,-0.048161,-0.050376


In [88]:
def word_sims(word, n=10):
    try:
        sims = SGM.loc[word].sort_values(ascending=False).head(n).reset_index().values
        return sims
    except KeyError as e:
        print('Word "{}" not in vocabulary.'.format(word))
        return None

In [89]:
print(word_sims('happy'))

[['grovel' 0.6401623154332218]
 ['transit' 0.6401623154332218]
 ['anniversary' 0.6401623154332218]
 ['prosperous' 0.6041280521921256]
 ['consoles' 0.589961338701885]
 ['supremely' 0.589961338701885]
 ['recurring' 0.589961338701885]
 ['jocund' 0.5397603619705483]
 ['flights' 0.5235992571699444]
 ['descendants' 0.5235992571699444]]


In [90]:
def word_sim_report(word):
    sims = word_sims(word)
    for sim_word, score in sims:
        context = ' '.join(skipgrams2.loc[sim_word].index.values.tolist()[:5])
        print("{} ({}) {}".format(sim_word.upper(), score, context))
        print('-'*80)

In [205]:
word_sim_report('woman')

LAUNDRY (0.6018448679106666) brought folded nobody room woman
--------------------------------------------------------------------------------
KINSHIP (0.6009973565882161) dreadful sense woman
--------------------------------------------------------------------------------
REFORMED (0.6009973565882161) woman
--------------------------------------------------------------------------------
MYSTIFY (0.6009973565882161) business honest woman
--------------------------------------------------------------------------------
SILHOUETTED (0.6009973565882161) woman yellow
--------------------------------------------------------------------------------
UNMENTIONABLE (0.6009973565882161) presence stood woman
--------------------------------------------------------------------------------
GRABS (0.6009973565882161) baby married woman
--------------------------------------------------------------------------------
SHOD (0.6009973565882161) elderly slip woman
-----------------------------------------

In [92]:
word_sim_report('man')

YOUNG (0.5772167789397364) abandon abject absence absolutely accepted
--------------------------------------------------------------------------------
LEGGED (0.5487605823908175) alone cross friend game high
--------------------------------------------------------------------------------
TALL (0.5348549377696777) admit age along anchor art
--------------------------------------------------------------------------------
PIEBALD (0.5342342897669777) complexion hair man twice
--------------------------------------------------------------------------------
OLD (0.531586346825827) abbey able abruptly accent account
--------------------------------------------------------------------------------
BEARDED (0.5110802281230045) background bargain black chin efficient
--------------------------------------------------------------------------------
TALLIES (0.4967572282682332) idea man
--------------------------------------------------------------------------------
REPELLANT (0.4967572282682332) m

In [93]:
word_sim_report('young')

LADY (0.6516796047854232) abbess abruptly absence accept accommodate
--------------------------------------------------------------------------------
LADIES (0.6175629322254406) absence accommodation addressed advice age
--------------------------------------------------------------------------------
CRATCHITS (0.5929933764406247) became chairs danced fetch got
--------------------------------------------------------------------------------
MAN (0.5772167789397364) abandons abhorrence aboard abominable abruptly
--------------------------------------------------------------------------------
LADYS (0.5703715029747964) advice affections alarm annoyance appeared
--------------------------------------------------------------------------------
SMIRKING (0.5564600455343252) rejoined young
--------------------------------------------------------------------------------
GENTLEMANLIKE (0.5564600455343252) man young
--------------------------------------------------------------------------------

## Define some semantic functions

Added after lecture.

In [225]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

def get_word_vector(term_str):
    """Get a numpy array from the glove matrix and shape for input into cosine function"""
    return SGM.loc[term_str].values.reshape(-1, 1).T

def get_nearest_vector(wv, method='cosine', n=1):
    """Get the nearest word vectors to a given word vector"""
    if method == 'cosine':
        sims = cosine_similarity(SGM.values, wv)
    elif method == 'euclidean':
        eds = euclidean_distances(SGM.values, wv)
        sims = 1 - (eds/eds.max())
    else:
        print('Invalid method {}; defaulting to cosine.'.format(method))
        sims = cosine_similarity(SGM.values, wv)
    return pd.DataFrame(sims, index=SGM.index, columns=['score']).sort_values('score',ascending=False).head(n+1).iloc[1:]

def get_sims(term_str, method='cosine', n=10):
    """Get the top n words for a given word based on cosine similarity"""
    wv = get_word_vector(term_str)
    sims =  get_nearest_vector(wv, method=method, n=n) 
    return sims

def get_analogy(a, b, d, method='cosine'):
    """Infer missing analogical term"""
    try:
        A = get_word_vector(a)
        B = get_word_vector(b)
        D = get_word_vector(d)
        C = np.add(np.subtract(A, B), D)
        X = get_nearest_vector(C, method=method, n=1)
        return X.iloc[0].name
    except ValueError as e:
        print(e)
        return None

In [208]:
get_nearest_vector(get_word_vector('woman'),  n=10)

Unnamed: 0_level_0,score
word_str,Unnamed: 1_level_1
man,0.1935
young,0.158966
girl,0.157475
dear,0.154007
poor,0.152012
lady,0.151735
old,0.149467
father,0.148864
friend,0.147037
said,0.140813


In [161]:
def get_opposite(a, b, method='cosine'):
    A = get_word_vector(a)
    B = get_word_vector(b)
    C = np.subtract(A, B)
    X = get_nearest_vector(C, n=1, method=method)
    return X
#     return X.iloc[0].name

In [204]:
get_sims('woman')

Unnamed: 0_level_0,score
word_str,Unnamed: 1_level_1
man,0.1935
young,0.158966
girl,0.157475
dear,0.154007
poor,0.152012
lady,0.151735
old,0.149467
father,0.148864
friend,0.147037
said,0.140813


In [181]:
test = get_nearest_vector(get_word_vector('king'), n=10)

In [182]:
test

Unnamed: 0_level_0,score
word_str,Unnamed: 1_level_1
stamen,0.259207
intuitions,0.256672
castor,0.207423
facilities,0.205126
cigargirl,0.203362
meted,0.19337
runners,0.188342
blessedness,0.187533
spheres,0.187148
blankets,0.187052


In [153]:
get_sims('love')

Unnamed: 0_level_0,score
word_str,Unnamed: 1_level_1
affection,0.161329
loved,0.15509
pity,0.142006
tenderness,0.140762
felt,0.13161
esteem,0.131065
heart,0.12982
fear,0.129634
gratitude,0.124463
aback,


In [213]:
get_opposite('man','beard')

Unnamed: 0_level_0,score
word_str,Unnamed: 1_level_1
woman,0.172185


In [224]:
get_analogy('man','boy','girl')

'girl'

In [221]:
get_analogy('king', 'male', 'female')

'king'

In [66]:
euclidean_distances?

[0;31mSignature:[0m
[0meuclidean_distances[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mX[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mY[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mY_norm_squared[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msquared[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mX_norm_squared[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Considering the rows of X (and Y=X) as vectors, compute the
distance matrix between each pair of vectors.

For efficiency reasons, the euclidean distance between a pair of row
vector x and y is computed as::

    dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y))

This formulation has two advantages over other ways of computing distances.
First, it is computationally efficient when dealing with sparse data.
Second, if one argument varies but the other remains unch

# Save

In [52]:
with sqlite3.connect(db_file) as db:
    tx.put_to_db(db, skipgrams, 'skipgrams', index=False, if_exists='replace')
    tx.put_to_db(db, WE, 'wordembeddings', index=True, if_exists='replace')