# Synopsis

We use some pretrained word vectors from [the developers of GloVe](https://nlp.stanford.edu/projects/glove/).

# Configuration

In [304]:
# Path to the SQLite database file that is opened with sqlite3.connect() to load the GloVe table.
db_file = '../../data/glove.db'

# Libraries

In [305]:
import pandas as pd
import numpy as np
import sqlite3
from sklearn.metrics.pairwise import cosine_similarity

# Process

## Import GloVe data

In [359]:
# Load the whole glove200 table, using the term_str column as the row index.
# A sqlite3 connection used as a context manager only commits or rolls back
# the transaction; it does not close the connection. Close it explicitly so
# the database handle is released once the table has been read.
db = sqlite3.connect(db_file)
try:
    glove = pd.read_sql("SELECT * FROM glove200", db, index_col='term_str')
finally:
    db.close()

In [360]:
glove.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,191,192,193,194,195,196,197,198,199,200
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
the,-0.071549,0.093459,0.023738,-0.090339,0.056123,0.32547,-0.39796,-0.092139,0.061181,-0.1895,...,0.1218,0.19957,-0.20303,0.34474,-0.24328,0.13139,-0.0088767,0.33617,0.030591,0.25577
",",0.17651,0.29208,-0.0020768,-0.37523,0.0049139,0.23979,-0.28893,-0.014643,-0.10993,0.15592,...,-0.32582,0.19153,-0.15469,-0.14679,0.046971,0.032325,-0.22006,-0.20774,-0.23189,-0.10814
.,0.12289,0.58037,-0.069635,-0.50288,0.10503,0.39945,-0.38635,-0.084279,0.12219,0.080312,...,-0.035236,0.17688,-0.0536,0.0070031,-0.033006,-0.080021,-0.24451,-0.039174,-0.16236,-0.096652
of,0.052924,0.25427,0.31353,-0.35613,0.029629,0.51034,-0.10716,0.15195,0.057698,0.06149,...,-0.040886,0.3894,-0.10509,0.23372,0.096027,-0.30324,0.24488,-0.086254,-0.41917,0.46496
to,0.57346,0.5417,-0.23477,-0.3624,0.4037,0.11386,-0.44933,-0.30991,-0.0053411,0.58426,...,-0.27915,0.43742,-0.31237,0.13194,-0.33278,0.18877,-0.23422,0.54418,-0.23069,0.34947


### Convert cells to floats

Ideally this conversion would have been done when the text files were imported into the database; instead, the columns are cast to floats here, after loading.

In [361]:
# Cast every column to 64-bit floats.
glove = glove.astype(np.float64)

## Remove non-words

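The vocabulary contains many tokens that are not words (for example punctuation such as `,` and `.`). These may have been useful when the vectors were generated, but we don't need them in our queries, so we keep only the terms that consist entirely of lowercase letters a–z.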

In [362]:
# Keep only the rows whose index label (term_str) is made up entirely of
# lowercase a-z letters; the mask is computed directly on the index.
glove = glove[glove.index.str.match(r'^[a-z]+$')]

In [363]:
glove.shape

(317756, 200)

## Define some semantic functions

In [311]:
def get_word_vector(term_str):
    """Look up a term in ``glove`` and return its values as a one-row 2-D array.

    The values selected by ``glove.loc[term_str]`` are laid out as a single
    row (shape ``(1, k)``), which is the shape passed on to the cosine
    similarity function elsewhere in this notebook.
    """
    values = glove.loc[term_str].to_numpy()
    return values.reshape(1, -1)

def get_sims(term_str, n=10):
    """Return the n terms whose vectors are most similar to term_str's vector.

    Every row of ``glove`` is scored against the term's vector with cosine
    similarity. The result is a DataFrame indexed like ``glove`` with a
    single ``score`` column, sorted by descending score and cut to the
    first ``n`` rows.
    """
    query = get_word_vector(term_str)
    scores = cosine_similarity(glove.values, query)
    ranked = pd.DataFrame(scores, index=glove.index, columns=['score'])
    ranked = ranked.sort_values('score', ascending=False)
    return ranked.head(n)

def get_nearest_vector(wv):
    """Return the second-ranked match for a word vector.

    Every row of ``glove`` is scored against ``wv`` with cosine similarity
    and the rows are ranked by descending score. The row in second place is
    returned, so the top-ranked hit is always skipped.

    ``wv`` is expected to be a single-row 2-D array such as the one
    ``get_word_vector`` produces. The return value is a pandas Series whose
    ``name`` is the matched term and whose only entry is ``score``.
    """
    scores = cosine_similarity(glove.values, wv)
    ranking = pd.DataFrame(scores, index=glove.index, columns=['score'])
    top_two = ranking.sort_values('score', ascending=False).head(2)
    return top_two.iloc[1]

def get_analogy(a, b, d):
    """Infer the missing term of an analogy by vector arithmetic.

    Computes ``a - b + d`` on the word vectors of the three terms and
    returns the term that ``get_nearest_vector`` reports for the result.
    The result is not filtered against ``a``, ``b`` or ``d``, so it can be
    one of the input terms.

    Args:
        a: Term whose vector is the starting point.
        b: Term whose vector is subtracted.
        d: Term whose vector is added.

    Returns:
        The matched term string, or ``None`` (after printing a message) if
        a term is not in the vocabulary or the similarity computation
        raises a ``ValueError``.
    """
    try:
        target = get_word_vector(a) - get_word_vector(b) + get_word_vector(d)
        return get_nearest_vector(target).name
    except KeyError as e:
        # DataFrame.loc raises KeyError (not ValueError) for a label that is
        # missing from the index, i.e. an out-of-vocabulary term.
        print(f"Term not in vocabulary: {e}")
        return None
    except ValueError as e:
        print(e)
        return None

## Test similarity function

In [345]:
# Terms most similar to 'queen' by cosine similarity (get_sims defaults to n=10 rows).
QUEEN = get_sims('queen')

In [346]:
QUEEN

Unnamed: 0_level_0,score
term_str,Unnamed: 1_level_1
queen,1.0
princess,0.794724
king,0.750769
elizabeth,0.735571
royal,0.706503
lady,0.70448
victoria,0.685376
monarch,0.668326
crown,0.668056
prince,0.664051


## Test analogy function

### Are cats female?

In [364]:
# Evaluates dog - male + female and looks up the matching term.
get_analogy('dog','male','female')

'dogs'

In [365]:
# Evaluates bird - tree + man and looks up the matching term.
get_analogy('bird','tree','man')

'man'

### Kings and Queens

In [366]:
# Evaluates king - male + female and looks up the matching term.
get_analogy('king','male','female')

'queen'

In [367]:
# Evaluates queen - female + male and looks up the matching term.
get_analogy('queen','female','male')

'king'

In [368]:
# Evaluates queen - king + female and looks up the matching term.
get_analogy('queen','king','female')

'male'

### Left and right

In [372]:
# Evaluates left - right + sin and looks up the matching term.
get_analogy('left','right','sin')

'binned'

In [370]:
# Evaluates left - right + female and looks up the matching term.
get_analogy('left','right','female')

'male'

In [371]:
# Evaluates male - right + female and looks up the matching term.
get_analogy('male','right','female')

'male'