# Synopsis

We use some pretrained word vectors from [the developers of GloVe](https://nlp.stanford.edu/projects/glove/).

# Configuration

In [304]:
# Path to the SQLite database file that is opened below to read the GloVe table.
db_file = '../../data/glove.db'

# Libraries

In [305]:
import pandas as pd
import numpy as np
import sqlite3
from sklearn.metrics.pairwise import cosine_similarity

# Process

## Import GloVe data

In [3]:
# Load the whole glove50 table into a DataFrame indexed by term_str.
# NOTE: using a sqlite3 connection as a context manager only commits or
# rolls back the open transaction; it does NOT close the connection.
# Close it explicitly so the database handle is released after the read.
db = sqlite3.connect(db_file)
try:
    glove = pd.read_sql("SELECT * FROM glove50", db, index_col='term_str')
finally:
    db.close()

### Convert cells to floats

This should have been done when the text files were imported into the database.

In [200]:
# Cast every column to float so the values can be fed to numeric routines.
glove = glove.astype('float')

## Remove non-words

The vocabulary contains many tokens that are not ordinary words. These may have been useful for generating the features, but we don't need them in our queries, so we keep only terms made up entirely of lowercase letters a–z.

In [201]:
# Keep only the rows whose term (the index label) matches the lowercase
# a-z pattern; the mask is computed on the index itself, so the index
# stays in place throughout.
glove = glove[glove.index.str.match(r'^[a-z]+$')]

## Define some semantic functions

In [308]:
def get_word_vector(term_str):
    """Look up a term in the glove matrix and return it as a single-row 2-D array.

    The row labelled ``term_str`` is pulled out of the module-level ``glove``
    frame and reshaped so that it has one row, which is the layout the
    cosine similarity function is called with elsewhere in this notebook.
    """
    row_values = glove.loc[term_str].values
    # One row, as many columns as there are values.
    return row_values.reshape(1, -1)

In [309]:
def get_sims(term_str, n=10):
    """Return the n highest-scoring terms for a word, ranked by cosine similarity.

    The result is a DataFrame indexed like ``glove`` with a single ``score``
    column, sorted from highest to lowest score and cut to the first ``n`` rows.
    """
    query = get_word_vector(term_str)
    # Similarity of every row of the glove matrix against the query vector.
    scores = cosine_similarity(glove.values, query)
    table = pd.DataFrame(scores, index=glove.index, columns=['score'])
    ranked = table.sort_values('score', ascending=False)
    return ranked.head(n)

In [310]:
def get_nearest_vector(wv):
    """Return the second-ranked row by cosine similarity to a word vector.

    Every row of ``glove`` is scored against ``wv``; the rows are sorted by
    descending score and the row at position 1 is returned as a Series whose
    ``name`` is the term and whose ``score`` entry is the similarity.
    The top-ranked row (position 0) is deliberately skipped.
    """
    scores = cosine_similarity(glove.values, wv)
    table = pd.DataFrame(scores, index=glove.index, columns=['score'])
    top_two = table.sort_values('score', ascending=False).head(2)
    # Position 0 is the best match; the caller gets the runner-up.
    return top_two.iloc[1]

In [311]:
def get_analogy(a, b, d):
    """Infer the missing analogical term by vector arithmetic.

    Builds the vector ``vec(a) - vec(b) + vec(d)`` and returns the term
    that ``get_nearest_vector`` selects for it.

    Args:
        a: Term whose vector is the starting point.
        b: Term whose vector is subtracted.
        d: Term whose vector is added.

    Returns:
        The selected term (the ``name`` of the row returned by
        ``get_nearest_vector``), or ``None`` if a lookup or the similarity
        computation fails; in that case a message is printed.
    """
    try:
        A = get_word_vector(a)
        B = get_word_vector(b)
        D = get_word_vector(d)
        C = np.add(np.subtract(A, B), D)
        X = get_nearest_vector(C)
    except KeyError as e:
        # A label-based lookup of a term that is not in the index raises
        # KeyError, not ValueError; handle it the same best-effort way
        # instead of letting the call crash.
        print(f"Term not in vocabulary: {e}")
        return None
    except ValueError as e:
        print(e)
        return None
    return X.name

## Test similarity function

In [312]:
QUEEN = get_sims('queen')

In [313]:
QUEEN

term_str,score
queen,1.0
princess,0.851517
lady,0.805061
elizabeth,0.787304
king,0.783904
prince,0.782186
coronation,0.769278
consort,0.76261
royal,0.744286
crown,0.738265


## Test analogy function

### Are cats female?

In [315]:
get_analogy('dog','male','female')

'cat'

In [331]:
get_analogy('bird','fly','dog')

'cat'

### Kings and Queens

In [316]:
get_analogy('king','male','female')

'prince'

In [317]:
get_analogy('queen','female','male')

'princess'

In [318]:
get_analogy('queen','king','female')

'male'

### Left and right

In [322]:
get_analogy('left','right','bad')

'worse'

In [323]:
get_analogy('left','right','female')

'male'

In [324]:
get_analogy('male','right','female')

'male'