# Semantic Similarity

## Using GloVe Embeddings

In [1]:
from scipy.spatial.distance import cosine, euclidean
import numpy as np
from gensim.models import KeyedVectors

In [2]:
# Load the pre-trained Stanford GloVe vectors. The file name suggests the
# 6B-token / 300-dimensional release, already converted to word2vec text
# format (hence binary=False) -- TODO confirm how the file was produced.
filename = 'data/glove.6B.300d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(
    filename,
    binary=False,
)

In [3]:
def get_mean_vector(word2vec_model, words):
    """Return the element-wise mean of the word vectors of a sentence.

    Parameters
    ----------
    word2vec_model : mapping-like
        Object that returns a 2-D array of vectors when indexed with a list
        of words (e.g. the gensim ``KeyedVectors`` loaded in this notebook).
    words : str
        Whitespace-separated sentence.

    Returns
    -------
    numpy.ndarray or list
        The mean vector, or an empty list when ``words`` contains no tokens
        (empty or whitespace-only string).

    Notes
    -----
    Any lookup error raised by the model for an out-of-vocabulary word is
    propagated unchanged.
    """
    # split() with no argument collapses runs of whitespace and drops
    # leading/trailing blanks, so no empty-string tokens reach the lookup.
    tokens = words.split()
    if not tokens:
        return []
    return np.mean(word2vec_model[tokens], axis=0)

In [4]:
# Collapse each sentence into one GloVe-based vector (mean of its word vectors).
mean1, mean2 = (
    get_mean_vector(model, sentence)
    for sentence in ('he is fitness freak', 'he love gym')
)

In [5]:
# scipy's cosine() is a *distance* (1 - cosine similarity), so subtracting it
# from 1 recovers the similarity; higher means the sentences are closer.
1 - cosine(mean1, mean2)

0.698372483253479

In [6]:
# Euclidean (L2) distance between the two mean vectors; unlike the cosine
# similarity above, a *lower* value means the sentences are closer.
euclidean(mean1, mean2)

3.176651954650879

## Using spaCy

In [7]:
import spacy

# Load the large English spaCy pipeline. The package must already be installed
# (e.g. `python -m spacy download en_core_web_lg`), otherwise load() fails.
nlp = spacy.load("en_core_web_lg")

In [19]:
# Run both sentences through the spaCy pipeline and compare the resulting docs.
doc1, doc2 = (nlp(text) for text in ("he is fitness freak", "he love gym"))
print(doc1.similarity(doc2))  # a higher score means the texts are more similar

0.8354100620391012


In [23]:
# Customized similarity method

def get_mean_vector_spacy(word2vec_model, words):
    """Return the mean of the spaCy token vectors of a sentence.

    Parameters
    ----------
    word2vec_model : callable
        Callable that turns a string into an iterable of tokens, each with a
        ``.vector`` attribute (e.g. the spaCy pipeline ``nlp`` loaded in this
        notebook).
    words : str
        The sentence to embed.

    Returns
    -------
    numpy.ndarray or list
        Element-wise mean of the token vectors, or an empty list when
        ``words`` is empty or the pipeline produces no tokens.

    Notes
    -----
    The previous implementation looped over the *characters* of ``words``
    and returned inside the first iteration, so it never averaged anything
    itself. This version runs the pipeline once and averages per token.
    spaCy documents ``Doc.vector`` as defaulting to the average of the token
    vectors, so this should agree with it up to floating-point rounding.
    """
    if not words:
        return []

    # Run the pipeline a single time; it handles tokenisation.
    doc = word2vec_model(words)
    token_vectors = [token.vector for token in doc]
    if not token_vectors:
        return []
    return np.mean(token_vectors, axis=0)

In [93]:
# Use the spaCy-specific helper here: get_mean_vector() indexes the model with
# `model[words]`, which the spaCy pipeline object `nlp` does not support.
vec1 = get_mean_vector_spacy(nlp, 'he is fitness freak')
vec2 = get_mean_vector_spacy(nlp, 'he love gym')

In [94]:
# Cosine similarity of the two spaCy-based sentence vectors (1 - cosine
# distance). The recorded value closely matches the doc1.similarity(doc2)
# output printed earlier in this notebook.
1 - cosine(vec1, vec2)

0.8354101181030273