# Cosine similarities
This notebook illustrates how to calculate and display cosine similarities between word vectors.
As input, we use a file with embeddings generated by [embiggen](https://pypi.org/project/embiggen/) together
with a file with the corresponding word labels.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import seaborn as sns
from scipy.spatial.distance import cosine
from collections import defaultdict
from sklearn.cluster import DBSCAN


The following code allows us to import the ``kcet`` module from the local repository.

In [None]:
import os
import sys
sys.path.insert(0, os.path.abspath('..'))
from kcet import Wordvec2Cosine

The constructor of ``Wordvec2Cosine`` loads the word embeddings and words into a pandas dataframe.

In [None]:
# Locate the input files. 'data' is a relative path, so it is resolved against
# the current working directory; fail early if it is missing.
data_directory = 'data'
if not os.path.isdir(data_directory):
    raise FileNotFoundError("Could not find data directory")
embedding_file = os.path.join(data_directory, "embedding_SG_dim100_before2021_jan17.npy")
words_file = os.path.join(data_directory, "words_before2021_jan17.txt")
# Build the helper object from the embedding file and the word-label file.
w2c = Wordvec2Cosine(embeddings=embedding_file, words=words_file)
# `df` is reused further down to look up embedding rows by word label.
df = w2c.get_embeddings()
# Preview the first rows of the embedding table.
df.head()

## Top n most similar words
We retrieve the top n most similar words. The function ``n_most_similar_words`` returns a list of tuples,
and ``n_most_similar_words_df`` returns a Pandas dataframe.

In [None]:
# Most similar words for BTK.
target_word = 'ncbigene695'  # BTK
n = 50
top_cosine_similarities_btk = w2c.n_most_similar_words_df(target_word=target_word, n=n)
# Only the last expression of a cell is rendered automatically, so a bare
# `.head()` in the middle of the cell is silently discarded. Display it explicitly.
display(top_cosine_similarities_btk.head())

# Most similar words for leukemia.
target_word = 'meshd007938'  # Leukemia Leukemias
n = 50
top_cosine_similarities_leukemia = w2c.n_most_similar_words_df(target_word=target_word, n=n)

In [None]:
# Preview the first rows of the most-similar-words table for leukemia.
top_cosine_similarities_leukemia.head()

In [None]:
# Query the 50 most similar words for breast neoplasms.
target_word = 'meshd001943'  # breast neoplasms
n = 50
top_cosine_similarities_bc = w2c.n_most_similar_words_df(target_word=target_word, n=n)

In [None]:
# Preview the first ten rows of the most-similar-words table for breast neoplasms.
top_cosine_similarities_bc.head(n=10)

## Top n least similar words

In [None]:
# Query the 50 least similar words for leukemia.
target_word = 'meshd007938'  # Leukemia Leukemias
n = 50
least_cosine_similarities_leukemia = w2c.n_least_similar_words_df(target_word=target_word, n=n)

In [None]:
# Preview the first rows of the least-similar-words table for leukemia.
least_cosine_similarities_leukemia.head()

In [None]:
# Query the 50 least similar words for breast neoplasms.
target_word = 'meshd001943'  # breast neoplasms
n = 50
least_cosine_similarities_bc = w2c.n_least_similar_words_df(target_word=target_word, n=n)

In [None]:
# Preview the first rows of the least-similar-words table for breast neoplasms.
least_cosine_similarities_bc.head()

## t-SNE plot of the most and least similar words

In [None]:
def embeddings_similar_words(cosine_similarities_df, n, embeddings_df=None):
    """Look up the embedding vectors of the first ``n`` words of a similarity table.

    Parameters
    ----------
    cosine_similarities_df : pandas.DataFrame
        Table whose first column holds the word labels, one word per row.
    n : int
        Number of leading rows to use. If ``n`` exceeds the number of rows,
        all rows are used; a non-positive ``n`` selects nothing.
    embeddings_df : pandas.DataFrame, optional
        Embedding table indexed by word label. Defaults to the notebook-level
        ``df`` so existing two-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        The embedding rows of the selected words, in the order the words
        appear in ``cosine_similarities_df``.

    Raises
    ------
    KeyError
        If a selected word is not present in the embedding index.
    """
    if embeddings_df is None:
        embeddings_df = df  # fall back to the embedding table loaded earlier in the notebook
    # Take the first column positionally in one step. Indexing a row Series with
    # the integer 0 relies on a positional fallback that pandas has deprecated.
    words = cosine_similarities_df.iloc[:max(n, 0), 0].tolist()
    return embeddings_df.loc[words, :]

In [None]:
def plot_tsne(df, eps=3, min_samples=10, random_state=None):
    """Project embeddings to 2-D with t-SNE, cluster the projection with DBSCAN and plot it.

    Parameters
    ----------
    df : pandas.DataFrame
        Embedding vectors, one row per word. The frame is not modified.
    eps : float, optional
        DBSCAN ``eps`` applied to the 2-D t-SNE coordinates; adjust it to get
        more or fewer clusters.
    min_samples : int, optional
        DBSCAN ``min_samples``.
    random_state : int or None, optional
        Seed forwarded to t-SNE. Pass an int to get a reproducible layout.

    Returns
    -------
    None
        The scatter plot is shown as a side effect.
    """
    tsne_model = TSNE(learning_rate=50, n_jobs=10, random_state=random_state)
    tsne_features = tsne_model.fit_transform(df)
    cluster_labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(tsne_features)
    # Collect the plotting columns in a separate frame. The original wrote to an
    # undefined name (`df_similar_words`), and writing into `df` itself would feed
    # these columns back into t-SNE if the same frame were plotted again.
    plot_df = pd.DataFrame({
        "tsne_2d_one": tsne_features[:, 0],
        "tsne_2d_two": tsne_features[:, 1],
        "clusters": cluster_labels,
    })
    # Each point is one term placed by its t-SNE coordinates and coloured by
    # its DBSCAN cluster label.
    sns.scatterplot(x="tsne_2d_one", y="tsne_2d_two", data=plot_df, hue="clusters")
    plt.show()

In [None]:
# Stack the most and least similar words for leukemia into one table.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
words_leukemia = pd.concat([top_cosine_similarities_leukemia, least_cosine_similarities_leukemia])
n = len(words_leukemia)
similar_words_embeddings_df = embeddings_similar_words(words_leukemia, n)
plot_tsne(similar_words_embeddings_df)

In [None]:
# Stack the most and least similar words for breast neoplasms into one table.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
words_breast_cancer = pd.concat([top_cosine_similarities_bc, least_cosine_similarities_bc])
n = len(words_breast_cancer)
similar_words_embeddings_df_bc = embeddings_similar_words(words_breast_cancer, n)
plot_tsne(similar_words_embeddings_df_bc)