# Cosine similarities
This notebook illustrates how to calculate and display cosine similarities between word vectors.
As input, we use a file with embeddings generated by [embiggen](https://pypi.org/project/embiggen/) together
with a file with the corresponding word labels.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import seaborn as sns
from scipy.spatial.distance import cosine
from collections import defaultdict
from sklearn.cluster import DBSCAN


The following code allows us to import the ``kcet`` module from the local repository.

In [None]:
import os
import sys
sys.path.insert(0, os.path.abspath('..'))
from kcet import Wordvec2Cosine

The constructor of ``Wordvec2Cosine`` loads the word embeddings and words into a pandas dataframe.

In [None]:
# Directory holding the embedding matrix and its word labels
# (relative to the notebook's working directory).
data_directory = 'data/embeddings_final'
if os.path.isdir(data_directory):
    embedding_file = os.path.join(data_directory, "embedding_SG_dim100_upto2020.npy")
    words_file = os.path.join(data_directory, "words_SG_upto2020.txt")
else:
    # Stop here rather than letting the loader fail on a missing path.
    raise FileNotFoundError("Could not find data directory")

# Wordvec2Cosine reads both files; get_embeddings() hands back the dataframe
# that the rest of the notebook refers to as `df`.
w2c = Wordvec2Cosine(words=words_file, embeddings=embedding_file)
df = w2c.get_embeddings()
df.head()

In [None]:
# Ask for as many neighbours as there are rows in `df`, i.e. the similarity of
# the target concept to every word in the embedding table.
target_word = 'meshd007938'  # Leukemia Leukemias
n = len(df)
all_cosine_similarities_leukemia = w2c.n_most_similar_words_df(
    target_word=target_word,
    n=n,
)

In [None]:
# The similarity column is cast to float before binning.
similarity_scores = all_cosine_similarities_leukemia["similarity"].astype('float')
plt.hist(similarity_scores)

In [None]:
# Histogram of the cosine similarities between the leukemia concept and all
# words, drawn on an explicit figure/axes pair and written to disk.
#
# No backend switch is made here: savefig works with whichever backend the
# notebook is already using, whereas forcing 'Agg' in the middle of the
# notebook would stop later plt.show() calls from rendering anything.
fig, ax = plt.subplots()

# Cast to float so the values are binned numerically, matching the quick
# histogram cell above.
similarity_scores = all_cosine_similarities_leukemia["similarity"].astype('float')

# Throwaway names for counts / bin edges so the notebook-level `n` is not
# silently rebound to an array.
_counts, _bin_edges, patches = ax.hist(
    similarity_scores, bins=10, range=(-1, 1), histtype='bar'
)

for patch in patches:
    patch.set_facecolor('r')

ax.set_xlabel('Cosine similarity')
ax.set_ylabel('Count')
fig.savefig("Count_cosine_leukemia")

## Top n most similar words
We retrieve the top n most similar words. The function ``n_most_similar_words`` returns a list of tuples,
and ``n_most_similar_words_df`` returns a Pandas dataframe.

In [None]:
# 51 rows are requested -- presumably the target word itself plus its 50
# nearest neighbours (TODO confirm against kcet).
target_word = 'meshd007938'  # Leukemia Leukemias
n = 51
top_cosine_similarities_leukemia = w2c.n_most_similar_words_df(
    target_word=target_word,
    n=n,
)

In [None]:
# Preview the first 11 rows of the leukemia ranking.
top_cosine_similarities_leukemia.head(11)

In [None]:
# NOTE(review): `v` is not assigned in any cell visible above, so on a fresh
# kernel (Restart & Run All) this cell would raise NameError. Confirm where `v`
# is meant to come from, or remove the cell. range(0, 10000, 200) yields 50 bar
# positions, so `v` has to supply 50 heights (or a single scalar).
p = plt.bar(range(0, 10000, 200), v, width=100)

In [None]:
# Same top-n query as for leukemia, this time for the BTK gene.
target_word = 'ncbigene695'  # BTK
n = 51
top_cosine_similarities_btk = w2c.n_most_similar_words_df(
    target_word=target_word,
    n=n,
)

In [None]:
# Preview the first 11 rows of the BTK ranking.
top_cosine_similarities_btk.head(11)

In [None]:
# Same top-n query, this time for breast neoplasms.
target_word = 'meshd001943'  # breast neoplasms
n = 51
top_cosine_similarities_bc = w2c.n_most_similar_words_df(
    target_word=target_word,
    n=n,
)

In [None]:
# Preview the first 11 rows of the breast-neoplasms ranking.
top_cosine_similarities_bc.head(11)

## Top n least similar words

In [None]:
# The 50 words at the opposite end of the ranking for the leukemia concept.
target_word = 'meshd007938'  # Leukemia Leukemias
n = 50
least_cosine_similarities_leukemia = w2c.n_least_similar_words_df(
    target_word=target_word,
    n=n,
)

In [None]:
# Preview the first 10 rows of the least-similar table for leukemia.
least_cosine_similarities_leukemia.head(10)

In [None]:
# The 50 least similar words for breast neoplasms.
target_word = 'meshd001943'  # breast neoplasms
n = 50
least_cosine_similarities_bc = w2c.n_least_similar_words_df(
    target_word=target_word,
    n=n,
)

In [None]:
# Preview the first 10 rows of the least-similar table for breast neoplasms.
least_cosine_similarities_bc.head(10)

In [None]:
# The 50 least similar words for the BTK gene.
target_word = 'ncbigene695'  # BTK
n = 50
least_cosine_similarities_btk = w2c.n_least_similar_words_df(
    target_word=target_word,
    n=n,
)

In [None]:
# Preview the first 10 rows of the least-similar table for BTK.
least_cosine_similarities_btk.head(10)

## n words with close to zero cosine similarity scores

In [None]:
# Up to n words whose similarity to the leukemia concept is close to zero.
target_word = 'meshd007938'  # Leukemia Leukemias
n = 50
# Threshold: absolute cosine similarity scores below e count as close to zero.
e = 0.1
close_to_zero_similarities_leukemia = w2c.n_close_to_zero_similar_words_df(
    target_word=target_word,
    n=n,
    e=e,
)

In [None]:
# Preview the first five near-zero rows for leukemia.
close_to_zero_similarities_leukemia.head(5)

In [None]:
# Number of rows actually returned for leukemia.
len(close_to_zero_similarities_leukemia)

In [None]:
# Up to n words whose similarity to breast neoplasms is close to zero.
target_word = 'meshd001943'  # breast neoplasms
n = 50
# Threshold: cosine similarity scores below e count as close to zero
# (presumably compared by absolute value -- confirm against kcet).
e = 0.1
close_to_zero_similarities_breast_neoplasms = w2c.n_close_to_zero_similar_words_df(
    target_word=target_word,
    n=n,
    e=e,
)

In [None]:
# Preview the first five near-zero rows for breast neoplasms.
close_to_zero_similarities_breast_neoplasms.head(5)

In [None]:
# Number of rows actually returned for breast neoplasms.
len(close_to_zero_similarities_breast_neoplasms)

In [None]:
# Up to n words whose similarity to the BTK gene is close to zero.
target_word = 'ncbigene695'  # BTK
n = 50
# Threshold: cosine similarity scores below e count as close to zero
# (presumably compared by absolute value -- confirm against kcet).
e = 0.1
close_to_zero_similarities_btk = w2c.n_close_to_zero_similar_words_df(
    target_word=target_word,
    n=n,
    e=e,
)

In [None]:
# Preview the first five near-zero rows for BTK.
close_to_zero_similarities_btk.head(5)

In [None]:
# Number of rows actually returned for BTK.
len(close_to_zero_similarities_btk)

## t-SNE plot of the most and least similar words

In [None]:
def embeddings_similar_words(cosine_similarities_df, n, embeddings_df=None):
    """Return the embedding rows for the words listed in a similarity table.

    The words are read from the first column of ``cosine_similarities_df``,
    rows 1 to ``n - 1`` by position. Row 0 is skipped (presumably because it
    is the target word itself in a "most similar" table -- confirm).

    Parameters
    ----------
    cosine_similarities_df : pandas.DataFrame
        Table whose first column holds the words to look up.
    n : int
        Exclusive upper bound on the row positions to read. Values larger
        than the table are clipped to its length; values <= 1 select nothing.
    embeddings_df : pandas.DataFrame, optional
        Frame indexed by word that holds the embedding vectors. Defaults to
        the notebook-level ``df`` so existing two-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        The rows of ``embeddings_df`` for the selected words, in table order.

    Raises
    ------
    KeyError
        If a selected word is missing from the embedding index.
    """
    if embeddings_df is None:
        embeddings_df = df  # notebook-level frame produced by w2c.get_embeddings()

    # One positional slice instead of a per-row loop. This also avoids indexing
    # a label-indexed row Series with the integer 0, which recent pandas
    # versions deprecate. max(..., 1) keeps a negative n from being read as
    # "count from the end" by the slice.
    stop = max(n, 1)
    words = cosine_similarities_df.iloc[1:stop, 0].tolist()
    return embeddings_df.loc[words, :]

In [None]:
def plot_tsne(df_similar_words, random_state=None):
    """Project word embeddings to 2-D with t-SNE, cluster them and plot them.

    The frame is modified in place: the columns ``clusters``, ``tSNE_1`` and
    ``tSNE_2`` are added, or overwritten if they already exist.

    Parameters
    ----------
    df_similar_words : pandas.DataFrame
        One row per word; every column other than the three derived ones
        named above is used as a t-SNE input feature.
    random_state : int or None, optional
        Seed forwarded to ``TSNE``. The default ``None`` leaves the layout
        unseeded, as before; pass an integer for a reproducible plot.

    Returns
    -------
    None
        The scatter plot is shown as a side effect.
    """
    # Keep columns written by an earlier call out of the feature matrix, so
    # calling this twice on the same frame gives a comparable projection.
    derived_columns = ["clusters", "tSNE_1", "tSNE_2"]
    feature_df = df_similar_words.drop(columns=derived_columns, errors="ignore")

    tsne_model = TSNE(learning_rate=50, n_jobs=10, random_state=random_state)
    tsne_features = tsne_model.fit_transform(feature_df)

    # We can adjust the eps to get more or fewer clusters.
    get_clusters = DBSCAN(eps=3, min_samples=10).fit_predict(tsne_features)

    # Store the cluster label and 2-D coordinates next to the embeddings so
    # seaborn can colour each point by cluster.
    df_similar_words["clusters"] = get_clusters
    df_similar_words['tSNE_1'] = tsne_features[:, 0]
    df_similar_words['tSNE_2'] = tsne_features[:, 1]
    sns.scatterplot(x="tSNE_1", y="tSNE_2", data=df_similar_words, hue="clusters")
    plt.show()

In [None]:
# Stack the most and least similar words for leukemia into one table.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
words_leukemia = pd.concat(
    [top_cosine_similarities_leukemia, least_cosine_similarities_leukemia]
)
n = len(words_leukemia)
# embeddings_similar_words reads rows 1 .. n-1, i.e. everything but the first row.
similar_words_embeddings_df = embeddings_similar_words(words_leukemia, n)
plot_tsne(similar_words_embeddings_df)

In [None]:
# Stack the most and least similar words for breast neoplasms into one table.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
words_breast_cancer = pd.concat(
    [top_cosine_similarities_bc, least_cosine_similarities_bc]
)
n = len(words_breast_cancer)
# embeddings_similar_words reads rows 1 .. n-1, i.e. everything but the first row.
similar_words_embeddings_df_bc = embeddings_similar_words(words_breast_cancer, n)
plot_tsne(similar_words_embeddings_df_bc)

In [None]:
# Stack the most and least similar words for BTK into one table.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
words_btk = pd.concat(
    [top_cosine_similarities_btk, least_cosine_similarities_btk]
)
n = len(words_btk)
# embeddings_similar_words reads rows 1 .. n-1, i.e. everything but the first row.
similar_words_embeddings_df_btk = embeddings_similar_words(words_btk, n)
plot_tsne(similar_words_embeddings_df_btk)