# Cosine similarities_2
This notebook illustrates how to calculate and display cosine similarities between word vectors.
As input, we use a file with embeddings generated by [embiggen](https://pypi.org/project/embiggen/) together
with a file with the corresponding word labels. This notebook specifically illustrates the words with the highest cosine similarity scores to leukemia and ovarian neoplasms. We also show a 3D scatter plot of the top 10 words with the highest cosine similarity to leukemia together with 10 random words from our corpus.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import seaborn as sns
from scipy.spatial.distance import cosine
from collections import defaultdict
from sklearn.cluster import DBSCAN


The following code allows us to import the ``kcet`` module from the local repository.

In [None]:
import os
import sys
# Put the parent of the current working directory at the front of the module
# search path so that the `kcet` import below is looked up there first.
sys.path.insert(0, os.path.abspath('..'))
from kcet import Wordvec2Cosine

The constructor of ``Wordvec2Cosine`` loads the word embeddings and words into a pandas dataframe.

In [None]:
# Directory that holds the embedding matrix and the matching word list.
data_directory = 'data/embeddings_final'
if not os.path.isdir(data_directory):
    raise FileNotFoundError("Could not find data directory")

# Resolve both input files relative to the data directory in one step.
embedding_file, words_file = (
    os.path.join(data_directory, file_name)
    for file_name in ("embedding_SG_dim100_upto2020.npy", "words_SG_upto2020.txt")
)

# Wordvec2Cosine reads both files; get_embeddings() exposes them as a dataframe.
w2c = Wordvec2Cosine(words=words_file, embeddings=embedding_file)
df = w2c.get_embeddings()
df.head()

## Get all cosine similarities between leukemia and all words
We retrieve the top n most similar words. The function ``n_most_similar_words`` returns a list of tuples,
and ``n_most_similar_words_df`` returns a Pandas dataframe.

In [None]:
target_word = 'meshd007938'  # Leukemia Leukemias
# Request as many rows as the embeddings table has, i.e. similarities to every word.
n = len(df)
all_cosine_similarities_leukemia = w2c.n_most_similar_words_df(
    target_word=target_word,
    n=n,
)

In [None]:
# Preview the first 6 rows of the leukemia similarity table.
# NOTE(review): the original remark here said "top 5 words"; presumably the first row
# is the target word itself followed by its 5 most similar words — confirm.
all_cosine_similarities_leukemia.head(n=6)

## Random words and their cosine similarities to leukemia

In [None]:
import random

num_rand_numbers = 10  # number of randomly selected words

# Draw `num_rand_numbers` distinct row positions in [0, number of rows).
# random.sample guarantees uniqueness and never returns an out-of-range value,
# whereas random.randint(0, df.shape[0]) includes the upper bound and could
# therefore produce an index one past the last row.
random_indices = random.sample(range(df.shape[0]), num_rand_numbers)
print(random_indices)

In [None]:
# Show the similarity rows for the randomly drawn entries (label-based lookup via .loc).
all_cosine_similarities_leukemia.loc[random_indices]

In [None]:
def embeddings_similar_words(cosine_similarities_df, n, embeddings_df=None):
    """Return the embedding rows for the first ``n`` words of a similarity table.

    Parameters
    ----------
    cosine_similarities_df : pandas.DataFrame
        Table whose first column (by position) holds the words to look up.
    n : int
        Number of leading rows to take from ``cosine_similarities_df``.
        Values below zero are treated as zero.
    embeddings_df : pandas.DataFrame, optional
        Embeddings table indexed by word. When omitted, the notebook-level
        ``df`` is used, which keeps existing two-argument calls working.

    Returns
    -------
    pandas.DataFrame
        The rows of the embeddings table for the selected words, in the same
        order as they appear in ``cosine_similarities_df``.

    Raises
    ------
    IndexError
        If ``n`` is larger than the number of rows in ``cosine_similarities_df``.
    KeyError
        If a selected word is missing from the embeddings index.
    """
    if embeddings_df is None:
        # Fall back to the global embeddings table used by the rest of the notebook.
        embeddings_df = df

    count = max(n, 0)
    if count > len(cosine_similarities_df):
        # Slicing would silently truncate; keep the out-of-bounds error explicit.
        raise IndexError(
            "n=%d exceeds the %d rows available" % (count, len(cosine_similarities_df))
        )

    # Purely positional selection of the first column; avoids integer-key access
    # on a row Series, which pandas only treats as positional via a deprecated fallback.
    words = cosine_similarities_df.iloc[:count, 0].tolist()
    print(words)
    return embeddings_df.loc[words, :]

In [None]:
# Stack the leading leukemia rows (labels 0 through 10; .loc slicing includes the
# end label) on top of the randomly drawn rows.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
leukemia_most_similar_and_random_words = pd.concat(
    [
        all_cosine_similarities_leukemia.loc[0:10],
        all_cosine_similarities_leukemia.loc[random_indices],
    ]
)


In [None]:
# Row count of the combined table (top leukemia rows plus random rows).
n = leukemia_most_similar_and_random_words.shape[0]
print(n)
# Fetch the embedding vector of every word in the combined table.
df_similar_words_leukemia_random = embeddings_similar_words(
    leukemia_most_similar_and_random_words,
    n=n,
)

In [None]:
# Point colours, one per plotted row: the first row in red, the next ten in blue,
# and the final ten in magenta.
cvec = ["red"] + ["blue"] * 10 + ["magenta"] * 10

In [None]:
# Project the selected embedding vectors onto their first three principal components.
pca = PCA(n_components=3)
pca_result = pca.fit_transform(df_similar_words_leukemia_random.values)

# Store the coordinates under their own name. Rebinding `df` here would replace the
# embeddings table that `embeddings_similar_words` reads, so re-running earlier
# cells would then fail.
pca_leukemia_random_df = pd.DataFrame(
    pca_result,
    columns=['pca-one', 'pca-two', 'pca-three'],
)
print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

In [None]:
from mpl_toolkits.mplot3d import axes3d, Axes3D

# Canvas with a single 3D axes.
fig = plt.figure(figsize=(10, 7))
ax = plt.axes(projection="3d")

# One point per row of the PCA result, coloured according to `cvec`.
ax.scatter3D(*pca_result[:, :3].T, color=cvec, alpha=0.9)

# Each axis label reports the percentage of variance explained by its component.
explained_pct = pca.explained_variance_ratio_ * 100
ax.set_xlabel('PC 1 (%.2f%%)' % explained_pct[0])
ax.set_ylabel('PC 2 (%.2f%%)' % explained_pct[1])
ax.set_zlabel('PC 3 (%.2f%%)' % explained_pct[2])

plt.show()

In [None]:
# 2D view of the same points using only the first two principal components.
plt.figure(figsize=(8, 8))
plt.scatter(*pca_result[:, :2].T, c=cvec, alpha=0.9)

# Axis labels carry the percentage of variance explained by each component.
plt.xlabel('PC 1 (%.2f%%)' % (100 * pca.explained_variance_ratio_[0]))
plt.ylabel('PC 2 (%.2f%%)' % (100 * pca.explained_variance_ratio_[1]))

## Get the words with highest cosine similarities to ovarian neoplasms and plot them along with leukemia and its top words with highest cos similarity

In [None]:
target_word = "meshd010051" #ovarian neoplasms
# Request 51 rows from the similarity ranking for the target word.
# NOTE(review): only the rows labelled 0 through 10 are used by the cell that combines
# this table with the leukemia rows — confirm whether 51 is still needed.
n = 51
top_cosine_similarities_ovarian_cancer = w2c.n_most_similar_words_df(target_word=target_word, n=n)

In [None]:
# Stack the leading leukemia rows on top of the leading ovarian-neoplasm rows
# (labels 0 through 10 of each table; .loc slicing includes the end label).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
leukemeia_ovarian_cancer = pd.concat(
    [
        all_cosine_similarities_leukemia.loc[0:10],
        top_cosine_similarities_ovarian_cancer.loc[0:10],
    ]
)
leukemeia_ovarian_cancer

In [None]:
# Load the embeddings again so that `df` refers to the full embeddings table
# before word vectors are looked up in the cells below.
data_directory = 'data/embeddings_final'
if not os.path.isdir(data_directory):
    raise FileNotFoundError("Could not find data directory")

# Resolve both input files relative to the data directory in one step.
embedding_file, words_file = (
    os.path.join(data_directory, file_name)
    for file_name in ("embedding_SG_dim100_upto2020.npy", "words_SG_upto2020.txt")
)

w2c = Wordvec2Cosine(words=words_file, embeddings=embedding_file)
df = w2c.get_embeddings()
df.head()

In [None]:
# Row count of the combined leukemia / ovarian-neoplasm table.
n = leukemeia_ovarian_cancer.shape[0]
print(n)
# Fetch the embedding vector of every word in the combined table.
df_similar_words_leukemia_ovarian_cancer = embeddings_similar_words(
    leukemeia_ovarian_cancer,
    n=n,
)

In [None]:
# Point colours, one per plotted row: red then ten blue for the first table,
# purple then ten green for the second table.
cvec = ["red"] + ["blue"] * 10 + ["purple"] + ["green"] * 10

In [None]:
# Project the selected embedding vectors onto their first three principal components.
pca = PCA(n_components=3)
pca_result = pca.fit_transform(df_similar_words_leukemia_ovarian_cancer.values)

# Store the coordinates under their own name. Rebinding `df` here would replace the
# embeddings table that `embeddings_similar_words` reads, so re-running earlier
# cells would then fail.
pca_leukemia_ovarian_df = pd.DataFrame(
    pca_result,
    columns=['pca-one', 'pca-two', 'pca-three'],
)
print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

In [None]:
from mpl_toolkits.mplot3d import axes3d, Axes3D

# Canvas with a single 3D axes.
fig = plt.figure(figsize=(10, 7))
ax = plt.axes(projection="3d")

# One point per row of the PCA result, coloured according to `cvec`.
ax.scatter3D(*pca_result[:, :3].T, color=cvec, alpha=0.9)

# Each axis label reports the percentage of variance explained by its component.
explained_pct = pca.explained_variance_ratio_ * 100
ax.set_xlabel('PC 1 (%.2f%%)' % explained_pct[0])
ax.set_ylabel('PC 2 (%.2f%%)' % explained_pct[1])
ax.set_zlabel('PC 3 (%.2f%%)' % explained_pct[2])

plt.show()

In [None]:
# 2D view of the same points using only the first two principal components.
plt.figure(figsize=(8, 8))
plt.scatter(*pca_result[:, :2].T, c=cvec, alpha=0.9)

# Axis labels carry the percentage of variance explained by each component.
plt.xlabel('PC 1 (%.2f%%)' % (100 * pca.explained_variance_ratio_[0]))
plt.ylabel('PC 2 (%.2f%%)' % (100 * pca.explained_variance_ratio_[1]))