# Analogies
This notebook investigates analogies between word vectors.
As input, we use a file with embeddings generated by [embiggen](https://pypi.org/project/embiggen/), together
with a file containing the corresponding word labels.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import seaborn as sns
from scipy.spatial.distance import cosine
from collections import defaultdict
from sklearn.cluster import DBSCAN


The following code allows us to import the ``kcet`` module from the local repository.

In [None]:
import os
import sys
# Put the parent of the current working directory at the front of the module
# search path; this must happen before the ``kcet`` import on the next line.
sys.path.insert(0, os.path.abspath('..'))
from kcet import Wordvec2Cosine

The constructor of ``Wordvec2Cosine`` loads the word embeddings and words into a pandas dataframe.

In [None]:
# Location of the embedding file and the matching word-label file.
data_directory = 'data/embeddings_final'

# Stop early if the data directory is not present.
if not os.path.isdir(data_directory):
    raise FileNotFoundError("Could not find data directory")

embedding_file = os.path.join(
    data_directory, "embedding_SG_dim100_upto2020.npy"
)
words_file = os.path.join(
    data_directory, "words_SG_upto2020.txt"
)

# Load embeddings and words, then fetch them as a pandas data frame.
w2c = Wordvec2Cosine(words=words_file, embeddings=embedding_file)
df_pubmed_words = w2c.get_embeddings()

# Preview the first rows.
df_pubmed_words.head()

## Embeddings of six words: "tamoxifen", "breast cancer", "bleomycin", "uterine cervical cancer", "melphalan", "ovarian cancer"

In [None]:
# Concept identifiers of interest and their names:
#   meshd013629      tamoxifen
#   meshd001943      breast cancer
#   meshd001761      bleomycin
#   meshd002583      uterine cervical cancer
#   meshc582435      pembrolizumab
#   meshd016889      endometrial cancer
#   meshd008558      melphalan
#   meshd010051      ovarian cancer
#   meshd005185      fallopian tube cancer
#   meshd000068258   bevacizumab
#   meshd016190      carboplatin
#   meshd002945      cisplatin
#
# Alternative selections that can be swapped in for ``selected_concepts``:
#   ["meshc582435", "meshd016889", "meshd001761", "meshd002583", "meshd008558", "meshd010051"]
#   ["meshd013629", "meshd001943", "meshd001761", "meshd002583", "meshc582435", "meshd016889"]
#   ["meshd013629", "meshd001943", "meshd001761", "meshd002583"]
selected_concepts = [
    "meshd013629",  # tamoxifen
    "meshd001943",  # breast cancer
    "meshd001761",  # bleomycin
    "meshd002583",  # uterine cervical cancer
    "meshd008558",  # melphalan
    "meshd010051",  # ovarian cancer
]

# Pull the embedding rows for the selected concepts, in the order listed.
df_words = df_pubmed_words.loc[selected_concepts]

In [None]:
# Display the embedding rows selected in the previous cell.
df_words

# PCA visualization

In [None]:
# Marker colours used by the scatter plots; the list holds six entries.
cvec = [
    "red",
    "orange",
    "blue",
    "green",
    "cyan",
    "grey",
]

In [None]:
# Project the selected embeddings onto their first three principal components.
pca = PCA(n_components=3)
pca_result = pca.fit_transform(df_words.values)

# Store the projected coordinates in a data frame, one column per component.
df = pd.DataFrame({
    'pca-one': pca_result[:, 0],
    'pca-two': pca_result[:, 1],
    'pca-three': pca_result[:, 2],
})

# Report the share of variance captured by each component.
print(f'Explained variation per principal component: {pca.explained_variance_ratio_}')

In [None]:
# Scatter plot of the selected words on the first two principal components.

# Recycle the palette to the number of plotted points so that the cell keeps
# working when the word selection has a different length than ``cvec``
# (a mismatched colour list makes ``scatter`` raise).
n_points = pca_result.shape[0]
point_colors = [cvec[i % len(cvec)] for i in range(n_points)]

fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(pca_result[:, 0], pca_result[:, 1], c=point_colors, alpha=0.9)

# Label each point with its row identifier from ``df_words``; without labels
# the dots cannot be matched to words.
for label, (x, y) in zip(df_words.index, pca_result[:, :2]):
    ax.annotate(str(label), (x, y), xytext=(5, 5), textcoords='offset points')

ax.set_xlabel('PC 1 (%.2f%%)' % (pca.explained_variance_ratio_[0]*100))
ax.set_ylabel('PC 2 (%.2f%%)' % (pca.explained_variance_ratio_[1]*100))
ax.set_title('PCA of selected word embeddings')

# Render the figure explicitly instead of echoing the last call's repr.
plt.show()


In [None]:

# 3D scatter plot of the selected words on the first three principal components.

# Recycle the palette to the number of plotted points so that the cell keeps
# working when the word selection has a different length than ``cvec``.
n_points = pca_result.shape[0]
point_colors = [cvec[i % len(cvec)] for i in range(n_points)]

# Create the figure with a single 3D axes.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(projection="3d")

# Draw the points.
ax.scatter3D(
    pca_result[:, 0],
    pca_result[:, 1],
    pca_result[:, 2],
    color=point_colors,
    alpha=0.9,
)

# Label each point with its row identifier from ``df_words``; without labels
# the dots cannot be matched to words.
for label, (x, y, z) in zip(df_words.index, pca_result[:, :3]):
    ax.text(x, y, z, str(label))

ax.set_title("3D scatter plot")
ax.set_xlabel('PC 1 (%.2f%%)' % (pca.explained_variance_ratio_[0]*100))
ax.set_ylabel('PC 2 (%.2f%%)' % (pca.explained_variance_ratio_[1]*100))
ax.set_zlabel('PC 3 (%.2f%%)' % (pca.explained_variance_ratio_[2]*100))

# Show the plot.
plt.show()