<a href="https://colab.research.google.com/github/violetcodes/algo-refresher/blob/dev/compare_graph_embs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
# !mkdir data
# # only used
# #https://gitlab.qdatalabs.com/research_and_development/healthcare_bot/blob/simon/src/resources/diseases_data_lower.json
# !cp diseases_data_lower.json data/

In [None]:
# !pip install node2vec

In [None]:
import json, os, time, plotly
from tqdm import tqdm
import numpy as np
import tensorflow_hub as tfhub
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import networkx as nx
from node2vec import Node2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

def load_json(fpath):
    """Parse the JSON file at ``fpath`` and return the decoded object."""
    with open(fpath, 'r') as handle:
        parsed = json.load(handle)
    return parsed

In [None]:
# Directory (inside the Colab runtime) holding the input JSON and all generated artefacts.
datadir = '/content/data/'
all_data = load_json(os.path.join(datadir, 'diseases_data_lower.json'))

# disease name -> definition text
all_d = {i: j['definition'] for i, j in all_data.items()}
# symptom name -> definition text; a symptom listed under several diseases
# collapses to a single entry (the last one encountered wins).
all_s = {i: j['definition'] for k in all_data.values() for i, j in k['symptoms'].items()}
# Weighted disease-symptom edges: (disease + '_d', symptom + '_s', weight).
# The last character of the 'percentage' string is dropped (presumably a '%'
# sign - verify against the data) and the integer is scaled to a fraction.
sdnet = [(i+'_d', j+'_s', int(k['percentage'][:-1])/100) for i, info in all_data.items() for j, k in info['symptoms'].items()]


# Summary of the three structures. The second line reports all_s (it used to
# repeat the all_d label and the all_d examples).
print(f'len of all_d: {len(all_d)}\nfew examples: {json.dumps(dict(list(all_d.items())[:5]), indent=2)}\n\n'+ \
      f'len of all_s: {len(all_s)}\nfew examples: {json.dumps(dict(list(all_s.items())[:5]), indent=2)}\n\n'+ \
      f'len of sdnet (unique): {len(sdnet), len(set(sdnet))}\nfew examples: {json.dumps(sdnet[:5], indent=2)}')

len of all_d: 801
few examples: {
  "intracerebral hemorrhage": "a cerebral hemorrhage or haemorrhage (or intracerebral hemorrhage, ich) is a subtype of intracranial hemorrhage that occurs within the brain tissue itself. intracerebral hemorrhage can be caused by brain trauma, or it can occur spontaneously in hemorrhagic stroke. non-traumatic intracerebral hemorrhage is a spontaneous bleeding into the brain tissue.",
  "intracranial hemorrhage": "an intracranial hemorrhage (ich) is a hemorrhage, or bleeding, within the skull.",
  "joint effusion": "a joint effusion is the presence of increased intra-articular fluid. it may affect any joint. commonly it involves the knee.",
  "iridocyclitis": "uveitis is, broadly, inflammation of the uvea. the uvea consists of the middle, pigmented, vascular structures of the eye and includes the iris, ciliary body, and choroid. uveitis requires an urgent referral and thorough examination by an ophthalmologist or optometrist\u2014and urgent treatment to 

In [None]:
def get_graph_with_text(
    all_d,
    all_s,
    sdnet,
    threshold = 0.5
    ):
  """Extend the disease-symptom edge list with entity-to-term TF-IDF edges.

  A TF-IDF matrix is fitted over the definition texts of all diseases and
  symptoms. Every (entity, term) cell whose score is strictly greater than
  ``threshold`` becomes an extra edge ``(entity_name, term + '_TF', score)``
  with the score rounded to 3 decimals.

  Args:
    all_d: mapping of disease name -> definition text.
    all_s: mapping of symptom name -> definition text.
    sdnet: list of existing ``(source, target, weight)`` edges.
    threshold: minimum (exclusive) TF-IDF score for a term edge.

  Returns:
    A new list: ``sdnet`` followed by the TF-IDF edges. ``sdnet`` itself is
    not modified.
  """
  all_ents = {i.replace(' ', '_')+'_d': j for i, j in all_d.items()}
  all_ents.update({i.replace(' ', '_')+'_s':j for i, j in all_s.items()})

  vect = TfidfVectorizer(stop_words='english',encoding='utf-8',decode_error='ignore')
  M = vect.fit_transform(all_ents.values())
  # get_feature_names() was removed from recent scikit-learn releases; prefer
  # the replacement and fall back only on old versions that lack it.
  if hasattr(vect, 'get_feature_names_out'):
    feat_names = vect.get_feature_names_out()
  else:
    feat_names = vect.get_feature_names()

  # Build the row-index -> node-name lookup once instead of on every iteration.
  # Underscores in the entity part are turned back into spaces and the '_d' /
  # '_s' suffix is kept, mirroring the "name_d" / "name_s" form used in sdnet.
  node_names = []
  for node in all_ents.keys():
    head, tail = node.rsplit('_', 1)
    node_names.append(head.replace('_', ' ') + '_' + tail)

  # Row/column indices of all cells above the threshold.
  rows, cols = (M > threshold).nonzero()

  sdnet_tf = []
  for i, j in zip(rows, cols):
    term = feat_names[j] + '_TF'
    sdnet_tf.append((node_names[i], term, round(float(M[i, j]), 3)))

  return sdnet + sdnet_tf

# Build the text-augmented edge list and preview both ends of it: the head
# holds the original sdnet edges, the tail holds the appended '_TF' edges.
sdnet_tf = get_graph_with_text(all_d, all_s, sdnet, threshold=0.5)
head_preview = json.dumps(sdnet_tf[:5], indent=2)
tail_preview = json.dumps(sdnet_tf[-5:], indent=2)
print(f'len of sdnet (unique): {len(sdnet_tf)}\nfew examples: {head_preview}\nlast few: {tail_preview}')

len of sdnet (unique): 9623
few examples: [
  [
    "intracerebral hemorrhage_d",
    "headache_s",
    0.63
  ],
  [
    "intracerebral hemorrhage_d",
    "dizziness_s",
    0.57
  ],
  [
    "intracerebral hemorrhage_d",
    "weakness_s",
    0.53
  ],
  [
    "intracerebral hemorrhage_d",
    "focal weakness_s",
    0.5
  ],
  [
    "intracerebral hemorrhage_d",
    "nausea_s",
    0.44
  ]
]
last few: [
  [
    "throat irritation_s",
    "throat_TF",
    0.59
  ],
  [
    "hip lump or mass_s",
    "hip_TF",
    0.753
  ],
  [
    "disturbance of smell or taste_s",
    "taste_TF",
    0.635
  ],
  [
    "wrist lump or mass_s",
    "wrist_TF",
    0.737
  ],
  [
    "hip swelling_s",
    "hip_TF",
    0.536
  ]
]


In [None]:
# Universal Sentence Encoder (USE) loaded from TF-Hub; used to embed definition texts.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = tfhub.load(module_url)


def embed(list_of_sentences):
    """Run the loaded TF-Hub model on ``list_of_sentences`` and return its output."""
    encoded = model(list_of_sentences)
    return encoded

def get_use_embedding(dt, batch_size=16):
    """Embed every value of ``dt`` with ``embed`` and return them keyed like ``dt``.

    Args:
        dt: mapping of name -> sentence/text to embed.
        batch_size: number of sentences sent to ``embed`` per call
            (defaults to 16, the previously hard-coded value).

    Returns:
        dict mapping each key of ``dt`` to the ``.numpy()`` value of its
        embedding row.

    Raises:
        ValueError: if ``batch_size`` is not a positive integer.
        AssertionError: if the number of embeddings returned does not match
            the number of input sentences.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')

    keys = list(dt.keys())
    sentences = list(dt.values())
    embs = []
    # The range drives the batching; no separate manual counter is needed.
    for start in tqdm(range(0, len(sentences), batch_size), 'getting embedding...'):
        # extend() iterates the batch result, adding one item per sentence.
        embs.extend(embed(sentences[start:start + batch_size]))
    assert len(embs) == len(sentences)
    return {key: vec.numpy() for key, vec in zip(keys, embs)}

# get deepwalk, node2vec graph, deepwalk_with_text embeddings with different settings
def get_graph_emb(graph, p=1, q=1, algo='node2vec', lm=0.5):
    """Learn 64-dimensional node embeddings for ``graph`` with node2vec.

    Args:
        graph: weighted networkx graph (edge attribute ``weight``).
        p: node2vec return parameter.
        q: node2vec in-out parameter.
        algo: ``'deepwalk'`` forces ``p = q = 1`` and scales the weight of
            every edge touching a node whose name ends with ``'TF'`` by
            ``lm``; any other value runs node2vec with the given ``p``/``q``.
        lm: multiplier applied to text ('TF') edge weights in deepwalk mode.

    Returns:
        dict mapping each node of ``graph`` to its embedding vector.
    """
    if algo=='deepwalk':
        # Work on a copy so the caller's edge weights are not rescaled in
        # place (repeated calls would otherwise compound the scaling).
        graph = graph.copy()
        for i, j in graph.edges:
            if i.endswith('TF') or j.endswith('TF'):
                graph[i][j]['weight'] *= lm
        # with p = q = 1 the walk is unbiased, i.e. the same as deepwalk
        p=q=1
    # p and q must be forwarded explicitly; otherwise Node2Vec silently uses
    # its own defaults and the node2vec setting is identical to deepwalk.
    node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=100,
                        p=p, q=q, workers=8)
    model = node2vec.fit(window=10, min_count=1, batch_words=4)
    # node2vec stores node names as strings in the word-vector vocabulary.
    return {i: model.wv[str(i)] for i in graph.nodes}

def save_emb(filename, sym, dis):
    """Write symptom and disease embeddings to ``filename`` as one JSON object.

    The object has the keys ``sym`` and ``dis`` (in that order); each maps a
    name to its vector converted with ``.tolist()``.
    """
    with open(filename, 'w') as out:
        payload = {
            'sym': {name: vec.tolist() for name, vec in sym.items()},
            'dis': {name: vec.tolist() for name, vec in dis.items()},
        }
        json.dump(payload, out)





In [None]:
# plotting TSNE
def get_tsne(emb_dicts):
    """Project several embedding dicts into one shared t-SNE space.

    All vectors from every dict in ``emb_dicts`` are stacked and fitted with a
    single ``TSNE()`` model; the projected rows are then cut back into one
    dict per input, keyed like the corresponding input dict.
    """
    stacked = np.array([vec for emb in emb_dicts for vec in emb.values()])
    projected = TSNE().fit_transform(stacked)

    # Cumulative sizes give the row index where each dict's block ends; the
    # final total is dropped because np.split wants interior cut points only.
    boundaries = np.cumsum([len(emb) for emb in emb_dicts])[:-1].tolist()
    chunks = np.split(projected, boundaries)

    return [dict(zip(emb.keys(), chunk)) for emb, chunk in zip(emb_dicts, chunks)]

def get_tsne_trace(
    tsne_dict,
    color='blue',
    size=10,
    name='trace_name',):
    """Build a marker-only scatter trace from a dict of name -> (x, y) points.

    The dict keys become the hover text of the points; ``color`` and ``size``
    style the markers and ``name`` is the trace name.
    """
    labels = list(tsne_dict.keys())
    xs, ys = [], []
    for x_val, y_val in tsne_dict.values():
        xs.append(x_val)
        ys.append(y_val)

    marker_style = dict(color=color, size=size)
    return go.Scatter(
        x=xs,
        y=ys,
        mode='markers',
        marker=marker_style,
        hoverinfo='text',
        name=name,
        marker_line_width=2,
        text=labels,
    )


def get_plotly(
    tsnes,
    names=None,
    save_name='jc_plotly.html',
    datadir='',
    title='',
    colors=None,
    from_embs=False
):
    """Plot one scatter trace per t-SNE dict, save it as HTML and return the figure.

    Args:
        tsnes: list of dicts mapping name -> (x, y); or, when ``from_embs`` is
            true, a list of dicts mapping name -> embedding vector.
        names: trace names, one per dict. Missing names default to ``''``.
        save_name: file name of the HTML output.
        datadir: directory the HTML file is written to.
        title: figure title (placed at the bottom centre).
        colors: marker colours, one per trace. If there are fewer colours than
            traces the colours are reused cyclically.
        from_embs: if true, ``tsnes`` holds raw embeddings that are first
            projected with ``get_tsne``.

    Returns:
        The plotly ``Figure``. As a side effect the figure is written as an
        HTML ``div`` (preceded by a creation-time comment and a CDN script tag
        for plotly.js) to ``os.path.join(datadir, save_name)``.
    """
    if from_embs:
        # We are given a list of dicts of embeddings instead of t-SNE points.
        # All embeddings should be on the same scale and normalized, since a
        # single t-SNE model is fitted on all of them together.
        tsnes = get_tsne(tsnes)

    palette = colors or ['red', 'blue', 'yellow', 'cyan', 'pink', 'white']
    # Pad names so that every trace has one (a short list used to raise IndexError).
    labels = list(names) if names else []
    labels += [''] * (len(tsnes) - len(labels))

    traces = [
        # Cycle through the palette so more traces than colours does not fail.
        get_tsne_trace(tsne_dict, color=palette[i % len(palette)], name=labels[i])
        for i, tsne_dict in enumerate(tsnes)
    ]

    fig = go.Figure(data=traces)
    fig.layout = go.Layout(
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        title=dict(
            text=title,
            x=0.50,
            y=0.01,
            xanchor='center',
            yanchor='bottom'),
        margin=dict(t=0, b=0, l=0, r=0),
        legend=dict(x=0, y=0),
        legend_orientation='h',
        hoverlabel=dict(
            font_size=26,
            font_family='roboto',
        ),
        )

    time_ = time.strftime('%m-%d %H:%M')
    # plotly.js is left out of the div and pulled in through the script tag below.
    plotly_html = plotly.offline.plot(fig, include_plotlyjs=False, output_type='div')
    top_append = f'<!--created on {time_}-->\n' + \
    '<script src="https://cdn.plot.ly/plotly-latest.min.js"></script> \n'
    with open(os.path.join(datadir, save_name), 'w') as f:
        f.write(top_append)
        f.write(plotly_html)
    return fig

In [None]:
def use_embedding_tsne_plot():
    """Embed symptom and disease definitions with USE and plot them in one t-SNE view.

    Reads the notebook-level ``all_s`` and ``all_d`` definition dictionaries
    and ``datadir``; the HTML plot is saved there and the figure is returned.
    """
    # symptoms first, then diseases - the order must line up with `names` and `colors`
    embeddings = [get_use_embedding(all_s), get_use_embedding(all_d)]
    return get_plotly(
        tsnes=embeddings,
        names=['symptoms', 'disease'],
        colors=['cyan', 'limegreen'],
        save_name='use_embedding_tsne.html',
        datadir=datadir,  # if datadir is not defined, replace it with ''
        title='TSNE plot of embeddings of Symptoms and Diseases using USE on their definitions',
        from_embs=True,
    )

In [None]:
def graph_embedding_tsne_plot(
    edgelist,
    algo='deepwalk',
    p=1,
    q=1,
    lm=0,
    from_file=False,
    filename=None
):
    """Compute (or load) graph embeddings and plot symptoms vs. diseases with t-SNE.

    Args:
        edgelist: iterable of ``(source, target, weight)`` edges; nodes ending
            in ``'_s'`` are treated as symptoms and ``'_d'`` as diseases.
        algo, p, q, lm: forwarded to ``get_graph_emb``; they also determine
            the names of the JSON cache and of the HTML output.
        from_file: if true, skip training and load embeddings from a JSON
            cache written by ``save_emb``.
        filename: cache file to load when ``from_file`` is true. If omitted,
            the default cache path for the given ``algo``/``p``/``q``/``lm``
            is used. Ignored when ``from_file`` is false.

    Returns:
        The plotly figure produced by ``get_plotly``.
    """
    # Default cache location; also where freshly trained embeddings are saved.
    default_path = os.path.join(datadir, f'emb_{algo}{p, q, lm}.json')

    if not from_file:
        graph = nx.Graph()
        graph.add_weighted_edges_from(edgelist)
        # NOTE(review): lm defaults to 0 here but to 0.5 in get_graph_emb; in
        # deepwalk mode lm=0 zeroes the weight of every 'TF' edge - confirm intended.
        g_emb = get_graph_emb(graph, p=p, q=q, algo=algo, lm=lm)
        # Strip the two-character '_s' / '_d' suffix to recover the plain names.
        sym = {i[:-2]: j for i, j in g_emb.items() if i.endswith('_s')}
        dis = {i[:-2]: j for i, j in g_emb.items() if i.endswith('_d')}

        save_emb(default_path, sym, dis)

    else:
        # Fall back to the default cache path instead of failing on open(None),
        # and read by key rather than relying on the key order in the file.
        saved = load_json(filename if filename is not None else default_path)
        sym, dis = saved['sym'], saved['dis']

    print('Generating fig1')
    fig1 = get_plotly(
        [sym, dis],
        names=['symptoms', 'disease'],
        colors=['cyan', 'limegreen'],
        save_name=f'graph_embedding_{algo}{p, q, lm}_tsne.html',
        datadir=datadir, # if datadir is not defined, replace it with ''
        title=f'TSNE plot of embeddings of Symptoms and Diseases using graph embedding ({algo})',
        from_embs=True
    )
    return fig1

In [None]:
# Training runs (slow). Uncomment to (re)generate the embeddings, cache files and plots.
# fig = use_embedding_tsne_plot()
# fig2 = graph_embedding_tsne_plot(sdnet)
# fig3 = graph_embedding_tsne_plot(sdnet, 'node2vec', p=2, q=0.75)
# fig4 = graph_embedding_tsne_plot(sdnet_tf, lm=0.5)

# Cache files written by the runs above. They are built from `datadir` so they
# always point at the directory graph_embedding_tsne_plot saves into.
path2 = os.path.join(datadir, 'emb_deepwalk(1, 1, 0).json')
path3 = os.path.join(datadir, 'emb_node2vec(2, 0.75, 0).json')
path4 = os.path.join(datadir, 'emb_deepwalk(1, 1, 0.5).json')

# Re-plot from the cached embeddings without retraining.
# fig5 = graph_embedding_tsne_plot(sdnet, from_file=True, filename=path2)
# fig6 = graph_embedding_tsne_plot(sdnet, 'node2vec', p=2, q=0.75, from_file=True, filename=path3)
# fig7 = graph_embedding_tsne_plot(sdnet_tf, lm=0.5, from_file=True, filename=path4)

In [None]:
# Push to GCS: authenticate the Colab user first. This import only resolves
# inside a Google Colab runtime, so the cell is Colab-specific.
from google.colab import auth
auth.authenticate_user()



In [None]:
!gsutil -m cp -r data gs://healthcare_rd/h-bot/graph_emb_htmls

Copying file://data/graph_embedding_node2vec(2, 0.75, 0)_tsne.html [Content-Type=text/html]...
/ [0/9 files][    0.0 B/  8.0 MiB]   0% Done                                    Copying file://data/emb_deepwalk(1, 1, 0.5).json [Content-Type=application/json]...
/ [0/9 files][    0.0 B/  8.0 MiB]   0% Done                                    Copying file://data/graph_embedding_deepwalk(1, 1, 0)_tsne.html [Content-Type=text/html]...
/ [0/9 files][    0.0 B/  8.0 MiB]   0% Done                                    Copying file://data/use_embedding_tsne.html [Content-Type=text/html]...
/ [0/9 files][    0.0 B/  8.0 MiB]   0% Done                                    Copying file://data/graph_embedding_deepwalk(1, 1, 0.5)_tsne.html [Content-Type=text/html]...
/ [0/9 files][    0.0 B/  8.0 MiB]   0% Done                                    Copying file://data/diseases_data_lower.json [Content-Type=application/json]...
/ [0/9 files][    0.0 B/  8.0 MiB]   0% Done                                  

In [None]:
!gsutil ls gs://healthcare_rd/h-bot/graph_emb_htmls

gs://healthcare_rd/h-bot/graph_emb_htmls/diseases_data_lower.json
gs://healthcare_rd/h-bot/graph_emb_htmls/emb_deepwalk(1, 1, 0).json
gs://healthcare_rd/h-bot/graph_emb_htmls/emb_deepwalk(1, 1, 0.5).json
gs://healthcare_rd/h-bot/graph_emb_htmls/emb_node2vec(2, 0.75, 0).json
gs://healthcare_rd/h-bot/graph_emb_htmls/graph_embedding_deepwalk(1, 1, 0)_tsne.html
gs://healthcare_rd/h-bot/graph_emb_htmls/graph_embedding_deepwalk(1, 1, 0.5)_tsne.html
gs://healthcare_rd/h-bot/graph_emb_htmls/graph_embedding_node2vec(2, 0.75, 0)_tsne.html
gs://healthcare_rd/h-bot/graph_emb_htmls/use_embedding_tsne.html
gs://healthcare_rd/h-bot/graph_emb_htmls/use_embedding_tsne2.html


In [None]:
# Display whichever figures were actually created in this session. The cells
# that build fig ... fig7 are commented out by default, so referencing the
# names directly raises NameError on a fresh kernel.
figure_names = ['fig', 'fig2', 'fig3', 'fig4', 'fig5', 'fig6', 'fig7']
available_figs = [globals()[name] for name in figure_names if name in globals()]
missing_figs = [name for name in figure_names if name not in globals()]
if missing_figs:
  print(f'skipping figures that were not generated: {missing_figs}')
for i in available_figs:
  i.show()