# Install libraries for semantic search

In [None]:
!pip install pyvis;

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install -U sentence-transformers;

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Imports

In [None]:
# SENTENCE EMBEDDINGS

# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer('all-MiniLM-L6-v2')

# #Our sentences we like to encode
# sentences = ['This framework generates embeddings for each input sentence',
#     'Sentences are passed as a list of string.',
#     'The quick brown fox jumps over the lazy dog.']

# #Sentences are encoded by calling model.encode()
# embeddings = model.encode(sentences)

# #Print the embeddings
# for sentence, embedding in zip(sentences, embeddings):
#     print("Sentence:", sentence)
#     print("Embedding:", embedding)
#     print("")


# SENTENCES SIMILARITIES

# from sentence_transformers import SentenceTransformer, util
# model = SentenceTransformer('all-MiniLM-L6-v2')

# #Sentences are encoded by calling model.encode()
# emb1 = model.encode("This is a red cat with a hat.")
# emb2 = model.encode("Have you seen my red cat?")

# cos_sim = util.cos_sim(emb1, emb2)
# print("Cosine-Similarity:", cos_sim)


# from sentence_transformers import SentenceTransformer, util
# model = SentenceTransformer('all-MiniLM-L6-v2')

# sentences = ['A man is eating food.',
#           'A man is eating a piece of bread.',
#           'The girl is carrying a baby.',
#           'A man is riding a horse.',
#           'A woman is playing violin.',
#           'Two men pushed carts through the woods.',
#           'A man is riding a white horse on an enclosed ground.',
#           'A monkey is playing drums.',
#           'Someone in a gorilla costume is playing a set of drums.'
#           ]

# #Encode all sentences
# embeddings = model.encode(sentences)

# #Compute cosine similarity between all pairs
# cos_sim = util.cos_sim(embeddings, embeddings)

# #Add all pairs to a list with their cosine similarity score
# all_sentence_combinations = []
# for i in range(len(cos_sim)-1):
#     for j in range(i+1, len(cos_sim)):
#         all_sentence_combinations.append([cos_sim[i][j], i, j])

# #Sort list by the highest cosine similarity score
# all_sentence_combinations = sorted(all_sentence_combinations, key=lambda x: x[0], reverse=True)

# print("Top-5 most similar pairs:")
# for score, i, j in all_sentence_combinations[0:5]:
#     print("{} \t {} \t {:.4f}".format(sentences[i], sentences[j], cos_sim[i][j]))

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from pyvis.network import Network
import torch

# Load Data (from drive)

In [None]:
# Load the cleaned CSV from the current working directory.
# Alternative location that was used with Google Drive mounted:
# df = pd.read_csv("./content/drive/MyDrive/Saruma/data/COVID19-web.csv")
df = pd.read_csv(filepath_or_buffer='COVID19_web_CLEAN.csv')

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

# Look for best feature for searching

In [None]:
# Number of rows whose 'SCIENTIFIC TITLE' is missing.
int(df['SCIENTIFIC TITLE'].isna().sum())

0

In [None]:
# Number of rows whose 'PUBLIC TITLE' is missing.
int(df['PUBLIC TITLE'].isna().sum())

0

In [None]:
# Number of rows whose 'PRIMARY OUTCOME' is missing.
int(df['PRIMARY OUTCOME'].isna().sum())

0

# Semantic Search

Reference: [Semantic Search Lib](https://sbert.net/docs/installation.html)


## General Corpus for searching

In [None]:
# Texts that will be embedded and searched: one scientific title per row.
# `.to_numpy()` is the pandas-recommended replacement for `.values` and always
# yields a NumPy array regardless of the column's dtype.
corpus = df['SCIENTIFIC TITLE'].to_numpy()

## Model and Embedding

In [None]:
# Sentence-embedding model shared by the corpus and by every query in `search`.
model = SentenceTransformer(
    'sentence-transformers/msmarco-MiniLM-L-6-v3'
)

# Embed the whole corpus once up front; `search` reuses these embeddings for
# each query instead of re-encoding the titles.
corpus_embeddings = model.encode(
    corpus,
    convert_to_tensor=True,
)

## Search function

In [None]:
from numpy.ma.core import append

def search(query, answers):
  """Return the corpus entries most similar to ``query``.

  Relies on the notebook-level ``model``, ``corpus`` and ``corpus_embeddings``.

  Args:
    query: Text to search for; it is encoded with ``model``.
    answers: Maximum number of hits wanted. Values above ``len(corpus)`` are
      capped, and values of zero or below yield no hits.

  Returns:
    A pair ``(index, scores)`` of equal-length lists, best match first:
    ``index`` holds integer positions into ``corpus`` and ``scores`` holds the
    corresponding cosine similarities as Python floats.
  """
  # torch.topk needs 0 <= k <= number of candidates, so clamp on both sides.
  top_k = max(0, min(answers, len(corpus)))
  if top_k == 0:
    return [], []

  query_embedding = model.encode(query, convert_to_tensor=True)

  # cos_sim returns a similarity matrix; take the row for our single query,
  # then keep the `top_k` highest-scoring corpus entries.
  cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
  top_results = torch.topk(cos_scores, k=top_k)

  # Convert both result tensors to plain Python lists in one step each.
  return top_results.indices.tolist(), top_results.values.tolist()

# Graph

Reference: [Plotting graphs in Python](https://pyvis.readthedocs.io/en/latest/tutorial.html)

In [None]:
# Example search queries: vaccines, covid, man 40, hypertension
query = 'vaccines'

# Number of hits requested from search() for the query.
primary = 10
# NOTE(review): not referenced by any cell visible here — confirm before removing.
secondary = 3

In [None]:
# Fetch up to `primary` best-matching corpus positions and their scores.
search_index, search_scores = search(query, answers=primary)

In [None]:
# One source entry per hit actually returned: search() yields fewer than
# `primary` results when the corpus is smaller than that.
sources = [query] * len(search_index)
# Look the titles up on the column in one positional access instead of
# building a full row for every hit.
targets = df['SCIENTIFIC TITLE'].iloc[search_index].tolist()

In [None]:
# Show the matched titles, one per line.
for matched_title in targets:
  print(matched_title)

A Norwegian Study of Vaccine Response to COVID-19 Vaccines in Patients Using Immunosuppressive Medication Within Rheumatology and Gastroenterology: the Nor-vaC Study
National Vaccine Adverse Event Reporting Survey to Determine the Etiology of Vaccine-Induced Injury
Human Pilot Test of an Oral Neutralizing Antibody Booster for Post-vaccinated People With COVID19 Vaccine
BCG Vaccine to Reduce Unplanned Absenteeism Due to Illness of Health Care Workers During the COVID-19 Pandemic. A Multi-center Randomised Controlled Trial (BCG-COVID-RCT)
The safety and efficacy of COVID-19 vaccines among lung transplant recipients: Non-randomised controlled cohort study - The safety and efficacy of COVID-19 vaccines among lung transplant recipients                                                                                                                                                                                                                                                                    

The structure for the graph is:

source1 -> target1 <br>
source1 -> target2 <br>
target1 -> target2 <br>

You can also add a specific weight for each of the connections.

In [None]:
got_net = Network(
    height='750px',
    width='100%',
    bgcolor='#222222',
    font_color='white',
    notebook=True,
)

# One edge per search hit, from the query (source) to the matched title
# (target), weighted by the similarity score.
for origin, destination, weight in zip(sources, targets, search_scores):
    # Each node uses its own text as id, label and hover title.
    got_net.add_node(origin, origin, title=origin)
    got_net.add_node(destination, destination, title=destination)
    got_net.add_edge(origin, destination, value=weight)

# Write the graph to an HTML file in the working directory.
# got_net.show('/content/drive/MyDrive/Saruma/data/Search_Graph.html')
got_net.show('Search_Graph.html')

This does not work in Colab; you can download the HTML file that is generated in Drive and open it locally to see what the graph looks like.