In [10]:
# gdown is used by the download cell below to fetch the dataset from a Google Drive link.
import gdown
from google.colab import drive
# Mount the user's Google Drive at /content/drive inside the Colab filesystem.
# NOTE(review): the visible cells only read /content/datafile.json, which is
# downloaded with gdown; nothing shown here reads from /content/drive —
# confirm whether this mount is still required.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# Download the pre-processed dataset file into the user's Colab environment.
path = 'https://drive.google.com/file/d/1fPKeAyTcvc-txkxWy-Px28BGMnSVzKWY/view?usp=share_link'
# The share URL has the form .../file/d/<FILE_ID>/view?...; extract <FILE_ID>.
# Stored as `file_id` (not `id`) so the builtin id() is not shadowed for the
# rest of the kernel session.
file_id = path.split('/d/')[1].split('/')[0]
# Direct-download form of the same file.
link = 'https://drive.google.com/uc?export=download&id=' + file_id
name = 'datafile.json'
# Last expression of the cell: gdown returns the output path, which the notebook displays.
gdown.download(link, name)

Downloading...
From: https://drive.google.com/uc?export=download&id=1fPKeAyTcvc-txkxWy-Px28BGMnSVzKWY
To: /content/datafile.json
100%|██████████| 88.3M/88.3M [00:00<00:00, 187MB/s]


'datafile.json'

In [3]:
!pip install rank_bm25
!pip install streamlit
!pip install pyngrok

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
Collecting streamlit
  Downloading streamlit-1.33.0-py2.py3-none-any.whl (8.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.8.1b0-py2.py3-none-any.whl (4.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
Collecting watchdog>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.0-py3-none-manylinux2014_x86_64.whl (82 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [13]:
# Add the authorisation token needed to access the ngrok service.
# The token is a credential, so it is never written into the notebook: it is
# taken from the NGROK_AUTHTOKEN environment variable when set, otherwise the
# user is prompted for it (input is not echoed and not stored in the output).
# NOTE(review): a token that was ever committed in this cell must be treated
# as leaked and revoked in the ngrok dashboard.
import os
from getpass import getpass

import streamlit as st
from pyngrok import ngrok

ngrok_authtoken = os.environ.get('NGROK_AUTHTOKEN') or getpass('ngrok authtoken: ')
# Persist the token to the ngrok configuration used by pyngrok.
ngrok.set_auth_token(ngrok_authtoken)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [14]:
%%writefile app.py

import streamlit as st
from pyngrok import ngrok
import json
import string
import nltk
import requests
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from rank_bm25 import BM25Okapi
import gensim
from gensim import corpora, models, similarities
from sklearn.feature_extraction.text import CountVectorizer

# Fetch the NLTK resources used by this app.
# 'punkt_tab' is requested alongside 'punkt' because newer NLTK releases load
# the tokenizer data used by word_tokenize from that package; on older
# releases the extra download is simply unused.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab', 'omw-1.4'):
  nltk.download(nltk_resource)

def createIndex():
  """Load the dataset and build every retrieval structure the app uses.

  Returns:
    A 6-tuple ``(lsi_corpus, bow_corpus, lsi_model, dictionary, data, bm25)``:
    the first three come from ``get_LSI_space``, ``dictionary`` and ``data``
    from ``get_data``, and ``bm25`` from ``getBM25``.
  """
  tokenized_corpus, data, dictionary = get_data('/content/datafile.json')
  bm25 = getBM25(tokenized_corpus)
  lsi_corpus, bow_corpus, lsi_model = get_LSI_space(tokenized_corpus)
  return lsi_corpus, bow_corpus, lsi_model, dictionary, data, bm25

def get_data(path):
  """Load the pre-processed corpus from a JSON file and tokenize it.

  Args:
    path: Path to a JSON file containing one object that maps keys (used by
      the app as document IDs) to document text.

  Returns:
    A tuple ``(tokenized_corpus, data, dictionary)`` where ``tokenized_corpus``
    is one token list per document (text split on single spaces, in the
    object's key order), ``data`` is the loaded mapping, and ``dictionary`` is
    a gensim ``corpora.Dictionary`` built from the tokenized corpus.
  """
  # JSON text is UTF-8; being explicit keeps decoding independent of the
  # platform's default locale encoding.
  with open(path, 'r', encoding='utf-8') as file:
    data = json.load(file)

  tokenized_corpus = [doc.split(" ") for doc in data.values()]
  dictionary = corpora.Dictionary(tokenized_corpus)

  return tokenized_corpus, data, dictionary

def getBM25(tokenized_corpus, k1=11., b=1., epsilon=0.25):
  """Build a BM25Okapi index over the tokenized corpus.

  Args:
    tokenized_corpus: List of token lists, one per document.
    k1: Forwarded to ``BM25Okapi`` as ``k1``.
    b: Forwarded to ``BM25Okapi`` as ``b``.
    epsilon: Forwarded to ``BM25Okapi`` as ``epsilon``.

  The defaults are the values this app was tuned with; they are exposed as
  keyword arguments so other settings can be tried without editing the body.

  Returns:
    The constructed ``BM25Okapi`` instance.
  """
  return BM25Okapi(tokenized_corpus, k1=k1, b=b, epsilon=epsilon)

def get_LSI_space(tokenized_corpus):
  """Fit a TF-IDF + LSI pipeline on the tokenized corpus.

  Args:
    tokenized_corpus: List of token lists, one per document.

  Returns:
    A tuple ``(lsi_corpus, bow_corpus, lsi_model)``: the corpus projected
    through the LSI model, the bag-of-words corpus, and the fitted model.
  """
  # Vocabulary mapping every distinct token to an integer id.
  vocabulary = corpora.Dictionary(tokenized_corpus)

  # Bag-of-words vector for each document, expressed with the vocabulary ids.
  bow_corpus = [vocabulary.doc2bow(tokens) for tokens in tokenized_corpus]

  # TF-IDF weighting fitted on, then applied to, the bag-of-words corpus.
  tfidf_model = models.TfidfModel(bow_corpus)
  weighted_corpus = tfidf_model[bow_corpus]

  # LSI model with 200 topics trained on the TF-IDF weighted corpus.
  lsi_model = models.LsiModel(weighted_corpus, id2word=vocabulary, num_topics=200)

  return lsi_model[weighted_corpus], bow_corpus, lsi_model

def get_LSI_result(query, corpus, dictionary, lsi_model):
  """Project a free-text query into the LSI space.

  Args:
    query: Raw query string.
    corpus: Bag-of-words corpus used to fit the TF-IDF weighting.
    dictionary: Token-to-id dictionary used to vectorize the query.
    lsi_model: Fitted LSI model.

  Returns:
    The query's representation produced by ``lsi_model``.
  """
  # Lower-case and tokenize, drop punctuation characters from every token and
  # keep only the tokens that are purely alphabetic afterwards.
  punctuation_remover = str.maketrans('', '', string.punctuation)
  alpha_words = []
  for token in word_tokenize(query.lower()):
    bare_token = token.translate(punctuation_remover)
    if bare_token.isalpha():
      alpha_words.append(bare_token)

  # The cleaned words are re-joined and tokenized a second time before lookup.
  cleaned_query = ' '.join(alpha_words)
  query_bow = dictionary.doc2bow(word_tokenize(cleaned_query.lower()))

  # NOTE(review): the TF-IDF model is refit on the whole corpus for every call.
  tfidf_model = models.TfidfModel(corpus)

  return lsi_model[tfidf_model[query_bow]]

def bm25_result(query, bm25, nct_ids):
  """Score every document against the query with BM25.

  Args:
    query: Raw query string; it is lower-cased and tokenized before scoring.
    bm25: Object exposing ``get_scores(tokens)``.
    nct_ids: Document IDs, paired positionally with the returned scores.

  Returns:
    Dict mapping each document ID to its BM25 score.
  """
  query_tokens = word_tokenize(query.lower())
  scores = bm25.get_scores(query_tokens)
  return dict(zip(nct_ids, scores))

def _scale_to_unit_range(values):
  """Min-max scale ``values`` to [-1, 1]; all-equal input maps to zeros."""
  if len(values) == 0:
    return []
  lowest, highest = min(values), max(values)
  span = highest - lowest
  if span == 0:
    # Every score is identical, so there is no ranking information:
    # use the neutral value instead of dividing by zero.
    return [0 for _ in values]
  return [-1 + 2 * (value - lowest) / span for value in values]


def normScores(bm25_scores_dict, lsi_scores, bm25_weight=0.55, lsi_weight=0.45):
  """Normalize BM25 and LSI scores to [-1, 1] and blend them per document.

  Args:
    bm25_scores_dict: Dict mapping document ID to BM25 score.
    lsi_scores: Sequence of LSI similarity scores; ``lsi_scores[i]`` must
      belong to the i-th key of ``bm25_scores_dict`` (both are produced in
      corpus order by the caller).
    bm25_weight: Weight of the normalized BM25 score.
    lsi_weight: Weight of the normalized LSI score.

  Returns:
    Dict mapping document ID to the weighted sum of its normalized scores,
    in the key order of ``bm25_scores_dict``. Returns an empty dict when
    there are no BM25 scores. A document without a matching LSI score
    contributes 0 for the LSI part.
  """
  if not bm25_scores_dict:
    return {}

  # Document IDs come from the function's own input rather than a module
  # global, so the function no longer depends on script-level state.
  doc_ids = list(bm25_scores_dict)

  normalized_bm25_scores = dict(
      zip(doc_ids, _scale_to_unit_range(list(bm25_scores_dict.values())))
  )
  # zip() stops at the shorter input, matching the positional pairing above.
  normalized_lsi_scores = dict(
      zip(doc_ids, _scale_to_unit_range(list(lsi_scores)))
  )

  # Iterate in input order (not over a set) so ties keep a deterministic order
  # when the caller sorts the result.
  return {
      doc_id: bm25_weight * normalized_bm25_scores[doc_id]
      + lsi_weight * normalized_lsi_scores.get(doc_id, 0)
      for doc_id in doc_ids
  }

@st.cache_resource
def _build_similarity_index(_lsi_corpus, num_features):
  """Build the LSI similarity index once and reuse it across script reruns.

  Streamlit re-executes this script on every widget interaction; without
  caching the index over the whole corpus would be rebuilt each time.
  The leading underscore on ``_lsi_corpus`` tells Streamlit not to hash it.
  """
  return similarities.MatrixSimilarity(_lsi_corpus, num_features=num_features)


# Show a temporary message while the (cached) index structures are prepared.
placeholder = st.empty()
placeholder.write('Building index...')
lsi_corpus, corpus, lsi_model, dictionary, data, bm25 = st.cache_data(createIndex)()
nct_ids = list(data.keys())
search_results = _build_similarity_index(lsi_corpus, lsi_model.num_topics)
placeholder.empty()

st.title('Latent Semantic Search Engine')
st.subheader('Group 11 Coursework Part 3')
st.write('Search engine based on latent semantic indexing method, based on the dataset of Clinical Trials Track, consisting of 79,628 documents')
st.markdown("---")

query = st.text_input('Enter your query:', '')

if st.button('Search'):
  # A query made only of whitespace has no tokens to search for.
  if query.strip():
    st.markdown("---")
    placeholder2 = st.empty()
    placeholder2.write('Looking for documents matching query...')

    # Score every document with both retrieval models.
    lsi_query = get_LSI_result(query, corpus, dictionary, lsi_model)
    bm25_scores_dict = bm25_result(query, bm25, nct_ids)
    lsi_scores = search_results[lsi_query]

    combined_scores_dict = normScores(bm25_scores_dict, lsi_scores)

    # Highest combined score first.
    ranked_docs_with_scores = sorted(combined_scores_dict.items(), key=lambda x: x[1], reverse=True)
    placeholder2.empty()
    st.write('Search results:')
    # Show the ten best-ranked documents with their text.
    for doc_id, score in ranked_docs_with_scores[:10]:
      st.write(f"Document ID: {doc_id}, Score: {round(score, 4)}")
      st.write(data[doc_id])

  else:
    st.write('Enter a valid query please')

Overwriting app.py


In [15]:
%%capture
# Stop any ngrok process left over from an earlier run of this cell.
!killall ngrok
# Start the Streamlit app in the background, discarding its output.
!streamlit run /content/app.py &>/dev/null&
# List the PIDs of running streamlit processes (the output is swallowed by %%capture).
!pgrep streamlit
# Open an ngrok tunnel to local port 8501.
# NOTE(review): presumably the port the Streamlit server listens on (its default) — confirm.
purl = ngrok.connect(8501)

In [16]:
from IPython.display import HTML

# Tell the user what to do, then render the tunnel's public URL as a clickable
# anchor that opens in a new tab.
print('Click on the public url below to proceed to the search engine GUI, on the warning page click Visit Site to continue')
public_url = purl.public_url
link = f'<a href="{public_url}" target="_blank">{public_url}</a>'
display(HTML(link))

