In [None]:
!pip install flask-ngrok



In [None]:
#install the Pyterrier framework
!pip install python-terrier
# install the nltk modules
!pip install nltk



In [None]:
from flask import Flask, request
from flask_ngrok import run_with_ngrok
import re
import pandas as pd
import nltk
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from nltk.stem import *
from nltk.stem.porter import *
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import os

In [None]:
import pyterrier as pt

if not pt.started():
  # In this lab, we need to specify that we start PyTerrier with PRF enabled
  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

In [None]:
# Function to ensure directory exists
def ensure_dir(file_path):
    directory = os.path.dirname(file_path)
    if not os.path.exists(directory):
        os.makedirs(directory)

# Function to convert JSONL to CSV
def jsonl_to_csv(jsonl_file_path, csv_file_path):
    ensure_dir(csv_file_path)
    df = pd.read_json(jsonl_file_path , lines=True)
    df.to_csv(csv_file_path, index=False)

# Function to convert TSV to CSV
def tsv_to_csv(tsv_file_path, csv_file_path):
    ensure_dir(csv_file_path)
    df = pd.read_csv(tsv_file_path, sep='\t')
    df.to_csv(csv_file_path, index=False)

# Paths for JSONL files
corpus_jsonl_path = '/content/corpus.jsonl'
queries_jsonl_path = '/content/queries.jsonl'

# Paths for TSV files
test_tsv_path = '/content/test.tsv'

# Output CSV paths
corpus_csv_path = '/mnt/data/corpus.csv'
queries_csv_path = '/mnt/data/queries.csv'
test_csv_path = '/mnt/data/test.csv'


# Convert files
jsonl_to_csv(corpus_jsonl_path, corpus_csv_path)
jsonl_to_csv(queries_jsonl_path, queries_csv_path)
tsv_to_csv(test_tsv_path, test_csv_path)

In [None]:
# Display dataframes
corpus_df = pd.read_csv(corpus_csv_path)
corpus_df = corpus_df.rename(columns={'_id': 'docno'})
corpus_df

Unnamed: 0,docno,title,text,metadata
0,ug7v899j,"Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia",OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoni...,"{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC35282/', 'pubmed_id': '11472636'}"
1,02tnwd4m,Nitric oxide: a pro-inflammatory mediator in lung disease?,Inflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO•) and increased indices of NO•...,"{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59543/', 'pubmed_id': '11667967'}"
2,ejv2xln0,Surfactant protein-D and pulmonary host defense,"Surfactant protein-D (SP-D) participates in the innate response to inhaled microorganisms and organic antigens, and contributes to immune and infl...","{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59549/', 'pubmed_id': '11667972'}"
3,2b73a28n,Role of endothelin-1 in lung disease,Endothelin-1 (ET-1) is a 21 amino acid peptide with diverse biological activity that has been implicated in numerous diseases. ET-1 is a potent mi...,"{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59574/', 'pubmed_id': '11686871'}"
4,9785vg6d,Gene expression in epithelial cells in response to pneumovirus infection,"Respiratory syncytial virus (RSV) and pneumonia virus of mice (PVM) are viruses of the family Paramyxoviridae, subfamily pneumovirus, which cause ...","{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59580/', 'pubmed_id': '11686888'}"
...,...,...,...,...
171327,7e8r61e7,Can Pediatric COVID-19 Testing Sensitivity Be Improved With Sequential Tests?,,"{'url': 'https://www.ncbi.nlm.nih.gov/pubmed/32427657/; https://doi.org/10.1213/ane.0000000000004982', 'pubmed_id': '32427657'}"
171328,6jittbis,Heterogeneity and plasticity of porcine alveolar macrophage and pulmonary interstitial macrophage isolated from healthy pigs in vitro,This study investigated the heterogeneity and plasticity of porcine alveolar macrophages (PAM) and pulmonary interstitial macrophages (IM) isolate...,"{'url': 'https://doi.org/10.1242/bio.046342; https://www.ncbi.nlm.nih.gov/pubmed/31615770/', 'pubmed_id': '31615770'}"
171329,hi8k8wvb,SARS E protein in phospholipid bilayers: an anomalous X-ray reflectivity study,Abstract We report on an anomalous X-ray reflectivity study to locate a labelled residue of a membrane protein with respect to the lipid bilayer. ...,{'url': 'https://www.ncbi.nlm.nih.gov/pubmed/32288217/; https://doi.org/10.1016/j.physb.2004.11.015; https://www.sciencedirect.com/science/article...
171330,ma3ndg41,Italian Society of Interventional Cardiology (GISE) position paper for Cath lab‐specific preparedness recommendations for healthcare providers in ...,COVID‐19 pandemic raised the issue to guarantee the proper level of care to patients with acute cardiovascular diseases and concomitant suspected ...,"{'url': 'https://www.ncbi.nlm.nih.gov/pubmed/32223063/; https://doi.org/10.1002/ccd.28888', 'pubmed_id': '32223063'}"


In [None]:
queries_df = pd.read_csv(queries_csv_path)
queries_df = queries_df.rename(columns={'_id': 'qid'})
queries_df

Unnamed: 0,qid,text,metadata
0,1,what is the origin of COVID-19,"{'query': 'coronavirus origin', 'narrative': ""seeking range of information about the SARS-CoV-2 virus's origin, including its evolution, animal so..."
1,2,how does the coronavirus respond to changes in the weather,"{'query': 'coronavirus response to weather changes', 'narrative': 'seeking range of information about the SARS-CoV-2 virus viability in different ..."
2,3,will SARS-CoV2 infected people develop immunity? Is cross protection possible?,"{'query': 'coronavirus immunity', 'narrative': 'seeking studies of immunity developed due to infection with SARS-CoV2 or cross protection gained d..."
3,4,what causes death from Covid-19?,"{'query': 'how do people die from the coronavirus', 'narrative': 'Studies looking at mechanisms of death from Covid-19.'}"
4,5,what drugs have been active against SARS-CoV or SARS-CoV-2 in animal studies?,"{'query': 'animal models of COVID-19', 'narrative': 'Papers that describe the results of testing drugs that bind to spike proteins of the virus o..."
5,6,what types of rapid testing for Covid-19 have been developed?,"{'query': 'coronavirus test rapid testing', 'narrative': 'Looking for studies identifying ways to diagnose Covid-19 more rapidly.'}"
6,7,are there serological tests that detect antibodies to coronavirus?,"{'query': 'serological tests for coronavirus', 'narrative': 'Looking for assays that measure immune response to COVID-19 that will help determine ..."
7,8,how has lack of testing availability led to underreporting of true incidence of Covid-19?,"{'query': 'coronavirus under reporting', 'narrative': 'Looking for studies answering questions of impact of lack of complete testing for Covid-19 ..."
8,9,how has COVID-19 affected Canada,"{'query': 'coronavirus in Canada', 'narrative': 'seeking data related to infections (confirm, suspected, and projected) and health outcomes (sympt..."
9,10,has social distancing had an impact on slowing the spread of COVID-19?,"{'query': 'coronavirus social distancing impact', 'narrative': ""seeking specific information on studies that have measured COVID-19's transmission..."


In [None]:
test_df = pd.read_csv(test_csv_path)
test_df

Unnamed: 0,query-id,corpus-id,score
0,1,005b2j4b,2
1,1,00fmeepz,1
2,1,g7dhmyyo,2
3,1,0194oljo,1
4,1,021q9884,1
...,...,...,...
66331,50,zvop8bxh,2
66332,50,zwf26o63,1
66333,50,zwsvlnwe,0
66334,50,zxr01yln,1


In [None]:
corpus_df = corpus_df.head(2000)
corpus_df

Unnamed: 0,docno,title,text,metadata
0,ug7v899j,"Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia",OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoni...,"{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC35282/', 'pubmed_id': '11472636'}"
1,02tnwd4m,Nitric oxide: a pro-inflammatory mediator in lung disease?,Inflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO•) and increased indices of NO•...,"{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59543/', 'pubmed_id': '11667967'}"
2,ejv2xln0,Surfactant protein-D and pulmonary host defense,"Surfactant protein-D (SP-D) participates in the innate response to inhaled microorganisms and organic antigens, and contributes to immune and infl...","{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59549/', 'pubmed_id': '11667972'}"
3,2b73a28n,Role of endothelin-1 in lung disease,Endothelin-1 (ET-1) is a 21 amino acid peptide with diverse biological activity that has been implicated in numerous diseases. ET-1 is a potent mi...,"{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59574/', 'pubmed_id': '11686871'}"
4,9785vg6d,Gene expression in epithelial cells in response to pneumovirus infection,"Respiratory syncytial virus (RSV) and pneumonia virus of mice (PVM) are viruses of the family Paramyxoviridae, subfamily pneumovirus, which cause ...","{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59580/', 'pubmed_id': '11686888'}"
...,...,...,...,...
1995,m31mb2kr,Antibody Derived Peptides for Detection of Ebola Virus Glycoprotein,BACKGROUND: Current Ebola virus (EBOV) detection methods are costly and impractical for epidemic scenarios. Different immune-based assays have bee...,"{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4619498/', 'pubmed_id': '26489048'}"
1996,ov1qssnu,Formalin Inactivation of Japanese Encephalitis Virus Vaccine Alters the Antigenicity and Immunogenicity of a Neutralization Epitope in Envelope Pr...,"Formalin-inactivated Japanese encephalitis virus (JEV) vaccines are widely available, but the effects of formalin inactivation on the antigenic st...","{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4619746/', 'pubmed_id': '26495991'}"
1997,tu7eufzd,Prognosis of nonspecific interstitial pneumonia correlates with perivascular CD4+ T lymphocyte infiltration of the lung,"BACKGROUND: Nonspecific interstitial pneumonia (NSIP) is characterized by interstitial infiltration of T lymphocytes, and subpopulations of these ...","{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4619990/', 'pubmed_id': '26496721'}"
1998,i24mfi7d,"Trypsin- and low pH-mediated fusogenicity of avian metapneumovirus fusion proteins is determined by residues at positions 100, 101 and 294",Avian metapneumovirus (aMPV) and human metapneumovirus (hMPV) are members of the genus Metapneumovirus in the subfamily Pneumovirinae. Metapneumov...,"{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4620442/', 'pubmed_id': '26498473'}"


Preprocessing:

In [None]:
from nltk.stem import *
from nltk.stem.porter import *
# Initialize Porter stemmer
stemmer = PorterStemmer()
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# 1) Clean

def clean(text):
   text = re.sub(r"http\S+", " ", str(text)) # remove urls
   text = re.sub(r"RT ", " ", str(text)) # remove rt
   text = re.sub(r"@[\w]*", " ", str(text)) # remove handles
   text = re.sub(r"[\.\,\#_\|\:\?\?\/\=]", " ", str(text)) # remove special characters
   text = re.sub(r'\t', ' ', str(text)) # remove tabs
   text = re.sub(r'\n', ' ', str(text)) # remove line jump
   text = re.sub(r"\s+", " ", str(text)) # remove extra white space
   text = str(text).strip()
   return str(text)

corpus_df["processed_text"] = corpus_df["text"].apply(clean)
corpus_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corpus_df["processed_text"] = corpus_df["text"].apply(clean)


Unnamed: 0,docno,title,text,metadata,processed_text
0,ug7v899j,"Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia",OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoni...,"{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC35282/', 'pubmed_id': '11472636'}",OBJECTIVE This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumonia...
1,02tnwd4m,Nitric oxide: a pro-inflammatory mediator in lung disease?,Inflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO•) and increased indices of NO•...,"{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59543/', 'pubmed_id': '11667967'}",Inflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO•) and increased indices of NO•...
2,ejv2xln0,Surfactant protein-D and pulmonary host defense,"Surfactant protein-D (SP-D) participates in the innate response to inhaled microorganisms and organic antigens, and contributes to immune and infl...","{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59549/', 'pubmed_id': '11667972'}",Surfactant protein-D (SP-D) participates in the innate response to inhaled microorganisms and organic antigens and contributes to immune and infla...
3,2b73a28n,Role of endothelin-1 in lung disease,Endothelin-1 (ET-1) is a 21 amino acid peptide with diverse biological activity that has been implicated in numerous diseases. ET-1 is a potent mi...,"{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59574/', 'pubmed_id': '11686871'}",Endothelin-1 (ET-1) is a 21 amino acid peptide with diverse biological activity that has been implicated in numerous diseases ET-1 is a potent mit...
4,9785vg6d,Gene expression in epithelial cells in response to pneumovirus infection,"Respiratory syncytial virus (RSV) and pneumonia virus of mice (PVM) are viruses of the family Paramyxoviridae, subfamily pneumovirus, which cause ...","{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59580/', 'pubmed_id': '11686888'}",Respiratory syncytial virus (RSV) and pneumonia virus of mice (PVM) are viruses of the family Paramyxoviridae subfamily pneumovirus which cause cl...
...,...,...,...,...,...
1995,m31mb2kr,Antibody Derived Peptides for Detection of Ebola Virus Glycoprotein,BACKGROUND: Current Ebola virus (EBOV) detection methods are costly and impractical for epidemic scenarios. Different immune-based assays have bee...,"{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4619498/', 'pubmed_id': '26489048'}",BACKGROUND Current Ebola virus (EBOV) detection methods are costly and impractical for epidemic scenarios Different immune-based assays have been ...
1996,ov1qssnu,Formalin Inactivation of Japanese Encephalitis Virus Vaccine Alters the Antigenicity and Immunogenicity of a Neutralization Epitope in Envelope Pr...,"Formalin-inactivated Japanese encephalitis virus (JEV) vaccines are widely available, but the effects of formalin inactivation on the antigenic st...","{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4619746/', 'pubmed_id': '26495991'}",Formalin-inactivated Japanese encephalitis virus (JEV) vaccines are widely available but the effects of formalin inactivation on the antigenic str...
1997,tu7eufzd,Prognosis of nonspecific interstitial pneumonia correlates with perivascular CD4+ T lymphocyte infiltration of the lung,"BACKGROUND: Nonspecific interstitial pneumonia (NSIP) is characterized by interstitial infiltration of T lymphocytes, and subpopulations of these ...","{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4619990/', 'pubmed_id': '26496721'}",BACKGROUND Nonspecific interstitial pneumonia (NSIP) is characterized by interstitial infiltration of T lymphocytes and subpopulations of these ce...
1998,i24mfi7d,"Trypsin- and low pH-mediated fusogenicity of avian metapneumovirus fusion proteins is determined by residues at positions 100, 101 and 294",Avian metapneumovirus (aMPV) and human metapneumovirus (hMPV) are members of the genus Metapneumovirus in the subfamily Pneumovirinae. Metapneumov...,"{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4620442/', 'pubmed_id': '26498473'}",Avian metapneumovirus (aMPV) and human metapneumovirus (hMPV) are members of the genus Metapneumovirus in the subfamily Pneumovirinae Metapneumovi...


In [None]:
# 2) Remove stop words

def remove_stop(text) :

  tokens = word_tokenize(str(text))
  sentence = []

  for i in tokens:
    if i not in stop_words:
      sentence.append(i)

  return' '.join(sentence)


corpus_df["processed_text"] = corpus_df["processed_text"].apply(remove_stop)
corpus_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corpus_df["processed_text"] = corpus_df["processed_text"].apply(remove_stop)


Unnamed: 0,docno,title,text,metadata,processed_text
0,ug7v899j,"Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia",OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoni...,"{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC35282/', 'pubmed_id': '11472636'}",OBJECTIVE This retrospective chart review describes epidemiology clinical features 40 patients culture-proven Mycoplasma pneumoniae infections Kin...
1,02tnwd4m,Nitric oxide: a pro-inflammatory mediator in lung disease?,Inflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO•) and increased indices of NO•...,"{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59543/', 'pubmed_id': '11667967'}",Inflammatory diseases respiratory tract commonly associated elevated production nitric oxide ( NO• ) increased indices NO• -dependent oxidative st...
2,ejv2xln0,Surfactant protein-D and pulmonary host defense,"Surfactant protein-D (SP-D) participates in the innate response to inhaled microorganisms and organic antigens, and contributes to immune and infl...","{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59549/', 'pubmed_id': '11667972'}",Surfactant protein-D ( SP-D ) participates innate response inhaled microorganisms organic antigens contributes immune inflammatory regulation with...
3,2b73a28n,Role of endothelin-1 in lung disease,Endothelin-1 (ET-1) is a 21 amino acid peptide with diverse biological activity that has been implicated in numerous diseases. ET-1 is a potent mi...,"{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59574/', 'pubmed_id': '11686871'}",Endothelin-1 ( ET-1 ) 21 amino acid peptide diverse biological activity implicated numerous diseases ET-1 potent mitogen regulator smooth muscle t...
4,9785vg6d,Gene expression in epithelial cells in response to pneumovirus infection,"Respiratory syncytial virus (RSV) and pneumonia virus of mice (PVM) are viruses of the family Paramyxoviridae, subfamily pneumovirus, which cause ...","{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59580/', 'pubmed_id': '11686888'}",Respiratory syncytial virus ( RSV ) pneumonia virus mice ( PVM ) viruses family Paramyxoviridae subfamily pneumovirus cause clinically important r...
...,...,...,...,...,...
1995,m31mb2kr,Antibody Derived Peptides for Detection of Ebola Virus Glycoprotein,BACKGROUND: Current Ebola virus (EBOV) detection methods are costly and impractical for epidemic scenarios. Different immune-based assays have bee...,"{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4619498/', 'pubmed_id': '26489048'}",BACKGROUND Current Ebola virus ( EBOV ) detection methods costly impractical epidemic scenarios Different immune-based assays reported detection q...
1996,ov1qssnu,Formalin Inactivation of Japanese Encephalitis Virus Vaccine Alters the Antigenicity and Immunogenicity of a Neutralization Epitope in Envelope Pr...,"Formalin-inactivated Japanese encephalitis virus (JEV) vaccines are widely available, but the effects of formalin inactivation on the antigenic st...","{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4619746/', 'pubmed_id': '26495991'}",Formalin-inactivated Japanese encephalitis virus ( JEV ) vaccines widely available effects formalin inactivation antigenic structure JEV profile a...
1997,tu7eufzd,Prognosis of nonspecific interstitial pneumonia correlates with perivascular CD4+ T lymphocyte infiltration of the lung,"BACKGROUND: Nonspecific interstitial pneumonia (NSIP) is characterized by interstitial infiltration of T lymphocytes, and subpopulations of these ...","{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4619990/', 'pubmed_id': '26496721'}",BACKGROUND Nonspecific interstitial pneumonia ( NSIP ) characterized interstitial infiltration T lymphocytes subpopulations cells may associated p...
1998,i24mfi7d,"Trypsin- and low pH-mediated fusogenicity of avian metapneumovirus fusion proteins is determined by residues at positions 100, 101 and 294",Avian metapneumovirus (aMPV) and human metapneumovirus (hMPV) are members of the genus Metapneumovirus in the subfamily Pneumovirinae. Metapneumov...,"{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4620442/', 'pubmed_id': '26498473'}",Avian metapneumovirus ( aMPV ) human metapneumovirus ( hMPV ) members genus Metapneumovirus subfamily Pneumovirinae Metapneumovirus fusion ( F ) p...


In [None]:
# 3) Steeming

def steeming(text) :

  tokens = word_tokenize(text)
  steemed_text = []
  j = 0

  for i in tokens:
    steemed_text.append(stemmer.stem(i))
    #print(f"tokens : {steemed_text[j]}")
    j = j+1


  return ' '.join(steemed_text)

corpus_df["processed_text"] = corpus_df["processed_text"].apply(steeming)
corpus_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corpus_df["processed_text"] = corpus_df["processed_text"].apply(steeming)


Unnamed: 0,docno,title,text,metadata,processed_text
0,ug7v899j,"Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia",OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoni...,"{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC35282/', 'pubmed_id': '11472636'}",object thi retrospect chart review describ epidemiolog clinic featur 40 patient culture-proven mycoplasma pneumonia infect king abdulaziz univers ...
1,02tnwd4m,Nitric oxide: a pro-inflammatory mediator in lung disease?,Inflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO•) and increased indices of NO•...,"{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59543/', 'pubmed_id': '11667967'}",inflammatori diseas respiratori tract commonli associ elev product nitric oxid ( no• ) increas indic no• -depend oxid stress although no• known an...
2,ejv2xln0,Surfactant protein-D and pulmonary host defense,"Surfactant protein-D (SP-D) participates in the innate response to inhaled microorganisms and organic antigens, and contributes to immune and infl...","{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59549/', 'pubmed_id': '11667972'}",surfact protein-d ( sp-d ) particip innat respons inhal microorgan organ antigen contribut immun inflammatori regul within lung sp-d synthes secre...
3,2b73a28n,Role of endothelin-1 in lung disease,Endothelin-1 (ET-1) is a 21 amino acid peptide with diverse biological activity that has been implicated in numerous diseases. ET-1 is a potent mi...,"{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59574/', 'pubmed_id': '11686871'}",endothelin-1 ( et-1 ) 21 amino acid peptid divers biolog activ implic numer diseas et-1 potent mitogen regul smooth muscl tone inflammatori mediat...
4,9785vg6d,Gene expression in epithelial cells in response to pneumovirus infection,"Respiratory syncytial virus (RSV) and pneumonia virus of mice (PVM) are viruses of the family Paramyxoviridae, subfamily pneumovirus, which cause ...","{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59580/', 'pubmed_id': '11686888'}",respiratori syncyti viru ( rsv ) pneumonia viru mice ( pvm ) virus famili paramyxovirida subfamili pneumoviru caus clinic import respiratori infec...
...,...,...,...,...,...
1995,m31mb2kr,Antibody Derived Peptides for Detection of Ebola Virus Glycoprotein,BACKGROUND: Current Ebola virus (EBOV) detection methods are costly and impractical for epidemic scenarios. Different immune-based assays have bee...,"{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4619498/', 'pubmed_id': '26489048'}",background current ebola viru ( ebov ) detect method costli impract epidem scenario differ immune-bas assay report detect quantif ebola viru ( ebo...
1996,ov1qssnu,Formalin Inactivation of Japanese Encephalitis Virus Vaccine Alters the Antigenicity and Immunogenicity of a Neutralization Epitope in Envelope Pr...,"Formalin-inactivated Japanese encephalitis virus (JEV) vaccines are widely available, but the effects of formalin inactivation on the antigenic st...","{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4619746/', 'pubmed_id': '26495991'}",formalin-inactiv japanes enceph viru ( jev ) vaccin wide avail effect formalin inactiv antigen structur jev profil antibodi elicit vaccin well und...
1997,tu7eufzd,Prognosis of nonspecific interstitial pneumonia correlates with perivascular CD4+ T lymphocyte infiltration of the lung,"BACKGROUND: Nonspecific interstitial pneumonia (NSIP) is characterized by interstitial infiltration of T lymphocytes, and subpopulations of these ...","{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4619990/', 'pubmed_id': '26496721'}",background nonspecif interstiti pneumonia ( nsip ) character interstiti infiltr t lymphocyt subpopul cell may associ progress fibrosi howev studi ...
1998,i24mfi7d,"Trypsin- and low pH-mediated fusogenicity of avian metapneumovirus fusion proteins is determined by residues at positions 100, 101 and 294",Avian metapneumovirus (aMPV) and human metapneumovirus (hMPV) are members of the genus Metapneumovirus in the subfamily Pneumovirinae. Metapneumov...,"{'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4620442/', 'pubmed_id': '26498473'}",avian metapneumoviru ( ampv ) human metapneumoviru ( hmpv ) member genu metapneumoviru subfamili pneumovirina metapneumoviru fusion ( f ) protein ...


In [None]:
def preprocess(sentence):
  sentence = remove_stop(sentence)
  sentence = clean(sentence)
  sentence = steeming(sentence)

  return sentence

In [None]:
queries_df['processed_text'] = queries_df['text'].apply(lambda x: ' '.join(preprocess(str(x))))
queries_df

Unnamed: 0,qid,text,metadata,processed_text
0,1,what is the origin of COVID-19,"{'query': 'coronavirus origin', 'narrative': ""seeking range of information about the SARS-CoV-2 virus's origin, including its evolution, animal so...",o r i g i n c o v i d - 1 9
1,2,how does the coronavirus respond to changes in the weather,"{'query': 'coronavirus response to weather changes', 'narrative': 'seeking range of information about the SARS-CoV-2 virus viability in different ...",c o r o n a v i r u r e s p o n d c h a n g w e a t h e r
2,3,will SARS-CoV2 infected people develop immunity? Is cross protection possible?,"{'query': 'coronavirus immunity', 'narrative': 'seeking studies of immunity developed due to infection with SARS-CoV2 or cross protection gained d...",s a r s - c o v 2 i n f e c t p e o p l d e v e l o p i m m u n i s c r o s s p r o t e c t p o s s i b l
3,4,what causes death from Covid-19?,"{'query': 'how do people die from the coronavirus', 'narrative': 'Studies looking at mechanisms of death from Covid-19.'}",c a u s d e a t h c o v i d - 1 9
4,5,what drugs have been active against SARS-CoV or SARS-CoV-2 in animal studies?,"{'query': 'animal models of COVID-19', 'narrative': 'Papers that describe the results of testing drugs that bind to spike proteins of the virus o...",d r u g a c t i v s a r s - c o v s a r s - c o v - 2 a n i m s t u d i
5,6,what types of rapid testing for Covid-19 have been developed?,"{'query': 'coronavirus test rapid testing', 'narrative': 'Looking for studies identifying ways to diagnose Covid-19 more rapidly.'}",t y p e r a p i d t e s t c o v i d - 1 9 d e v e l o p
6,7,are there serological tests that detect antibodies to coronavirus?,"{'query': 'serological tests for coronavirus', 'narrative': 'Looking for assays that measure immune response to COVID-19 that will help determine ...",s e r o l o g t e s t d e t e c t a n t i b o d i c o r o n a v i r u
7,8,how has lack of testing availability led to underreporting of true incidence of Covid-19?,"{'query': 'coronavirus under reporting', 'narrative': 'Looking for studies answering questions of impact of lack of complete testing for Covid-19 ...",l a c k t e s t a v a i l l e d u n d e r r e p o r t t r u e i n c i d c o v i d - 1 9
8,9,how has COVID-19 affected Canada,"{'query': 'coronavirus in Canada', 'narrative': 'seeking data related to infections (confirm, suspected, and projected) and health outcomes (sympt...",c o v i d - 1 9 a f f e c t c a n a d a
9,10,has social distancing had an impact on slowing the spread of COVID-19?,"{'query': 'coronavirus social distancing impact', 'narrative': ""seeking specific information on studies that have measured COVID-19's transmission...",s o c i a l d i s t a n c i m p a c t s l o w s p r e a d c o v i d - 1 9


In [None]:
corpusDf = corpus_df[['docno', 'text', 'processed_text']]
corpusDf

Unnamed: 0,docno,text,processed_text
0,ug7v899j,OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoni...,object thi retrospect chart review describ epidemiolog clinic featur 40 patient culture-proven mycoplasma pneumonia infect king abdulaziz univers ...
1,02tnwd4m,Inflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO•) and increased indices of NO•...,inflammatori diseas respiratori tract commonli associ elev product nitric oxid ( no• ) increas indic no• -depend oxid stress although no• known an...
2,ejv2xln0,"Surfactant protein-D (SP-D) participates in the innate response to inhaled microorganisms and organic antigens, and contributes to immune and infl...",surfact protein-d ( sp-d ) particip innat respons inhal microorgan organ antigen contribut immun inflammatori regul within lung sp-d synthes secre...
3,2b73a28n,Endothelin-1 (ET-1) is a 21 amino acid peptide with diverse biological activity that has been implicated in numerous diseases. ET-1 is a potent mi...,endothelin-1 ( et-1 ) 21 amino acid peptid divers biolog activ implic numer diseas et-1 potent mitogen regul smooth muscl tone inflammatori mediat...
4,9785vg6d,"Respiratory syncytial virus (RSV) and pneumonia virus of mice (PVM) are viruses of the family Paramyxoviridae, subfamily pneumovirus, which cause ...",respiratori syncyti viru ( rsv ) pneumonia viru mice ( pvm ) virus famili paramyxovirida subfamili pneumoviru caus clinic import respiratori infec...
...,...,...,...
1995,m31mb2kr,BACKGROUND: Current Ebola virus (EBOV) detection methods are costly and impractical for epidemic scenarios. Different immune-based assays have bee...,background current ebola viru ( ebov ) detect method costli impract epidem scenario differ immune-bas assay report detect quantif ebola viru ( ebo...
1996,ov1qssnu,"Formalin-inactivated Japanese encephalitis virus (JEV) vaccines are widely available, but the effects of formalin inactivation on the antigenic st...",formalin-inactiv japanes enceph viru ( jev ) vaccin wide avail effect formalin inactiv antigen structur jev profil antibodi elicit vaccin well und...
1997,tu7eufzd,"BACKGROUND: Nonspecific interstitial pneumonia (NSIP) is characterized by interstitial infiltration of T lymphocytes, and subpopulations of these ...",background nonspecif interstiti pneumonia ( nsip ) character interstiti infiltr t lymphocyt subpopul cell may associ progress fibrosi howev studi ...
1998,i24mfi7d,Avian metapneumovirus (aMPV) and human metapneumovirus (hMPV) are members of the genus Metapneumovirus in the subfamily Pneumovirinae. Metapneumov...,avian metapneumoviru ( ampv ) human metapneumoviru ( hmpv ) member genu metapneumoviru subfamili pneumovirina metapneumoviru fusion ( f ) protein ...


Indexing:

In [None]:
# Create and index documents
corpus_df['docno'] = corpus_df['docno'].astype(str)

indexer = pt.DFIndexer("./index", overwrite=True)
index_ref = indexer.index(corpus_df['processed_text'], corpus_df['docno'])
index = pt.IndexFactory.of(index_ref)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corpus_df['docno'] = corpus_df['docno'].astype(str)


In [None]:
print(index_ref.toString())
#we will first load the index
index = pt.IndexFactory.of(index_ref)
#we will call getCollectionStatistics() to check the stats
print(index.getCollectionStatistics().toString())

./index/data.properties
Number of documents: 2000
Number of terms: 14735
Number of postings: 172042
Number of fields: 0
Number of tokens: 262343
Field names: []
Positions:   false



In [None]:
for kv in index.getLexicon():
  print("%s -> %s " % (kv.getKey(), kv.getValue().toString()))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
onchocerciasi -> term9149 Nt=1 TF=1 maxTF=1 @{0 141032 1} 
oncogen -> term7399 Nt=6 TF=6 maxTF=1 @{0 141034 5} 
oncogenesi -> term10497 Nt=1 TF=1 maxTF=1 @{0 141047 5} 
oncoinflamm -> term10498 Nt=1 TF=1 maxTF=1 @{0 141050 3} 
oncolog -> term9929 Nt=4 TF=4 maxTF=1 @{0 141053 1} 
oncologist -> term12490 Nt=2 TF=2 maxTF=1 @{0 141061 7} 
oncolyt -> term9717 Nt=2 TF=3 maxTF=2 @{0 141066 7} 
oncoretrovir -> term8747 Nt=1 TF=1 maxTF=1 @{0 141072 0} 
oncorhynchu -> term14303 Nt=1 TF=1 maxTF=1 @{0 141074 4} 
oncornaviru -> term5234 Nt=1 TF=1 maxTF=1 @{0 141077 2} 
oncostatin -> term12012 Nt=1 TF=2 maxTF=2 @{0 141079 4} 
oncoviru -> term13248 Nt=1 TF=1 maxTF=1 @{0 141082 3} 
onfh -> term11644 Nt=1 TF=3 maxTF=3 @{0 141085 1} 
ongo -> term1934 Nt=31 TF=35 maxTF=2 @{0 141088 1} 
onli -> term3514 Nt=20 TF=22 maxTF=2 @{0 141132 5} 
onlin -> term1753 Nt=67 TF=68 maxTF=2 @{0 141163 1} 
onoo -> term11813 Nt=1 TF=1 maxTF=1 @{0 141238 0} 
o

In [None]:
# Define the tf function
def tf(processed_docs):
    # Initialize a dictionary to store term frequency for each term in the collection
    tf_dict = {}

    # Check if the processed_docs list is empty
    if not processed_docs:
        print("Error: No documents to compare.")
        return

    # Iterate over each document in the collection
    for doc_id, doc_text in enumerate(processed_docs):
        # Tokenize and stem the current document
        doc_tokens = doc_text.split()

        # Calculate term frequency for each term in the document
        for term in doc_tokens:
            if term not in tf_dict:
                tf_dict[term] = {}  # Initialize a dictionary for the term if not present
            if doc_id not in tf_dict[term]:
                tf_dict[term][doc_id] = 0  # Initialize term frequency for the document if not present
            tf_dict[term][doc_id] += 1  # Increment term frequency for the document

    return tf_dict

# Example usage:
processed_docs = corpusDf['processed_text'].tolist()
tf_dict = tf(processed_docs)
print(tf_dict)

{'object': {0: 1, 33: 1, 117: 1, 126: 1, 137: 1, 187: 1, 189: 1, 263: 1, 277: 1, 284: 1, 322: 1, 408: 1, 426: 1, 451: 1, 457: 1, 465: 3, 471: 1, 487: 1, 572: 1, 587: 1, 606: 1, 607: 2, 618: 1, 632: 1, 673: 1, 683: 1, 694: 1, 696: 1, 702: 2, 748: 1, 777: 1, 804: 1, 838: 1, 839: 1, 935: 1, 957: 1, 963: 1, 983: 1, 993: 1, 1014: 1, 1015: 1, 1021: 1, 1029: 1, 1031: 1, 1041: 2, 1051: 1, 1054: 1, 1071: 1, 1072: 1, 1085: 1, 1088: 1, 1107: 1, 1109: 1, 1120: 1, 1138: 1, 1148: 1, 1149: 1, 1164: 1, 1175: 1, 1214: 1, 1234: 1, 1249: 2, 1263: 1, 1317: 1, 1359: 1, 1367: 1, 1398: 1, 1405: 1, 1412: 1, 1428: 1, 1435: 1, 1445: 1, 1482: 1, 1485: 2, 1489: 1, 1493: 1, 1495: 1, 1499: 1, 1501: 1, 1526: 1, 1552: 1, 1560: 1, 1565: 1, 1570: 1, 1571: 1, 1583: 1, 1585: 1, 1592: 1, 1616: 1, 1617: 1, 1624: 1, 1634: 1, 1653: 1, 1655: 1, 1693: 1, 1694: 2, 1698: 1, 1732: 1, 1754: 1, 1756: 1, 1792: 1, 1805: 1, 1814: 2, 1889: 1, 1911: 1, 1913: 1, 1944: 1, 1961: 1, 1968: 1, 1981: 2, 1982: 1, 1983: 1}, 'thi': {0: 1, 3: 1, 9

Query Processing:

In [None]:
# Load the index
index = pt.IndexFactory.of(index_ref)

# Get the meta index, inverted index, and lexicon
meta = index.getMetaIndex()
inv = index.getInvertedIndex()
lex = index.getLexicon()

# Get the lexicon entry for the term "AI"
le = lex.getLexiconEntry("review")

# Initialize a variable to store the total document length
total_document_length = 0

# Check if the lexicon entry is not None
if le is not None:
    # Get the postings for the term "AI"
    postings = inv.getPostings(le)

    # Check if postings are not None
    if postings is not None:
        # Iterate through the postings of the term "AI"
        for posting in postings:
            # Get the document number for the posting
            docno = meta.getItem("docno", posting.getId())

            # Get the frequency of the term in the document
            frequency = posting.getFrequency()

            # Increment the total document length by the frequency of the term in the document
            total_document_length += frequency

        # Print the total document length
        print("Total document length for term 'review':", total_document_length)
    else:
        print("No postings found for term 'review'")
else:
    print("Term 'review' not found in the index")

Total document length for term 'review': 321


In [None]:
meta = index.getMetaIndex()
inv = index.getInvertedIndex()
count=0
lex = index.getLexicon()  # Define the 'lex' object
lit=[]
le = lex.getLexiconEntry("review")
for posting in inv.getPostings(le):
    docno = meta.getItem("docno", posting.getId())
    print("%s with frequency %d" % (docno, posting.getFrequency()))
    lit.append(docno)

ug7v899j with frequency 2
02tnwd4m with frequency 1
2b73a28n with frequency 1
9785vg6d with frequency 1
8qnrcgnk with frequency 1
wnnsmx60 with frequency 1
gdsfkw1b with frequency 1
4cvy9u28 with frequency 1
yz2wbpuu with frequency 1
s64v656n with frequency 1
utglk4af with frequency 1
d3cko4j2 with frequency 1
hwlvk68z with frequency 3
5eqdrd52 with frequency 1
vefs1h6o with frequency 1
5fl0rk90 with frequency 1
lntn11a9 with frequency 1
4y8ghcpq with frequency 1
1n69h3i3 with frequency 1
chz8luni with frequency 3
kfwbqp4p with frequency 1
fae3sczm with frequency 1
xa9a5p0q with frequency 1
ze511t38 with frequency 1
cc5thj1g with frequency 1
rnvh9ut8 with frequency 1
dg3pfydf with frequency 3
oz823tw4 with frequency 1
akvhi38e with frequency 1
05ppugs7 with frequency 1
2ygb80sc with frequency 1
zkudc8ww with frequency 1
t579ysgl with frequency 2
9ofqelrm with frequency 1
1qo1krxv with frequency 1
0jj9svwj with frequency 1
22veehj5 with frequency 1
f0vud3gu with frequency 1
kxtdv6q9 wit

In [None]:
#set up our retieval model by specifing TF_IDF as wmodel and limiting the number of retrieved results for each query top 10 documents
tfidf_retr = pt.BatchRetrieve(index, controls = {"wmodel": "TF_IDF"},num_results=10)

#we need to process the query also as we did for documents
query="diseases"
processed_query = preprocess(query)

processed_query = ' '.join(processed_query)

#we will call the search function using our retrieval model we set up above
results= tfidf_retr.search(processed_query)
results

Unnamed: 0,docid,docno,rank,score,qid,query


Query expansion:

In [None]:
import pandas as pd
import pyterrier as pt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
import os
pd.set_option('display.max_colwidth', 150)

In [None]:
# Need to install additional terrier package for PRF. It will take around 1 min
!git clone https://github.com/terrierteam/terrier-prf/
!apt-get install maven   #used for Java projects to manage project dependencies and build processes
%cd /content/terrier-prf/
!mvn install
!pwd
%cd ..

fatal: destination path 'terrier-prf' already exists and is not an empty directory.
^C
/content/terrier-prf
/content/terrier-prf
/content


In [None]:
query = "object"
query = preprocess(query)

# Define our retrieval model
bm25 = pt.BatchRetrieve(index, wmodel="BM25",num_results=10)

result = bm25.search(query)
result

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,1981,okhsvylc,0,6.643123,object
1,1,471,zm6a9brd,1,6.129129,object
2,1,465,7p3b6tyf,2,6.019516,object
3,1,1485,6r25jl11,3,5.632687,object
4,1,1041,2qds187e,4,5.489549,object
5,1,607,dy21jno8,5,5.224041,object
6,1,1445,8s7g2wvd,6,5.113531,object
7,1,702,3m0dyfsk,7,5.050998,object
8,1,572,ufjjengt,8,5.034089,object
9,1,1814,xteeybsj,9,5.021644,object


In [None]:
# "rewrite" function from PyTerrier will be used to expand queries specifying RM3 as the model
# fb_docs ==> no. expansion documents
# fb_terms ==> no. expansion terms
rm3_expander = pt.rewrite.RM3(index,fb_terms=10, fb_docs=100)

#output of the BM25 will be fed into the RM3 expander for query expansion.
rm3_qe = bm25 >> rm3_expander
expanded_query = rm3_qe.search(query).iloc[0]["query"]

expanded_query

'applypipeline:off earli^0.052676968 msc^0.032806151 research^0.026819620 organ^0.027690388 commun^0.030650998 50^0.026807882 pulmonari^0.037061237 receiv^0.033402622 object^0.702689826 prrsv^0.029394308'

In [None]:
# Just print the expanded query with term scores
for s in expanded_query.split()[1:]:
  print(s)
print("\n" + query)

earli^0.052676968
msc^0.032806151
research^0.026819620
organ^0.027690388
commun^0.030650998
50^0.026807882
pulmonari^0.037061237
receiv^0.033402622
object^0.702689826
prrsv^0.029394308

object


In [None]:
# After that you can search using the expanded query
expanded_query_formatted = ' '.join(expanded_query.split()[1:])

results_wqe = bm25.search(expanded_query_formatted)

print("   Before Expansion    After Expansion")
print(pd.concat([results[['docid','score']][0:5].add_suffix('_1'),
            results_wqe[['docid','score']][0:5].add_suffix('_2')], axis=1).fillna(''))

#Let's check the tweets text for the top 5 retrieved tweets
corpusDf[['text']][corpusDf['docno'].isin(results_wqe['docno'].loc[0:5].tolist())]

   Before Expansion    After Expansion
  docid_1 score_1  docid_2   score_2
0                      471  6.981974
1                     1981  6.953455
2                      465  6.384349
3                     1485  6.279047
4                     1041  6.242746


Unnamed: 0,text
465,Background and objective: The ‘attack rate’ of asthma following viral lower respiratory tract infections (LRTI) is about 3–4 fold higher than that...
471,OBJECTIVE: The World Health organization received reports of 478 laboratory-confirmed cases of influenza A (H5N1) from 15 countries between Novemb...
1041,The disproportionate effects of the 2009 H1N1 pandemic on many Canadian Aboriginal communities have drawn attention to the vulnerability of these ...
1445,BACKGROUND: Recent studies have demonstrated that mesenchymal stem cells (MSCs) modulate the immune response and reduce lung injury in animal mode...
1485,Pigs are often colonized by more than one bacterial and/or viral species during respiratory tract infections. This phenomenon is known as the porc...
1981,OBJECTIVES: The objective of this study was to evaluate the effectiveness of a nonsurgical treatment regimen in the long-term control of necrotic ...


In [None]:
# Function 3a4an ageeb kol el terms el fe el Document

def sk(text):

 docs_dictionary = []

 i = 0

 for docs in text:

   docs_dictionary.append(word_tokenize(text[i]))
   i = i +1

 return docs_dictionary


coll = sk(corpusDf["processed_text"])

from scipy import spatial
import gensim
from gensim.models import Word2Vec


# Train skip-gram model
model = Word2Vec(sentences = coll,
                 sg=1,
                 vector_size=100,
                 window=2,
                 min_count=1,
                 workers=4,
                 epochs=20)

# Get word embeddings
word_embeddings = model.wv

# Expand the query usnig (most_similar) function

qrr = "review"
qrr = preprocess(qrr)
mo = word_embeddings.most_similar(qrr)
m = mo[0][0]
print(f"Expanded Query is: {qrr} {m}")

Expanded Query is: ['review'] briefli


In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np


In [None]:
# Load the ELMo model
elmo = hub.load("https://tfhub.dev/google/elmo/3")

# Define the sentences from our corpus
queryyy1 = "Recent studies have suggested that statins, an established drug group, aid in the prevention of cardiovascular mortality."
queryyy2 = "Preclinical studies have shown that statins, particularly simvastatin, can prevent growth in breast cancer cell lines and animal models."
sentences = [queryyy1, queryyy2]

# Preprocess the sentences
def preprocess_sentence(sentence):
    # Add your preprocessing steps here, e.g., lowercasing, removing punctuation, etc.
    return sentence

queryyy1 = preprocess_sentence(queryyy1)
queryyy2 = preprocess_sentence(queryyy2)

# Generate ELMo embeddings for the sentences
embeddings = elmo.signatures["default"](tf.constant([queryyy1, queryyy2]))["elmo"]

# Get the ELMo embeddings for the word "studies" in both sentences
studies_queryyy1_embedding = embeddings.numpy()[0][1]
studies_queryyy2_embedding = embeddings.numpy()[1][1]

# Print the embedding vectors
print("Embedding vector for 'studies' (queryyy1):", studies_queryyy1_embedding)
print("Embedding vector for 'studies' (queryyy2):", studies_queryyy2_embedding)


Embedding vector for 'studies' (queryyy1): [-1.3654245  -0.44197986  0.56410885 ...  0.01154742  0.44611573
 -0.48691306]
Embedding vector for 'studies' (queryyy2): [-0.8880007   0.21953833  0.39791662 ... -0.15490828  0.605111
 -0.43554777]


User Interface

In [None]:
!pip install flask_ngrok



In [None]:
df2 = corpusDf.head(50)

df2 = df2.to_dict()

df2

{'docno': {0: 'ug7v899j',
  1: '02tnwd4m',
  2: 'ejv2xln0',
  3: '2b73a28n',
  4: '9785vg6d',
  5: 'zjufx4fo',
  6: '5yhe786e',
  7: '8zchiykl',
  8: '8qnrcgnk',
  9: 'jg13scgo',
  10: '5tkvsudh',
  11: '6lvn10f4',
  12: 'tvxpckxo',
  13: 'mcuixluu',
  14: '6iu1dtyl',
  15: 't35n7bk9',
  16: 'eiqypt0m',
  17: 'sgmk96vr',
  18: 'di0fcy0j',
  19: '4k8f7ou1',
  20: 'wnnsmx60',
  21: 'gdsfkw1b',
  22: 'yba7mdtb',
  23: 'bbvxu8op',
  24: 'e62cfqt7',
  25: '4cvy9u28',
  26: 'zowp10ts',
  27: '5dk231qs',
  28: 'snqdma0s',
  29: '1pq6dkl5',
  30: '754nln40',
  31: 'p34ezktf',
  32: 'l3z27806',
  33: 'yz2wbpuu',
  34: 'kvhoa2se',
  35: 'cgl34ykt',
  36: 'ajlctjeb',
  37: 'cl9gpt9w',
  38: 't40ybhgb',
  39: 'zwbc7nnn',
  40: '1r65yam5',
  41: 'oa4lzkru',
  42: 'qva0jt86',
  43: 'vw8xjo9t',
  44: 'bnnl700a',
  45: 'm71xkuo9',
  46: 's64v656n',
  47: 'oluq7v0h',
  48: 'tw6wusxe',
  49: '58czem0j'},
 'text': {0: 'OBJECTIVE: This retrospective chart review describes the epidemiology and clinical fea

In [None]:
def sui(df2 , que):
 i = 0

 quer = preprocess(que)

 docs_id = []

 for key, value in df2.items():
   if key == 'processed_text':
         val = value.values()
         for doc in val:
           terms = doc.split()
           for term in terms:
             if term == quer and i not in docs_id:
               docs_id.append(f'''Document number {i} -----> \n{corpusDf["text"][i]}''')
           i = i + 1
 return docs_id

In [None]:
query2 = "properties"

x = sui(df2 , query2)
x

['Document number 1 -----> \nInflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO•) and increased indices of NO• -dependent oxidative stress. Although NO• is known to have anti-microbial, anti-inflammatory and anti-oxidant properties, various lines of evidence support the contribution of NO• to lung injury in several disease models. On the basis of biochemical evidence, it is often presumed that such NO• -dependent oxidations are due to the formation of the oxidant peroxynitrite, although alternative mechanisms involving the phagocyte-derived heme proteins myeloperoxidase and eosinophil peroxidase might be operative during conditions of inflammation. Because of the overwhelming literature on NO• generation and activities in the respiratory tract, it would be beyond the scope of this commentary to review this area comprehensively. Instead, it focuses on recent evidence and concepts of the presumed contribution of NO• to infl

In [None]:
from google.colab.output import eval_js
print (eval_js("google.colab.kernel.proxyPort(5000)"))

https://nqn8q445e5l-496ff2e9c6d22116-5000-colab.googleusercontent.com/


In [None]:
from flask import Flask, request
from flask_ngrok import run_with_ngrok

# Assuming you've already defined the sui function and imported necessary modules

app = Flask(__name__)
run_with_ngrok(app)

@app.route("/")
def home():
    return """
    <style>
        body {
            background-color: white;
            font-family: Arial, sans-serif;
            margin: 0;
            padding: 0;
        }

        .header {
            background-color: black;
            color: white;
            padding: 20px 0;
        }

        .container {
            text-align: center;
        }

        h1 {
            text-align: center;
            margin: 0;
            padding: 10px 0;
        }

        #searchInput {
            padding: 10px;
            border: 1px solid #ccc;
            border-radius: 20px; /* Increased border-radius for a rounded appearance */
            margin-bottom: 10px;
            width: 300px; /* Adjust the width as needed */
            box-sizing: border-box; /* Include padding and border in the element's total width */
            transition: border-color 0.3s; /* Smooth transition for border color change */
        }

        #searchInput:focus {
            border-color: #4CAF50; /* Change border color on focus */
        }

        button {
            padding: 10px 20px;
            background-color: #FF69B4;
            color: white;
            border: none;
            border-radius: 20px; /* Increased border-radius for a rounded appearance */
            cursor: pointer;
            transition: background-color 0.3s; /* Smooth transition for background color change */
        }

        button:hover {
            background-color: #333; /* Change background color on hover */
        }
    </style>

    <div class="header">
        <h1>TREC-COVID's Search Engine</h1>
    </div>
    <div class="container">
        <input type="text" id="searchInput" placeholder="Enter your query...">
        <button onclick="search()">Search</button>
    </div>
    <div id="searchResult"></div>

    <script>
        function search() {
            var searchTerm = document.getElementById("searchInput").value;
            fetch('/search', {
                method: 'POST',
                body: JSON.stringify({ query: searchTerm }),
                headers:{
                    'Content-Type': 'application/json'
                }
            })
            .then(response => response.json())
            .then(data => {
                console.log("Received data:", data); // Debug: Check if data is received
                var resultDiv = document.getElementById("searchResult");
                resultDiv.innerHTML = "<h2>Relevant Documents IDs:</h2>";
                if (data.results.length === 0) {
                    resultDiv.innerHTML += "<p>No documents found</p>";
                } else {
                    data.results.forEach(doc => {
                        console.log("Displaying document:", doc); // Debug: Check if document is displayed
                        resultDiv.innerHTML += "<p>" + doc + "</p>";
                    });
                }
            })
            .catch(error => {
                console.error('Error occurred during fetch:', error); // Debug: Log fetch errors
            });
        }
    </script>
    """

@app.route("/search", methods=['POST'])
def search():
    query = request.json['query']
    print("Received query:", query)  # Debug: Check if Flask receives the query
    results = sui(df2, query)
    print("Search results:", results)  # Debug: Check if sui function returns results
    return {'results': results}

app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
Exception in thread Thread-25:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/urllib3/connection.py", line 203, in _new_conn
    sock = connection.create_connection(
  File "/usr/local/lib/python3.10/dist-packages/urllib3/util/connection.py", line 85, in create_connection
    raise err
  File "/usr/local/lib/python3.10/dist-packages/urllib3/util/connection.py", line 73, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/urllib3/connectionpool.py", line 791, in urlopen
    response = self._make_request(
  File "/usr/local/lib/python3.10/dist-packages/urllib3/connectionpool.py", line 497, in _make_request
    conn.request(
  File "/usr/local/lib/python3.10/dist-packages/urllib3

Received query: name
Search results: ['Document number 24 -----> \nUsing differential display RT-PCR, we identified a gene of 2750 bp from human adult testis, named H-Lse, which encoded a putative protein of 523 amino acids and molecular weight of 58 kd with structural characteristics similar to that of mouse lysosome sialic-acid-specific 9-O-acetylesterase. Northern blot analysis showed a widespread distribution of H-Lse in various human tissues with high expression in the testis, prostate, and colon. In situ hybridization results showed that while H-Lse was not detected in embryonic testis, positive signals were found in spermatocytes but not spermatogonia in adult testis of human. The subcellular localization of H-Lse was visualized by green fluorescent protein (GFP) fused to the amino terminus of H-Lse, showing compartmentalization of H-Lse in large dense-core vesicles, presumably lysosomes, in the cytoplasm. The developmentally regulated and spermatogenic stage-specific expression

Evaluation

In [None]:
x=pd.read_csv(queries_csv_path)

In [None]:
vv=pd.read_csv(test_csv_path)

In [None]:
import pandas as pd

# Load the data
vv=pd.read_csv(test_csv_path)

# Renaming the columns to match the other DataFrame
vv.columns = ['qid', 'docno', 'label']

# Convert 'qid' and 'docno' to integers by extracting numbers and converting
vv['qid'] = vv['qid']
vv['docno'] = vv['docno'].str.extract('(\d+)').astype(str)

# Explicitly convert 'label' to integer
vv['label'] = vv['label'].astype(int)
vv['qid'] = vv['label'].astype(str)
vv['docno'] = vv['docno'].astype(str)


# Check data types to ensure all are integers
print(vv.dtypes)

# Optionally, save the corrected DataFrame
vv.to_csv('/content/corrected_test.csv', index=False)  # Ensure the path is correct

# Display the first few rows to verify
print(vv.head())

qid      object
docno    object
label     int64
dtype: object
  qid docno  label
0   2   005      2
1   1    00      1
2   2     7      2
3   1  0194      1
4   1   021      1


In [None]:
# Initialize Porter Stemmer
stemmer = PorterStemmer()

stop_words = stopwords.words('english')

def preprocess(text):
    """Tokenizes, removes stopwords, and applies stemming to the text."""
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"@\w*", " ", text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip().lower()
    tokens = word_tokenize(text)
    return [stemmer.stem(token) for token in tokens if token not in stop_words]

In [None]:
# Load data
docs_df = pd.read_csv(queries_csv_path)
# Apply preprocessing to document content
docs_df['processed_content'] = docs_df['text'].apply(lambda x: ' '.join(preprocess(str(x))))
docs_df["docno"] = docs_df.index.astype(str)

# Build an inverted index

indexer = pt.DFIndexer("./index", overwrite=True)
index_ref = indexer.index(docs_df['processed_content'], docs_df['docno'])
index = pt.IndexFactory.of(index_ref)

In [None]:
def search(query, k=10):
    processed_query = preprocess(query)
    processed_query_str = " ".join(processed_query)
    tf = pt.BatchRetrieve(index, wmodel="TF_IDF")
    results = tf.transform(processed_query_str)
    # evaluation=pt.Evaluate(results,x)
    return results
ss=search("covid")

  results = tf.transform(processed_query_str)


In [None]:
def tf(processed_docs):
    tf_dict = {}
    for doc_id, doc_text in enumerate(processed_docs):
        doc_tokens = doc_text.split()
        for term in doc_tokens:
            if term not in tf_dict:
                tf_dict[term] = {}
            if doc_id not in tf_dict[term]:
                tf_dict[term][doc_id] = 0
            tf_dict[term][doc_id] += 1
    return tf_dict

processed_docs = docs_df['processed_content'].tolist()
tf_dict = tf(processed_docs)
print(tf_dict)

{'origin': {0: 1}, 'covid': {0: 1, 3: 1, 5: 1, 7: 1, 8: 1, 9: 1, 13: 1, 17: 1, 18: 1, 19: 1, 21: 1, 22: 1, 23: 1, 25: 1, 26: 1, 27: 1, 29: 1, 32: 1, 33: 1, 34: 1, 37: 1, 38: 1, 40: 1, 41: 1, 42: 1, 43: 1, 44: 1, 45: 1, 46: 1, 47: 1, 48: 1}, '19': {0: 1, 3: 1, 5: 1, 7: 1, 8: 1, 9: 1, 13: 1, 17: 1, 18: 1, 19: 1, 21: 1, 22: 1, 23: 1, 25: 1, 26: 1, 27: 1, 29: 1, 32: 1, 33: 1, 34: 1, 37: 1, 38: 1, 40: 1, 41: 1, 42: 1, 43: 1, 44: 1, 45: 1, 46: 1, 47: 1, 48: 1}, 'coronaviru': {1: 1, 6: 1, 10: 1, 12: 1, 14: 1, 15: 1, 16: 1, 30: 1}, 'respond': {1: 1}, 'chang': {1: 1}, 'weather': {1: 1}, 'sar': {2: 1, 4: 2, 28: 1, 31: 1, 35: 1, 36: 1, 39: 1, 49: 1}, 'cov2': {2: 1}, 'infect': {2: 1, 10: 1, 17: 1, 24: 1, 26: 1, 48: 1}, 'peopl': {2: 1}, 'develop': {2: 1, 5: 1}, 'immun': {2: 1, 48: 2}, 'cross': {2: 1}, 'protect': {2: 1}, 'possibl': {2: 1}, 'caus': {3: 1}, 'death': {3: 1}, 'drug': {4: 1, 28: 2}, 'activ': {4: 1}, 'cov': {4: 2, 28: 1, 31: 1, 35: 1, 36: 1, 39: 1, 49: 1}, '2': {4: 1, 28: 1, 31: 1, 35: 1,

In [None]:
pt.Evaluate(ss,vv)

{'map': 0.012419871794871794, 'ndcg': 0.02781819393175804}

In [None]:
eval = pt.Evaluate(ss,vv,metrics=["map","recall","P"], perquery=True)
eval

defaultdict(dict,
            {'1': {'map': 0.049679487179487176,
              'P@5': 1.0,
              'P@10': 1.0,
              'P@15': 1.0,
              'P@20': 1.0,
              'P@30': 1.0,
              'P@100': 0.31,
              'P@200': 0.155,
              'P@500': 0.062,
              'P@1000': 0.031,
              'R@5': 0.008012820512820512,
              'R@10': 0.016025641025641024,
              'R@15': 0.02403846153846154,
              'R@20': 0.03205128205128205,
              'R@30': 0.04807692307692308,
              'R@100': 0.049679487179487176,
              'R@200': 0.049679487179487176,
              'R@500': 0.049679487179487176,
              'R@1000': 0.049679487179487176},
             '-1': {'map': 0.0,
              'P@10': 0.0,
              'P@100': 0.0,
              'P@1000': 0.0,
              'P@15': 0.0,
              'P@20': 0.0,
              'P@200': 0.0,
              'P@30': 0.0,
              'P@5': 0.0,
              'P@500': 0.0,
   