# Search

**TODO:** Add a language filter so that searches only return topics in the selected language.

## Setup

In [None]:
from health_topic_index.analysis.parsing import XmlParser

In [None]:
# Location of the health-topics XML export, relative to this notebook.
xml_file_path = '../data/mplus_topics_full.xml'

# Parse the XML into nested dictionaries; the file is read in binary mode.
with open(xml_file_path, mode='rb') as file:
    document_dict = XmlParser.parse_file(file)

# Drill down to the collection of individual topic records.
# NOTE(review): assumes 'health-topic' holds a list of dict-like records — confirm against XmlParser.
health_topics_dict = document_dict['health-topics']
health_topics = health_topics_dict['health-topic']

# Summary text of every topic, in file order (this is the search corpus).
documents = [ht['full-summary'] for ht in health_topics]

# (id, summary) pairs aligned position-by-position with `documents`.
id_and_summary = [(ht['id'], doc) for ht, doc in zip(health_topics, documents)]

# Lookup table from topic id to its title.
id_to_title = dict((ht['id'], ht['title']) for ht in health_topics)

In [None]:
# Search text shared by every retrieval method in this notebook.
query: str = "feces"

## Cosine Similarity

In [None]:
from sentence_transformers import SentenceTransformer, util

# Name of the sentence-transformer model used to embed summaries and queries.
MODEL_NAME = "all-MiniLM-L6-v2"

# Load the model; downloaded weights are cached in a local folder.
model = SentenceTransformer(MODEL_NAME, cache_folder=".cache_models")

# Embed every topic summary once so that queries only need a single extra encode call.
# `documents` holds the same summaries, in the same order, as `id_and_summary`.
summary_embeddings = model.encode(documents)

In [None]:
# Minimum cosine similarity a summary must exceed to be reported as a match.
similarity_threshold = 0.3

# Embed the query and compare it against every pre-computed summary embedding.
# The similarity result has one row per query; [0] selects the row for our single query.
query_embedding = model.encode(query)
cosine_scores = util.pytorch_cos_sim(query_embedding, summary_embeddings)[0].tolist()

# Positions (in document order) of the summaries that clear the threshold.
valid_indices = [i for i, score in enumerate(cosine_scores) if score > similarity_threshold]

# Index directly instead of scanning `valid_indices` for every document.
valid_ids = [id_and_summary[i][0] for i in valid_indices]

# Set membership keeps the topic filter linear in the number of topics.
valid_id_set = set(valid_ids)
valid_hts = [ht for ht in health_topics if ht['id'] in valid_id_set]

# Print each matching title followed by a blank line.
for ht in valid_hts:
    print(ht['title'])
    print()

## BM25

In [None]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd

# Lowercase and tokenize each summary, then index the token lists with BM25.
# NOTE(review): word_tokenize relies on NLTK tokenizer data being installed; no download
# call for it is visible in this notebook — confirm it is available on a fresh environment.
tokenized_docs = [word_tokenize(text.lower()) for text in documents]
bm25 = BM25Okapi(tokenized_docs)

In [None]:
# Score every document against the lowercased, tokenized query.
tokenized_query = word_tokenize(query.lower())
scores = bm25.get_scores(tokenized_query)

# Documents scoring below this BM25 value are left out of the table.
threshold = 3.3

# Document positions ordered from highest to lowest score.
sorted_indices = np.argsort(scores)[::-1]

# Preserve the ranking order, keeping only documents that reach the threshold.
filtered_sorted_indices = [doc_idx for doc_idx in sorted_indices if scores[doc_idx] >= threshold]

# Collect the id, score and title of each surviving document, still in rank order.
sorted_ids = [id_and_summary[doc_idx][0] for doc_idx in filtered_sorted_indices]
sorted_scores = [scores[doc_idx] for doc_idx in filtered_sorted_indices]
sorted_titles = [id_to_title[topic_id] for topic_id in sorted_ids]

# Show the ranked matches as a table (last expression of the cell is displayed).
pd.DataFrame({"Title": sorted_titles, "Score": sorted_scores})

### Full results

In [None]:
# Same ranking as above but without the threshold: every document, best score first.
sorted_ids = [id_and_summary[doc_idx][0] for doc_idx in sorted_indices]
sorted_scores = [scores[doc_idx] for doc_idx in sorted_indices]
sorted_titles = [id_to_title[topic_id] for topic_id in sorted_ids]

# Keep the complete ranking in `df`; the cell assigns it without displaying it.
df = pd.DataFrame({"Title": sorted_titles, "Score": sorted_scores})

## BM25 + WordNet

In [None]:
# WordNet language code used for synonym lookup: 'eng' for English, 'spa' for Spanish.
language: str = 'eng'

In [None]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
import nltk
import numpy as np
import pandas as pd

# Fetch the WordNet corpus and the Open Multilingual WordNet data used for synonym lookup.
nltk.download('wordnet')
nltk.download('omw-1.4')

# Rebuild the BM25 index over the lowercased, tokenized summaries for this section.
tokenized_docs = [word_tokenize(text.lower()) for text in documents]
bm25 = BM25Okapi(tokenized_docs)

In [None]:
def expand_with_synonyms(tokenized_query, lang=None):
    """Expand a tokenized query with WordNet synonyms of each token.

    Args:
        tokenized_query: Iterable of query tokens (strings).
        lang: WordNet language code forwarded to ``wn.synsets`` and
            ``Synset.lemmas``. When omitted, the notebook-level ``language``
            setting is used, which matches the previous behavior.

    Returns:
        A sorted list of unique terms: the original tokens plus the
        lowercased lemma names of every synset found for them.
    """
    if lang is None:
        lang = language

    # Materialize once so the input can be iterated twice even if it is a generator.
    tokens = list(tokenized_query)
    expanded_query = set(tokens)

    for token in tokens:
        for syn in wn.synsets(token, lang=lang):
            for lemma in syn.lemmas(lang=lang):
                # Lemma names use "_" between words and may be capitalized (e.g. "BM");
                # lowercase them so they can match the lowercased document tokens.
                # NOTE(review): multi-word lemmas stay as one term and are unlikely to
                # match single-word document tokens — consider splitting them.
                expanded_query.add(lemma.name().replace("_", " ").lower())

    # Sorted output keeps the term order stable across runs.
    return sorted(expanded_query)

In [None]:
# Tokenize the lowercased query, add WordNet synonyms, and score every document.
tokenized_query = word_tokenize(query.lower())
expanded_query = expand_with_synonyms(tokenized_query)
scores = bm25.get_scores(expanded_query)

# Documents scoring below this BM25 value are left out of the table.
# TODO: consider listing every document with a score above zero, in order, instead.
threshold = 3.3

# Document positions ordered from highest to lowest score.
sorted_indices = np.argsort(scores)[::-1]

# Preserve the ranking order, keeping only documents that reach the threshold.
filtered_sorted_indices = [doc_idx for doc_idx in sorted_indices if scores[doc_idx] >= threshold]

# Collect the id, score and title of each surviving document, still in rank order.
sorted_ids = [id_and_summary[doc_idx][0] for doc_idx in filtered_sorted_indices]
sorted_scores = [scores[doc_idx] for doc_idx in filtered_sorted_indices]
sorted_titles = [id_to_title[topic_id] for topic_id in sorted_ids]

# Show the ranked matches as a table (last expression of the cell is displayed).
pd.DataFrame({"Title": sorted_titles, "Score": sorted_scores})

### Full results

In [None]:
# Same ranking as above but without the threshold: every document, best score first.
sorted_ids = [id_and_summary[doc_idx][0] for doc_idx in sorted_indices]
sorted_scores = [scores[doc_idx] for doc_idx in sorted_indices]
sorted_titles = [id_to_title[topic_id] for topic_id in sorted_ids]

# Keep the complete ranking in `df`; the cell assigns it without displaying it.
df = pd.DataFrame({"Title": sorted_titles, "Score": sorted_scores})