## Requirements

In [5]:
import pandas as pd
import json
from dask import dataframe as dd
import numpy as np
import gensim
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.utils import simple_preprocess
from gensim.similarities import SparseMatrixSimilarity
from rank_bm25 import BM25Okapi  # BM25 library
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import faiss  # For approximate nearest neighbor search
import torch
from bm25s import tokenize, BM25
from typing import List

## Loading the Data

### Loading the data and query documents -- Run the 2 cells below for having a global dataframe

In [232]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


def preprocess_text(text):
    # Use regular expressions to split text into words
    words = simple_preprocess(text)
    # Remove stopwords and lemmatize
    return [lemmatizer.lemmatize(word) for word in words if word not in stop_words]


def load_and_preprocess_documents(file_path):
    print(f"Loading documents from {file_path}")
    ddf = dd.read_csv(file_path)
    print("Dropping missing rows...")
    # Drop rows with missing values in specified columns
    ddf = ddf.dropna(subset=["pmid", "title", "abstract"])
    print("Preprocessing text...")
    ddf["processed_text"] = ddf["abstract"].apply(preprocess_text, meta=("x", "object"))
    df = ddf.compute()
    return df


def load_golden_data(golden_file_path):
    print(f"Loading golden data from {golden_file_path}")
    with open(golden_file_path, "r") as f:
        golden_data = json.load(f)

    # Initialize lists to store the extracted data
    queries = []
    golden_docs = []
    golden_snippets = []

    # Iterate through each question in the JSON data
    for question in golden_data.get("questions", []):
        queries.append(question.get("body", ""))
        golden_docs.append(question.get("documents", []))
        golden_snippets.append(
            [snippet.get("text", "") for snippet in question.get("snippets", [])]
        )

    # Create a DataFrame from the extracted data
    df = pd.DataFrame(
        {
            "query": queries,
            "golden_docs": golden_docs,
            "golden_snippets": golden_snippets,
        }
    )

    return df

In [233]:
golden_file_path = "/Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/dataset/12B4_golden.json"
dataset_path = "/Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head_1m.csv"
df = load_and_preprocess_documents(dataset_path)
# For query lists of 1 query we should modify the class
query_df = load_golden_data(golden_file_path)
query_documents = query_df["query"]

Loading documents from /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head_1m.csv
Dropping missing rows...
Preprocessing text...
Loading golden data from /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/dataset/12B4_golden.json


#### Trial Board

In [150]:
# Ensure stopwords are loaded
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    # Use regular expressions to split text into words
    words = simple_preprocess(text)
    # Remove stopwords and lemmatize
    return [lemmatizer.lemmatize(word) for word in words if word not in stop_words]


def load_and_preprocess_documents(file_path):
    ddf = dd.read_csv(file_path)
    # Drop rows with missing values in specified columns
    ddf = ddf.dropna(subset=["pmid", "title", "abstract"])
    ddf["processed_text"] = ddf["abstract"].apply(preprocess_text, meta=("x", "object"))
    df = ddf.compute()
    return df


dictionary = Dictionary()
BoW_corpus = [
    dictionary.doc2bow(doc, allow_update=True) for doc in df["processed_text"]
]
tfidf = TfidfModel(BoW_corpus, smartirs="nfc")
tfidf_corpus = tfidf[BoW_corpus]
index = SparseMatrixSimilarity(tfidf_corpus, num_features=len(dictionary))

most_relevant_documents = []
for query in query_documents:
    relevant_documents = []
    query_doc = preprocess_text(query)
    # Convert the query to BoW format
    query_bow = dictionary.doc2bow(query_doc)
    # Transform the query to TF-IDF representation
    query_tfidf = tfidf[query_bow]
    # Compute similarities
    similarities = index[query_tfidf]
    sorted_docs = sorted(enumerate(similarities), key=lambda x: x[1], reverse=True)
    for doc_id, score in sorted_docs[:10]:
        relevant_documents.append(
            f"http://www.ncbi.nlm.nih.gov/pubmed/{df.iloc[doc_id]['pmid']}"
        )
    most_relevant_documents.append(relevant_documents)

## VSM Models

### TF-IDF Logarithmic
> lfu
* logarithmic frequency (l)
* idf (f)
* Pivoted unique normalization (u)

In [249]:
class TF_IDF_Gensim_Log:
    def __init__(self, method="lfu"):
        self.method = method
        self.dictionary = None
        self.tfidf = None
        self.index = None

    def preprocess_text(self, text):
        # Use regular expressions to split text into words
        words = simple_preprocess(text)
        # Remove stopwords and lemmatize
        return [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    def create_tfidf_model(self, processed_texts):
        """
        Create a TF-IDF model and similarity index from preprocessed texts.

        Parameters:
        processed_texts (list of list of str): Preprocessed documents, where each document is a list of tokens.
        """
        self.dictionary = Dictionary(processed_texts)
        bow_corpus = [self.dictionary.doc2bow(doc) for doc in processed_texts]
        self.tfidf = TfidfModel(bow_corpus, smartirs=self.method)
        tfidf_corpus = self.tfidf[bow_corpus]
        self.index = SparseMatrixSimilarity(
            tfidf_corpus, num_features=len(self.dictionary)
        )

    def get_most_relevant_documents(self, queries: List[List[str]], top_n=10):
        """
        Retrieve the most relevant documents for each query.

        Parameters:
        queries (list of list of str): List of queries, where each query is a list of preprocessed tokens.
        top_n (int): Number of top relevant documents to retrieve for each query.

        Returns:
        list of list of tuple: Each inner list contains tuples of (document_id, similarity_score) for the top_n documents.
        """
        most_relevant_documents = []
        most_relevant_snippets = []
        for query in queries:
            relevant_documents = []
            snippets = []
            query = self.preprocess_text(query)
            query_bow = self.dictionary.doc2bow(query)
            query_tfidf = self.tfidf[query_bow]
            similarities = self.index[query_tfidf]
            sorted_docs = sorted(
                enumerate(similarities), key=lambda x: x[1], reverse=True
            )
            for doc_id, score in sorted_docs[:top_n]:
                relevant_documents.append(
                    f"http://www.ncbi.nlm.nih.gov/pubmed/{df.iloc[doc_id]['pmid']}"
                )
                # Score can be added later!
            most_relevant_documents.append(relevant_documents)
            for documents in relevant_documents:
                pmid = int(documents.rsplit("/", 1)[-1])
                sentences = sent_tokenize(df[df.pmid == pmid].abstract.values[0])
                for sentence in sentences:
                    sentence_tokens = self.preprocess_text(sentence)
                    sentence_bow = self.dictionary.doc2bow(sentence_tokens)
                    sentence_tfidf = self.tfidf[sentence_bow]
                    snippet_score = sum(
                        score
                        for term_id, score in sentence_tfidf
                        if term_id
                        in [self.dictionary.token2id.get(token) for token in query]
                    )
                    snippets.append(
                        {
                            "text": sentence,
                            "source": pmid,
                            "score": snippet_score,
                        }
                    )
            top_snippets = sorted(snippets, key=lambda x: x["score"], reverse=True)[
                :top_n
            ]
            most_relevant_snippets.append(top_snippets)

        return most_relevant_documents, most_relevant_snippets

### TF-IDF Raw
> nfc
* raw term frequency (n)
* idf (f)
* cosine normalization (c)

In [251]:
class TF_IDF_Gensim_Raw:
    def __init__(self, method="nfc"):
        self.method = method
        self.dictionary = None
        self.tfidf = None
        self.index = None

    def preprocess_text(self, text):
        # Use regular expressions to split text into words
        words = simple_preprocess(text)
        # Remove stopwords and lemmatize
        return [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    def create_tfidf_model(self, processed_texts):
        """
        Create a TF-IDF model and similarity index from preprocessed texts.

        Parameters:
        processed_texts (list of list of str): Preprocessed documents, where each document is a list of tokens.
        """
        self.dictionary = Dictionary(processed_texts)
        bow_corpus = [self.dictionary.doc2bow(doc) for doc in processed_texts]
        self.tfidf = TfidfModel(bow_corpus, smartirs=self.method)
        tfidf_corpus = self.tfidf[bow_corpus]
        self.index = SparseMatrixSimilarity(
            tfidf_corpus, num_features=len(self.dictionary)
        )

    def get_most_relevant_documents(self, queries: List[List[str]], top_n=10):
        """
        Retrieve the most relevant documents for each query.

        Parameters:
        queries (list of list of str): List of queries, where each query is a list of preprocessed tokens.
        top_n (int): Number of top relevant documents to retrieve for each query.

        Returns:
        list of list of tuple: Each inner list contains tuples of (document_id, similarity_score) for the top_n documents.
        """
        most_relevant_documents = []
        most_relevant_snippets = []
        for query in queries:
            relevant_documents = []
            snippets = []
            query = self.preprocess_text(query)
            query_bow = self.dictionary.doc2bow(query)
            query_tfidf = self.tfidf[query_bow]
            similarities = self.index[query_tfidf]
            sorted_docs = sorted(
                enumerate(similarities), key=lambda x: x[1], reverse=True
            )
            for doc_id, score in sorted_docs[:top_n]:
                relevant_documents.append(
                    f"http://www.ncbi.nlm.nih.gov/pubmed/{df.iloc[doc_id]['pmid']}"
                )
                # Score can be added later!
            most_relevant_documents.append(relevant_documents)
            for documents in relevant_documents:
                pmid = int(documents.rsplit("/", 1)[-1])
                sentences = sent_tokenize(df[df.pmid == pmid].abstract.values[0])
                for sentence in sentences:
                    sentence_tokens = self.preprocess_text(sentence)
                    sentence_bow = self.dictionary.doc2bow(sentence_tokens)
                    sentence_tfidf = self.tfidf[sentence_bow]
                    snippet_score = sum(
                        score
                        for term_id, score in sentence_tfidf
                        if term_id
                        in [self.dictionary.token2id.get(token) for token in query]
                    )
                    snippets.append(
                        {
                            "text": sentence,
                            "source": pmid,
                            "score": snippet_score,
                        }
                    )
            top_snippets = sorted(snippets, key=lambda x: x["score"], reverse=True)[
                :top_n
            ]
            most_relevant_snippets.append(top_snippets)

        return most_relevant_documents, most_relevant_snippets

### TF-IDF Augmented
> afc
* augmented (a)
* idf (f)
* cosine normalization (c)

In [252]:
class TF_IDF_Gensim_Augmented:
    def __init__(self, method="afc"):
        self.method = method
        self.dictionary = None
        self.tfidf = None
        self.index = None

    def preprocess_text(self, text):
        # Use regular expressions to split text into words
        words = simple_preprocess(text)
        # Remove stopwords and lemmatize
        return [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    def create_tfidf_model(self, processed_texts):
        """
        Create a TF-IDF model and similarity index from preprocessed texts.

        Parameters:
        processed_texts (list of list of str): Preprocessed documents, where each document is a list of tokens.
        """
        self.dictionary = Dictionary(processed_texts)
        bow_corpus = [self.dictionary.doc2bow(doc) for doc in processed_texts]
        self.tfidf = TfidfModel(bow_corpus, smartirs=self.method)
        tfidf_corpus = self.tfidf[bow_corpus]
        self.index = SparseMatrixSimilarity(
            tfidf_corpus, num_features=len(self.dictionary)
        )

    def get_most_relevant_documents(self, queries: List[List[str]], top_n=10):
        """
        Retrieve the most relevant documents for each query.

        Parameters:
        queries (list of list of str): List of queries, where each query is a list of preprocessed tokens.
        top_n (int): Number of top relevant documents to retrieve for each query.

        Returns:
        list of list of tuple: Each inner list contains tuples of (document_id, similarity_score) for the top_n documents.
        """
        most_relevant_documents = []
        most_relevant_snippets = []
        for query in queries:
            relevant_documents = []
            snippets = []
            query = self.preprocess_text(query)
            query_bow = self.dictionary.doc2bow(query)
            query_tfidf = self.tfidf[query_bow]
            similarities = self.index[query_tfidf]
            sorted_docs = sorted(
                enumerate(similarities), key=lambda x: x[1], reverse=True
            )
            for doc_id, score in sorted_docs[:top_n]:
                relevant_documents.append(
                    f"http://www.ncbi.nlm.nih.gov/pubmed/{df.iloc[doc_id]['pmid']}"
                )
                # Score can be added later!
            most_relevant_documents.append(relevant_documents)
            for documents in relevant_documents:
                pmid = int(documents.rsplit("/", 1)[-1])
                sentences = sent_tokenize(df[df.pmid == pmid].abstract.values[0])
                for sentence in sentences:
                    sentence_tokens = self.preprocess_text(sentence)
                    sentence_bow = self.dictionary.doc2bow(sentence_tokens)
                    sentence_tfidf = self.tfidf[sentence_bow]
                    snippet_score = sum(
                        score
                        for term_id, score in sentence_tfidf
                        if term_id
                        in [self.dictionary.token2id.get(token) for token in query]
                    )
                    snippets.append(
                        {
                            "text": sentence,
                            "source": pmid,
                            "score": snippet_score,
                        }
                    )
            top_snippets = sorted(snippets, key=lambda x: x["score"], reverse=True)[
                :top_n
            ]
            most_relevant_snippets.append(top_snippets)

        return most_relevant_documents, most_relevant_snippets

In [200]:
# Initialize the DocumentSimilarity object
doc_sim = TF_IDF_Gensim_Log(method="lfu")
# Create the TF-IDF model and similarity index
doc_sim.create_tfidf_model(df["processed_text"])
# Retrieve the most relevant documents for each query
doc, snippets = doc_sim.get_most_relevant_documents(query_documents, top_n=10)

## Probabilistic Models

### BM25

In [13]:
class BM25ModelWithPandas:
    def __init__(self):
        # Initialize stop words and lemmatizer
        self.stop_words = set(stopwords.words("english"))
        self.lemmatizer = WordNetLemmatizer()

    def preprocess_text(self, text):
        """
        Preprocesses text by removing stopwords, applying lowercase, and lemmatization.
        If for_bm25 is True, returns a list of tokens for BM25. Otherwise, returns a single string.
        """
        tokens = text.lower().split()  # Tokenize and convert to lowercase
        tokens = [
            word for word in tokens if word.isalnum() and word not in self.stop_words
        ]  # Remove stop words and punctuation
        tokens = [
            self.lemmatizer.lemmatize(word) for word in tokens
        ]  # Apply lemmatization only
        return tokens

    def load_documents(self, file_path):
        """
        Loads CSV data using Pandas and preprocesses the `title` and `abstract` fields.
        Assumes the file has columns `pmid`, `title`, and `abstract`.
        """
        print(f"Loading documents from {file_path}")
        df = pd.read_csv(file_path)
        df = df.dropna(
            subset=["pmid", "title", "abstract"]
        )  # Drop rows with missing values

        # Combine title and abstract, and preprocess
        df["raw_text"] = df["title"] + " " + df["abstract"]
        df["preprocessed_text"] = df["raw_text"].apply(self.preprocess_text)

        return df

    def calculate_bm25_scores(self, query_tokens, documents):
        """
        Uses BM25 to calculate relevance scores between the query and all documents.
        """
        bm25 = BM25Okapi(list(documents["preprocessed_text"]))
        relevance_scores = bm25.get_scores(query_tokens)
        return relevance_scores

    def get_top_snippets(self, query_tokens, top_documents, top_n=10):
        """
        Extracts and ranks snippets globally from the top documents using BM25.
        """
        snippets = []

        for _, doc in top_documents.iterrows():
            pmid, abstract = doc["pmid"], doc["raw_text"]
            sentences = sent_tokenize(abstract)  # Split the abstract into sentences

            for sentence in sentences:
                preprocessed_sentence = self.preprocess_text(sentence)
                snippets.append(
                    {"text": sentence, "tokens": preprocessed_sentence, "source": pmid}
                )

        # Combine all snippets into a single list for BM25
        snippet_texts = [snippet["tokens"] for snippet in snippets]
        bm25 = BM25Okapi(snippet_texts)
        snippet_scores = bm25.get_scores(query_tokens)

        # Add scores to snippets and sort them globally
        for i, snippet in enumerate(snippets):
            snippet["score"] = snippet_scores[i]

        top_snippets = sorted(snippets, key=lambda x: x["score"], reverse=True)[:top_n]
        return top_snippets

    def get_relevant_documents_and_snippets(
        self, query, file_path, top_n_docs=10, top_n_snippets=10
    ):
        """
        Retrieves top N most relevant documents and globally ranks snippets from them.
        """
        # Load documents
        documents = self.load_documents(file_path)

        if len(documents) == 0:
            print("No valid documents found.")
            return [], []

        # Preprocess the query
        query_tokens = self.preprocess_text(query)

        # Calculate BM25 relevance scores
        relevance_scores = self.calculate_bm25_scores(query_tokens, documents)
        documents["score"] = relevance_scores

        # Get top N documents
        top_documents = documents.nlargest(top_n_docs, "score")

        # Rank snippets globally from top documents
        top_snippets = self.get_top_snippets(
            query_tokens, top_documents, top_n_snippets
        )

        # Return both top documents and snippets
        return top_documents, top_snippets

In [14]:
file_path = "/Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv"

# Define the query
query = "Effects of interferon on viral infections"

# Initialize the BM25 model
bm25_model = BM25ModelWithPandas()

# Retrieve top documents and snippets
top_documents, top_snippets = bm25_model.get_relevant_documents_and_snippets(
    query=query, file_path=file_path, top_n_docs=10, top_n_snippets=10
)

# Print the top documents
print("\nTop Documents:")
for _, doc in top_documents.iterrows():
    print(f"PMC ID: {doc['pmid']}, Score: {doc['score']:.4f}")

# Print the top snippets
print("\nTop Snippets:")
for snippet in top_snippets:
    print(f"Source: {snippet['source']}")
    print(f"Snippet: {snippet['text']}")
    print(f"Score: {snippet['score']:.4f}\n")

Loading documents from /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv

Top Documents:
PMC ID: 2143, Score: 13.0717
PMC ID: 1126, Score: 9.4150
PMC ID: 2142, Score: 8.3898
PMC ID: 1990, Score: 6.2001
PMC ID: 938, Score: 6.1988
PMC ID: 884, Score: 5.9103
PMC ID: 1468, Score: 5.7332
PMC ID: 1628, Score: 5.7323
PMC ID: 1792, Score: 5.7181
PMC ID: 1629, Score: 5.5253

Top Snippets:
Source: 2143
Snippet: Rapid cooling and sudden freezing decreased the residual activities of interferons at pH 2 and 9 more than "normal" cooling, an effect not observed at pH 7.
Score: 4.3776

Source: 1792
Snippet: Effect of environmental pH on adenovirus-associated virus.
Score: 4.1220

Source: 2143
Snippet: The stabilizing effect of pH during heating on interferon in solution was greatest at low pH, such that pH 2 greater than pH 5 greater than pH 7 greater than or equal to pH 9; freeze-dried preparations of interferon were also more heat-stable at acid 

### BM25S- BM25 with Sparse Matrix

In [39]:
class BM25SDocumentRetriever:
    def __init__(self):
        """
        Initializes the BM25SnippetRetriever with the dataset file path.
        """
        self.df = None
        self.retriever = None
        self.corpus_tokens = None

    def load_and_preprocess_documents(self, file_path):
        """
        Loads the dataset and preprocesses the text.
        """
        print("Loading dataset...")
        self.df = pd.read_csv(file_path)
        self.df["text"] = self.df["title"] + " " + self.df["abstract"]
        self.corpus_tokens = tokenize(self.df["text"].tolist())

    def build_corpus_index(self):
        """
        Initializes the BM25 retriever and indexes the tokenized corpus.
        """
        print("Building corpus index...")
        self.retriever = BM25()
        self.retriever.index(self.corpus_tokens)

    def retrieve_top_documents(self, query, k=10):
        """
        Retrieves the top-k most relevant documents for the given query.
        """
        print("Retrieving top documents...")
        query_tokens = tokenize(query)
        docs, scores = self.retriever.retrieve(query_tokens, k=k)

        # Map document indices back to the dataset
        top_docs_indices = docs[0]
        top_docs_scores = scores[0]

        # Extract the top documents with their scores
        top_documents = [
            {
                "text": self.df.iloc[idx]["text"],
                "score": score,
                "pmid": self.df.iloc[idx]["pmid"],
            }
            for idx, score in zip(top_docs_indices, top_docs_scores)
        ]
        return top_documents

    def retrieve_top_snippets(self, query, top_documents, k=10):
        """
        Retrieves the top-k most relevant snippets globally from the given top documents.
        """
        print("Retrieving top snippets...")
        query_tokens = tokenize(query)

        # Consolidate all snippets with document IDs
        all_snippets = []
        for doc in top_documents:
            sentences = sent_tokenize(
                doc["text"]
            )  # Split the document into sentences (snippets)
            for sentence in sentences:
                all_snippets.append({"text": sentence, "source": doc["pmid"]})

        # Tokenize all snippets
        snippet_tokens = tokenize([snippet["text"] for snippet in all_snippets])

        # Re-index the BM25 retriever with snippets
        self.retriever.index(snippet_tokens)

        # Retrieve the top k most relevant snippets globally
        snippet_docs, snippet_scores = self.retriever.retrieve(query_tokens, k=k)

        # Extract the top k snippets with scores and source document IDs
        top_snippets = [
            {
                "text": all_snippets[idx]["text"],
                "score": snippet_scores[0, i],
                "source": all_snippets[idx]["source"],
            }
            for i, idx in enumerate(snippet_docs[0])
        ]
        return top_snippets

    def get_relevant_documents_and_snippets(
        self, query, file_path, top_n_docs=10, top_n_snippets=10
    ):
        """
        Main method to retrieve top documents and top snippets for a given query.
        """
        # Load the dataset and build the index
        self.load_and_preprocess_documents(file_path)
        self.build_corpus_index()

        # Retrieve top documents
        top_documents = self.retrieve_top_documents(query, k=top_n_docs)

        # Retrieve top snippets
        top_snippets = self.retrieve_top_snippets(
            query, top_documents, k=top_n_snippets
        )

        return top_documents, top_snippets

In [40]:
file_path = "/Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv"
query = "Effects of interferon on viral infections"

bm25_model = BM25SDocumentRetriever()
top_documents, top_snippets = bm25_model.get_relevant_documents_and_snippets(
    query=query, file_path=file_path, top_n_docs=10, top_n_snippets=10
)
print("\nTop 10 Snippets:")
for snippet in top_snippets:
    print(f"Source Document: {snippet['source']}")
    print(f"Snippet: {snippet['text']}")
    print(f"Score: {snippet['score']:.2f}")
    print("-" * 80)
# Print top documents
print("\nTop 10 Documents:")
for doc in top_documents:
    print(f"PMC ID: {doc['pmid']}, Score: {doc['score']:.2f}")
    print(f"Text: {doc['text'][:200]}...")  # Print a snippet of the document text
    print("-" * 80)

Loading dataset...


                                                      

Building corpus index...


                                                             

Retrieving top documents...


                                                     

Retrieving top snippets...


                                                            


Top 10 Snippets:
Source Document: 884
Snippet: Comparative study of virological infections in asthmatic and nonasthmatic children.
Score: 1.39
--------------------------------------------------------------------------------
Source Document: 331
Snippet: Thus, pneumococci exert several dose-dependent thromboplastic effects: (i) release of platelet thromboplastic substances; (ii) a direct thromboplastic effect; and (iii) release of polymorphonuclear coagulant.
Score: 1.36
--------------------------------------------------------------------------------
Source Document: 1604
Snippet: The significance of mosquito longevity and blood-feeding behaviour in the dynamics of arbovirus infections.
Score: 1.29
--------------------------------------------------------------------------------
Source Document: 1468
Snippet: The mixtures of viral fragments exhibited an increased deacetylase activity.
Score: 1.23
--------------------------------------------------------------------------------
Source Doc



## Semantic Retrieval Models

### Semantic IR Model- With Speed Ups

In [17]:
class SemanticRetrievalModelWithPandasOptimized:
    def __init__(
        self, model_name="sentence-transformers/paraphrase-MiniLM-L3-v2", use_gpu=True
    ):
        """
        Initialize the semantic model using a pre-trained Sentence-BERT model.
        """
        device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
        self.embedding_model = SentenceTransformer(model_name, device=device)

    def preprocess_documents(self, file_path):
        """
        Loads and preprocesses documents using Pandas. Assumes the CSV file has columns `pmid`, `title`, and `abstract`.
        """
        print(f"Loading documents from {file_path}")
        df = pd.read_csv(file_path)
        df.dropna(
            subset=["pmid", "title", "abstract"], inplace=True
        )  # Drop rows with missing values

        # Combine title and abstract into a single text column
        df["text"] = df["title"] + " " + df["abstract"]
        return df

    def encode_batch_text(self, texts):
        """
        Encodes a batch of texts into embeddings using the Sentence-BERT model.
        """
        embeddings = self.embedding_model.encode(
            texts, convert_to_tensor=True, batch_size=32
        )
        return embeddings.cpu()  # Ensure embeddings are moved to CPU

    def build_faiss_index(self, embeddings):
        """
        Builds a FAISS index for approximate nearest neighbor search.
        """
        embeddings_np = np.array(embeddings)  # Convert embeddings to NumPy array
        d = embeddings_np.shape[1]  # Dimension of embeddings
        index = faiss.IndexFlatL2(d)
        index.add(embeddings_np)  # Add embeddings to the index
        return index

    def search_faiss_index(self, index, query_embedding, top_k=10):
        """
        Searches the FAISS index for the top-k most similar embeddings.
        """
        query_embedding_np = (
            query_embedding.cpu().numpy().reshape(1, -1)
        )  # Move query embedding to CPU
        distances, indices = index.search(query_embedding_np, top_k)
        return distances[0], indices[0]

    def filter_documents(self, query, documents, embeddings, top_n_docs=10):
        """
        Filters documents using approximate semantic similarity.
        """
        query_embedding = self.encode_batch_text([query])
        index = self.build_faiss_index(embeddings)
        distances, indices = self.search_faiss_index(index, query_embedding, top_n_docs)

        top_documents = [
            {
                "pmc_id": documents.iloc[i]["pmid"],
                "text": documents.iloc[i]["text"],
                "score": 1
                / (1 + distances[idx]),  # Inverse scaling to normalize distance
                "url": f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{documents.iloc[i]['pmid']}/",
            }
            for idx, i in enumerate(indices)
        ]
        return top_documents, query_embedding

    def rank_snippets(self, query_embedding, top_documents, top_n_snippets=10):
        """
        Ranks snippets globally based on semantic similarity.
        """
        snippets = []
        for doc in top_documents:
            sentences = sent_tokenize(doc["text"])  # Split the document into sentences
            snippet_embeddings = self.encode_batch_text(sentences)
            snippet_scores = cosine_similarity(
                query_embedding.cpu().numpy(), snippet_embeddings.cpu().numpy()
            )[0]

            for i, sentence in enumerate(sentences):
                snippets.append(
                    {
                        "text": sentence,
                        "source": doc["pmc_id"],
                        "score": snippet_scores[i],
                    }
                )

        # Sort snippets globally by score
        top_snippets = sorted(snippets, key=lambda x: x["score"], reverse=True)[
            :top_n_snippets
        ]
        return top_snippets

    def get_relevant_documents_and_snippets(
        self, query, file_path, top_n_docs=10, top_n_snippets=10
    ):
        """
        Retrieves top N most relevant documents and globally ranks snippets from them.
        """
        # Load documents with Pandas
        df = self.preprocess_documents(file_path)

        if len(df) == 0:
            print("No valid documents found.")
            return [], []

        # Encode document embeddings in batches
        print("Encoding document embeddings...")
        embeddings = np.array(
            [
                embedding.cpu().numpy()
                for embedding in self.encode_batch_text(df["text"].tolist())
            ]
        )

        # Filter top documents using semantic similarity
        print("Filtering top documents...")
        top_documents, query_embedding = self.filter_documents(
            query, df, embeddings, top_n_docs=top_n_docs
        )

        # Rank snippets globally
        print("Ranking snippets globally...")
        top_snippets = self.rank_snippets(
            query_embedding, top_documents, top_n_snippets=top_n_snippets
        )

        return top_documents, top_snippets

In [18]:
# Initialize the model
model = SemanticRetrievalModelWithPandasOptimized()

# Define the query and CSV file path
query = "Multiple sclerosis treatments using interferon"
file_path = "/Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv"

# Retrieve top documents and snippets
top_documents, top_snippets = model.get_relevant_documents_and_snippets(
    query=query, file_path=file_path, top_n_docs=10, top_n_snippets=10
)

# Print the top 10 documents
print("\nTop 10 Documents:")
for doc in top_documents:
    print(f"PMC ID: {doc['pmc_id']}, URL: {doc['url']}, Score: {doc['score']:.4f}")

# Print the top 10 snippets
print("\nTop 10 Snippets:")
for snippet in top_snippets:
    print(f"Source: https://www.ncbi.nlm.nih.gov/pmc/articles/{snippet['source']}/")
    print(f"Snippet: {snippet['text']}")
    print(f"Score: {snippet['score']:.4f}\n")

Loading documents from /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv
Encoding document embeddings...
Filtering top documents...
Ranking snippets globally...

Top 10 Documents:
PMC ID: 1295, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1295/, Score: 0.0398
PMC ID: 2143, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2143/, Score: 0.0397
PMC ID: 1628, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1628/, Score: 0.0389
PMC ID: 1990, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1990/, Score: 0.0379
PMC ID: 1996, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1996/, Score: 0.0379
PMC ID: 725, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC725/, Score: 0.0373
PMC ID: 1801, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1801/, Score: 0.0371
PMC ID: 2396, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2396/, Score: 0.0371
PMC ID: 1296, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1296/, Score: 0.0364
PMC

## Time Commplexity

In [32]:
import time

# Path to the CSV file
csv_file_path = "/Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv"

# Query to use for comparison
query = "Multiple sclerosis treatments using interferon"

# Number of documents and snippets to retrieve
top_n_docs = 10
top_n_snippets = 10

# List of models to compare
models = [
    {
        "name": "QuerySpecificTFIDFModelLogarithmic",
        "class": QuerySpecificTFIDFModelLogarithmic,
    },
    {
        "name": "QuerySpecificTFIDFModelRaw",
        "class": QuerySpecificTFIDFModelRaw,
    },
    {
        "name": "QuerySpecificTFIDFModelAugmented",
        "class": QuerySpecificTFIDFModelAugmented,
    },
    {
        "name": "BM25ModelWithPandas",
        "class": BM25ModelWithPandas,
    },
    {
        "name": "SemanticRetrievalModelWithPandasOptimized",
        "class": SemanticRetrievalModelWithPandasOptimized,
    },
    {
        "name": "BM25SparseMatrixVersion",
        "class": BM25SDocumentRetriever,
    },
]


# Function to measure execution time of a model
def measure_execution_time(
    model_class, csv_file_path, query, top_n_docs, top_n_snippets
):
    model = model_class()  # Instantiate the model
    start_time = time.time()
    model.get_relevant_documents_and_snippets(
        query, csv_file_path, top_n_docs=top_n_docs, top_n_snippets=top_n_snippets
    )
    end_time = time.time()
    elapsed_time = end_time - start_time
    return elapsed_time


# Compare the models
results = []
for model_info in models:
    model_name = model_info["name"]
    model_class = model_info["class"]
    print(f"Evaluating {model_name}...")
    elapsed_time = measure_execution_time(
        model_class, csv_file_path, query, top_n_docs, top_n_snippets
    )
    results.append({"Model": model_name, "Time (seconds)": elapsed_time})

# Print results
print("\nComparison Results:")
for result in results:
    print(
        f"Model: {result['Model']}, Time Taken: {result['Time (seconds)']:.2f} seconds"
    )

Evaluating QuerySpecificTFIDFModelLogarithmic...
Loading documents from /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv
Building TF-IDF model...
Calculating relevance scores...
Retrieving top documents...
Ranking snippets globally...
Evaluating QuerySpecificTFIDFModelRaw...
Loading documents from /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv
Building TF-IDF model...
Calculating relevance scores...
Retrieving top documents...
Ranking snippets globally...
Evaluating QuerySpecificTFIDFModelAugmented...
Loading documents from /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv
Building TF-IDF model...
Calculating relevance scores...
Retrieving top documents...
Ranking snippets globally...
Evaluating BM25ModelWithPandas...
Loading documents from /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_a

                                                      

Building corpus index...


                                                             

Retrieving top documents...


                                                     


Top 10 Documents:
PMC ID: 2143, Score: 6.41
Text: The influence of physicochemical factors on the thermal inactivation of murine interferon. The degradation of biological activity of virus-induced murine interferon was determined in linear nonisother...
--------------------------------------------------------------------------------
PMC ID: 1990, Score: 4.91
Text: Correlation between molecular size and interferon- inducing activity of poly I:C. Electron microscopy showed that commerical poly I: C consisted of molecules varying in length from less than 0.05 nm t...
--------------------------------------------------------------------------------
PMC ID: 537, Score: 2.77
Text: Multiple cyclic nucleotide phosphodiesterases in rat kidney. Using DEAE-cellulose chromatography and Agarose gel filtration we have partially purified a low Km cyclic adenosine monophosphate (AMP) pho...
--------------------------------------------------------------------------------
PMC ID: 159, Score: 2.60
Text: 

                                                            


Top 10 Snippets:
Source Document: 2143
Snippet: The degradation of biological activity of virus-induced murine interferon was determined in linear nonisothermal and multiple isothermal tests.
Score: 1.69
--------------------------------------------------------------------------------
Source Document: 1268
Snippet: The alterations of the RNA molecules due to the various treatments were monitored by sucrose density gradients.
Score: 1.44
--------------------------------------------------------------------------------
Source Document: 451
Snippet: Our studies suggest the possibility of using Limulus hemocyanin and other hemocyanins as structural homologs and analogs of more complex macromolecular arrays.
Score: 1.23
--------------------------------------------------------------------------------
Source Document: 377
Snippet: Comparative studies on multiple forms of cathepsin A.
Score: 1.18
--------------------------------------------------------------------------------
Source Document: 1



## Evaluation

In [237]:
def extract_ids(url_list):
    for url in url_list:
        yield url.rsplit("/", 1)[-1]  # Extract PubMed ID from the URL


def calculate_macro_metrics(retrieved_docs_list, golden_docs_list):
    """
    Calculate Macro-Averaged Precision, Recall, and F1-Score for multiple queries.

    Parameters:
    retrieved_docs_list (list of lists): A list where each element is a list of retrieved document URLs for a query.
    golden_docs_list (list of lists): A list where each element is a list of golden document URLs for a query.

    Returns:
    dict: Macro precision, recall, and F1-score.
    """
    # Initialize lists to store per-query metrics
    macro_precision = []
    macro_recall = []
    macro_f1 = []

    for retrieved_docs, golden_docs in zip(retrieved_docs_list, golden_docs_list):
        # Convert lists to sets for efficient comparison
        retrieved_set = set(extract_ids(retrieved_docs))
        golden_set = set(extract_ids(golden_docs))

        # Compute intersection
        intersection = retrieved_set & golden_set

        # Metrics for the current query
        precision = len(intersection) / len(retrieved_set) if retrieved_set else 0
        recall = len(intersection) / len(golden_set) if golden_set else 0
        f1 = (
            (2 * precision * recall) / (precision + recall)
            if (precision + recall) > 0
            else 0
        )

        # Store the metrics for macro-averaging
        macro_precision.append(precision)
        macro_recall.append(recall)
        macro_f1.append(f1)

    # Macro-Averaged Metrics
    macro_avg_precision = (
        sum(macro_precision) / len(macro_precision) if macro_precision else 0
    )
    macro_avg_recall = sum(macro_recall) / len(macro_recall) if macro_recall else 0
    macro_avg_f1 = sum(macro_f1) / len(macro_f1) if macro_f1 else 0

    return {
        "macro_avg_precision": macro_avg_precision,
        "macro_avg_recall": macro_avg_recall,
        "macro_avg_f1": macro_avg_f1,
    }

### Document Evaluation

#### Logarithmic VSM

In [253]:
# Initialize the DocumentSimilarity object
doc_sim = TF_IDF_Gensim_Log(method="lfu")
# Create the TF-IDF model and similarity index
doc_sim.create_tfidf_model(df["processed_text"])
# Retrieve the most relevant documents for each query
doc, snippets = doc_sim.get_most_relevant_documents(query_documents, top_n=10)

In [254]:
calculate_macro_metrics(doc, query_df["golden_docs"].to_list())

{'macro_avg_precision': 0.0058823529411764705,
 'macro_avg_recall': 0.0019411764705882354,
 'macro_avg_f1': 0.0029131652661064425}

#### Raw VSM

In [None]:
# Initialize the DocumentSimilarity object
doc_sim = TF_IDF_Gensim_Raw(method="nfc")
# Create the TF-IDF model and similarity index
doc_sim.create_tfidf_model(df["processed_text"])
# Retrieve the most relevant documents for each query
doc, snippets = doc_sim.get_most_relevant_documents(query_documents, top_n=10)
calculate_macro_metrics(doc, query_df["golden_docs"].to_list())

In [None]:
calculate_macro_metrics(doc, query_df["golden_docs"].to_list())

#### Augmented VSM

In [None]:
# Initialize the DocumentSimilarity object
doc_sim = TF_IDF_Gensim_Augmented(method="afc")
# Create the TF-IDF model and similarity index
doc_sim.create_tfidf_model(df["processed_text"])
# Retrieve the most relevant documents for each query
doc, snippets = doc_sim.get_most_relevant_documents(query_documents, top_n=10)
calculate_macro_metrics(doc, query_df["golden_docs"].to_list())

In [None]:
calculate_macro_metrics(doc, query_df["golden_docs"].to_list())