## Requirements

In [None]:
import pandas as pd
import numpy as np
import gensim
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity
from rank_bm25 import BM25Okapi  # BM25 library
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import faiss  # For approximate nearest neighbor search
import torch
from bm25s import tokenize, BM25

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/onurcanmemis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/onurcanmemis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/onurcanmemis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## VSM Models

### TF-IDF Logarithmic
> lfu
* logarithmic frequency (l)
* idf (f)
* Pivoted unique normalization (u)

In [7]:
class QuerySpecificTFIDFModelLogarithmic:
    def __init__(self):
        # Initialize stop words and lemmatizer
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.dictionary = None
        self.tfidf_model = None
        self.index = None

    def preprocess_text(self, text):
        """
        Preprocesses text by tokenizing, removing stopwords, and lemmatizing.
        """

        ######## THIS PART IS CHANGED FROM word_tokenize to .split() it is 2x faster!!!
        ############################

        
        tokens = text.lower().split()  # Tokenize and convert to lowercase
        tokens = [word for word in tokens if word.isalnum() and word not in self.stop_words]  # Remove stop words and punctuation
        tokens = [self.lemmatizer.lemmatize(word) for word in tokens]  # Apply lemmatization
        return tokens

    def load_and_preprocess_documents(self, file_path):
        """
        Loads and preprocesses documents from a CSV file.
        """
        print(f"Loading documents from {file_path}")
        df = pd.read_csv(file_path)
        df = df.dropna(subset=["pmid", "title", "abstract"])  # Drop rows with missing values

        # Combine title and abstract, and preprocess
        df["raw_text"] = df["title"] + " " + df["abstract"]
        df["tokens"] = df["raw_text"].apply(self.preprocess_text)
        return df

    def build_tfidf_model(self, documents):
        """
        Builds the TF-IDF model and similarity index using Gensim.
        """
        print("Building TF-IDF model...")
        self.dictionary = Dictionary(documents["tokens"])  # Create a Gensim dictionary
        corpus = [self.dictionary.doc2bow(tokens) for tokens in documents["tokens"]]  # Convert to bag-of-words format
        self.tfidf_model = TfidfModel(corpus,smartirs='lfu')  # Build the TF-IDF model
        self.index = SparseMatrixSimilarity(self.tfidf_model[corpus], num_features=len(self.dictionary))  # Build similarity index
        return corpus

    def calculate_relevance(self, query, corpus):
        """
        Calculates relevance scores for the query against the corpus.
        """
        print("Calculating relevance scores...")
        query_tokens = self.preprocess_text(query)
        query_bow = self.dictionary.doc2bow(query_tokens)  # Convert query to bag-of-words
        query_tfidf = self.tfidf_model[query_bow]  # Convert query to TF-IDF
        similarities = self.index[query_tfidf]  # Compute similarities
        return similarities

    def rank_snippets(self, query, top_documents, top_n_snippets=10):
        """
        Extracts and ranks snippets globally based on similarity to the query.
        """
        print("Ranking snippets globally...")
        snippets = []
        query_tokens = self.preprocess_text(query)

        for _, doc in top_documents.iterrows():
            pmid, text = doc["pmid"], doc["raw_text"]
            sentences = sent_tokenize(text)
            for sentence in sentences:
                sentence_tokens = self.preprocess_text(sentence)
                sentence_bow = self.dictionary.doc2bow(sentence_tokens)
                sentence_tfidf = self.tfidf_model[sentence_bow]
                snippet_score = sum(
                    score for term_id, score in sentence_tfidf if term_id in [self.dictionary.token2id.get(token) for token in query_tokens]
                )
                snippets.append({
                    "text": sentence,
                    "source": pmid,
                    "score": snippet_score,
                })

        # Sort snippets globally
        top_snippets = sorted(snippets, key=lambda x: x["score"], reverse=True)[:top_n_snippets]
        return top_snippets

    def get_relevant_documents_and_snippets(self, query, file_path, top_n_docs=10, top_n_snippets=10):
        """
        Retrieves the top N relevant documents and globally ranked snippets.
        """
        # Load and preprocess documents
        documents = self.load_and_preprocess_documents(file_path)
        if len(documents) == 0:
            print("No valid documents found.")
            return [], []

        # Build the TF-IDF model
        corpus = self.build_tfidf_model(documents)

        # Calculate relevance scores
        relevance_scores = self.calculate_relevance(query, corpus)

        # Retrieve the indices of the top N documents
        print("Retrieving top documents...")
        top_indices = np.argsort(relevance_scores)[-top_n_docs:][::-1]  # Get indices of top N scores in descending order

        # Create a DataFrame with the top N documents
        top_documents = documents.iloc[top_indices].copy()
        top_documents["score"] = [relevance_scores[idx] for idx in top_indices]

        # Rank snippets globally
        top_snippets = self.rank_snippets(query, top_documents, top_n_snippets)

        return top_documents, top_snippets

In [8]:
file_path = "/Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv"
query = "Effects of interferon on viral infections"

# Instantiate the model
model = QuerySpecificTFIDFModelLogarithmic()

# Get relevant documents and snippets
top_documents, top_snippets = model.get_relevant_documents_and_snippets(query, file_path, top_n_docs=10, top_n_snippets=10)

# Display results
print("Top Documents:")
for doc in top_documents.to_dict('records'):
    print(f"PMID: {doc['pmid']}, Score: {doc['score']:.4f}")

print("\nTop Snippets:")
for snippet in top_snippets:
    print(f"Snippet: {snippet['text']}, Score: {snippet['score']:.4f}")

Loading documents from /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv
Building TF-IDF model...
Calculating relevance scores...
Retrieving top documents...
Ranking snippets globally...
Top Documents:
PMID: 2143, Score: 0.3345
PMID: 1126, Score: 0.1153
PMID: 1990, Score: 0.1044
PMID: 2142, Score: 0.0994
PMID: 1629, Score: 0.0837
PMID: 1468, Score: 0.0811
PMID: 938, Score: 0.0750
PMID: 884, Score: 0.0728
PMID: 2145, Score: 0.0690
PMID: 1792, Score: 0.0619

Top Snippets:
Snippet: The stabilizing effect of pH during heating on interferon in solution was greatest at low pH, such that pH 2 greater than pH 5 greater than pH 7 greater than or equal to pH 9; freeze-dried preparations of interferon were also more heat-stable at acid pH than at neutral pH., Score: 0.4212
Snippet: Interferon heated to 80degree C could not be reactivated at 40degree C or 55degree C. Interferon of higher apparent molecular weight was more heat-stable than that 

### TF-IDF Raw
> nfc
* raw term frequency (n)
* idf (f)
* cosine normalization (c)

In [9]:
class QuerySpecificTFIDFModelRaw:
    def __init__(self):
        # Initialize stop words and lemmatizer
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.dictionary = None
        self.tfidf_model = None
        self.index = None

    def preprocess_text(self, text):
        """
        Preprocesses text by tokenizing, removing stopwords, and lemmatizing.
        """
        tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
        tokens = [word for word in tokens if word.isalnum() and word not in self.stop_words]  # Remove stop words and punctuation
        tokens = [self.lemmatizer.lemmatize(word) for word in tokens]  # Apply lemmatization
        return tokens

    def load_and_preprocess_documents(self, file_path):
        """
        Loads and preprocesses documents from a CSV file.
        """
        print(f"Loading documents from {file_path}")
        df = pd.read_csv(file_path)
        df = df.dropna(subset=["pmid", "title", "abstract"])  # Drop rows with missing values

        # Combine title and abstract, and preprocess
        df["raw_text"] = df["title"] + " " + df["abstract"]
        df["tokens"] = df["raw_text"].apply(self.preprocess_text)
        return df

    def build_tfidf_model(self, documents):
        """
        Builds the TF-IDF model and similarity index using Gensim.
        """
        print("Building TF-IDF model...")
        self.dictionary = Dictionary(documents["tokens"])  # Create a Gensim dictionary
        corpus = [self.dictionary.doc2bow(tokens) for tokens in documents["tokens"]]  # Convert to bag-of-words format
        self.tfidf_model = TfidfModel(corpus,smartirs='nfc')  # Build the TF-IDF model
        self.index = SparseMatrixSimilarity(self.tfidf_model[corpus], num_features=len(self.dictionary))  # Build similarity index
        return corpus

    def calculate_relevance(self, query, corpus):
        """
        Calculates relevance scores for the query against the corpus.
        """
        print("Calculating relevance scores...")
        query_tokens = self.preprocess_text(query)
        query_bow = self.dictionary.doc2bow(query_tokens)  # Convert query to bag-of-words
        query_tfidf = self.tfidf_model[query_bow]  # Convert query to TF-IDF
        similarities = self.index[query_tfidf]  # Compute similarities
        return similarities

    def rank_snippets(self, query, top_documents, top_n_snippets=10):
        """
        Extracts and ranks snippets globally based on similarity to the query.
        """
        print("Ranking snippets globally...")
        snippets = []
        query_tokens = self.preprocess_text(query)

        for _, doc in top_documents.iterrows():
            pmid, text = doc["pmid"], doc["raw_text"]
            sentences = sent_tokenize(text)
            for sentence in sentences:
                sentence_tokens = self.preprocess_text(sentence)
                sentence_bow = self.dictionary.doc2bow(sentence_tokens)
                sentence_tfidf = self.tfidf_model[sentence_bow]
                snippet_score = sum(
                    score for term_id, score in sentence_tfidf if term_id in [self.dictionary.token2id.get(token) for token in query_tokens]
                )
                snippets.append({
                    "text": sentence,
                    "source": pmid,
                    "score": snippet_score,
                })

        # Sort snippets globally
        top_snippets = sorted(snippets, key=lambda x: x["score"], reverse=True)[:top_n_snippets]
        return top_snippets

    def get_relevant_documents_and_snippets(self, query, file_path, top_n_docs=10, top_n_snippets=10):
        """
        Retrieves the top N relevant documents and globally ranked snippets.
        """
        # Load and preprocess documents
        documents = self.load_and_preprocess_documents(file_path)
        if len(documents) == 0:
            print("No valid documents found.")
            return [], []

        # Build the TF-IDF model
        corpus = self.build_tfidf_model(documents)

        # Calculate relevance scores
        relevance_scores = self.calculate_relevance(query, corpus)

        # Retrieve the indices of the top N documents
        print("Retrieving top documents...")
        top_indices = np.argsort(relevance_scores)[-top_n_docs:][::-1]  # Get indices of top N scores in descending order

        # Create a DataFrame with the top N documents
        top_documents = documents.iloc[top_indices].copy()
        top_documents["score"] = [relevance_scores[idx] for idx in top_indices]

        # Rank snippets globally
        top_snippets = self.rank_snippets(query, top_documents, top_n_snippets)

        return top_documents, top_snippets

In [10]:
file_path = "/Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv"
query = "Effects of interferon on viral infections"

# Instantiate the model
model = QuerySpecificTFIDFModelRaw()

# Get relevant documents and snippets
top_documents, top_snippets = model.get_relevant_documents_and_snippets(query, file_path, top_n_docs=10, top_n_snippets=10)

# Display results
print("Top Documents:")
for doc in top_documents.to_dict('records'):
    print(f"PMID: {doc['pmid']}, Score: {doc['score']:.4f}")

print("\nTop Snippets:")
for snippet in top_snippets:
    print(f"Snippet: {snippet['text']}, Score: {snippet['score']:.4f}")

Loading documents from /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv
Building TF-IDF model...
Calculating relevance scores...
Retrieving top documents...
Ranking snippets globally...
Top Documents:
PMID: 2143, Score: 0.5932
PMID: 1126, Score: 0.1220
PMID: 1990, Score: 0.0874
PMID: 2142, Score: 0.0862
PMID: 884, Score: 0.0831
PMID: 1628, Score: 0.0769
PMID: 1604, Score: 0.0712
PMID: 1468, Score: 0.0643
PMID: 1847, Score: 0.0633
PMID: 1629, Score: 0.0616

Top Snippets:
Snippet: The stabilizing effect of pH during heating on interferon in solution was greatest at low pH, such that pH 2 greater than pH 5 greater than pH 7 greater than or equal to pH 9; freeze-dried preparations of interferon were also more heat-stable at acid pH than at neutral pH., Score: 0.6735
Snippet: It is postulated that the physicochemical alterations in the aqueous environment significantly affecting the stability of interferon operate by producing changes i

### TF-IDF Augmented
> afc
* augmented (a)
* idf (f)
* cosine normalization (c)

In [11]:
class QuerySpecificTFIDFModelAugmented:
    def __init__(self):
        # Initialize stop words and lemmatizer
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.dictionary = None
        self.tfidf_model = None
        self.index = None

    def preprocess_text(self, text):
        """
        Preprocesses text by tokenizing, removing stopwords, and lemmatizing.
        """
        tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
        tokens = [word for word in tokens if word.isalnum() and word not in self.stop_words]  # Remove stop words and punctuation
        tokens = [self.lemmatizer.lemmatize(word) for word in tokens]  # Apply lemmatization
        return tokens

    def load_and_preprocess_documents(self, file_path):
        """
        Loads and preprocesses documents from a CSV file.
        """
        print(f"Loading documents from {file_path}")
        df = pd.read_csv(file_path)
        df = df.dropna(subset=["pmid", "title", "abstract"])  # Drop rows with missing values

        # Combine title and abstract, and preprocess
        df["raw_text"] = df["title"] + " " + df["abstract"]
        df["tokens"] = df["raw_text"].apply(self.preprocess_text)
        return df

    def build_tfidf_model(self, documents):
        """
        Builds the TF-IDF model and similarity index using Gensim.
        """
        print("Building TF-IDF model...")
        self.dictionary = Dictionary(documents["tokens"])  # Create a Gensim dictionary
        corpus = [self.dictionary.doc2bow(tokens) for tokens in documents["tokens"]]  # Convert to bag-of-words format
        self.tfidf_model = TfidfModel(corpus,smartirs='afc')  # Build the TF-IDF model
        self.index = SparseMatrixSimilarity(self.tfidf_model[corpus], num_features=len(self.dictionary))  # Build similarity index
        return corpus

    def calculate_relevance(self, query, corpus):
        """
        Calculates relevance scores for the query against the corpus.
        """
        print("Calculating relevance scores...")
        query_tokens = self.preprocess_text(query)
        query_bow = self.dictionary.doc2bow(query_tokens)  # Convert query to bag-of-words
        query_tfidf = self.tfidf_model[query_bow]  # Convert query to TF-IDF
        similarities = self.index[query_tfidf]  # Compute similarities
        return similarities

    def rank_snippets(self, query, top_documents, top_n_snippets=10):
        """
        Extracts and ranks snippets globally based on similarity to the query.
        """
        print("Ranking snippets globally...")
        snippets = []
        query_tokens = self.preprocess_text(query)

        for _, doc in top_documents.iterrows():
            pmid, text = doc["pmid"], doc["raw_text"]
            sentences = sent_tokenize(text)
            for sentence in sentences:
                sentence_tokens = self.preprocess_text(sentence)
                sentence_bow = self.dictionary.doc2bow(sentence_tokens)
                sentence_tfidf = self.tfidf_model[sentence_bow]
                snippet_score = sum(
                    score for term_id, score in sentence_tfidf if term_id in [self.dictionary.token2id.get(token) for token in query_tokens]
                )
                snippets.append({
                    "text": sentence,
                    "source": pmid,
                    "score": snippet_score,
                })

        # Sort snippets globally
        top_snippets = sorted(snippets, key=lambda x: x["score"], reverse=True)[:top_n_snippets]
        return top_snippets

    def get_relevant_documents_and_snippets(self, query, file_path, top_n_docs=10, top_n_snippets=10):
        """
        Retrieves the top N relevant documents and globally ranked snippets.
        """
        # Load and preprocess documents
        documents = self.load_and_preprocess_documents(file_path)
        if len(documents) == 0:
            print("No valid documents found.")
            return [], []

        # Build the TF-IDF model
        corpus = self.build_tfidf_model(documents)

        # Calculate relevance scores
        relevance_scores = self.calculate_relevance(query, corpus)

        # Retrieve the indices of the top N documents
        print("Retrieving top documents...")
        top_indices = np.argsort(relevance_scores)[-top_n_docs:][::-1]  # Get indices of top N scores in descending order

        # Create a DataFrame with the top N documents
        top_documents = documents.iloc[top_indices].copy()
        top_documents["score"] = [relevance_scores[idx] for idx in top_indices]

        # Rank snippets globally
        top_snippets = self.rank_snippets(query, top_documents, top_n_snippets)

        return top_documents, top_snippets

In [12]:
file_path = "/Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv"
query = "Effects of interferon on viral infections"

# Instantiate the model
model = QuerySpecificTFIDFModelAugmented()

# Get relevant documents and snippets
top_documents, top_snippets = model.get_relevant_documents_and_snippets(query, file_path, top_n_docs=10, top_n_snippets=10)

# Display results
print("Top Documents:")
for doc in top_documents.to_dict('records'):
    print(f"PMID: {doc['pmid']}, Score: {doc['score']:.4f}")

print("\nTop Snippets:")
for snippet in top_snippets:
    print(f"Snippet: {snippet['text']}, Score: {snippet['score']:.4f}")

Loading documents from /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv
Building TF-IDF model...
Calculating relevance scores...
Retrieving top documents...
Ranking snippets globally...
Top Documents:
PMID: 2143, Score: 0.2059
PMID: 1990, Score: 0.1604
PMID: 1628, Score: 0.1348
PMID: 2142, Score: 0.1306
PMID: 1126, Score: 0.1048
PMID: 938, Score: 0.0894
PMID: 1685, Score: 0.0771
PMID: 1792, Score: 0.0764
PMID: 670, Score: 0.0690
PMID: 334, Score: 0.0628

Top Snippets:
Snippet: The stabilizing effect of pH during heating on interferon in solution was greatest at low pH, such that pH 2 greater than pH 5 greater than pH 7 greater than or equal to pH 9; freeze-dried preparations of interferon were also more heat-stable at acid pH than at neutral pH., Score: 0.5516
Snippet: The rate of cooling of heated interferon significantly influenced its residual activity., Score: 0.5165
Snippet: The viral titer reached a maximum of 10(6.75) TCID50

## Probabilistic Models

### BM25

In [13]:
class BM25ModelWithPandas:
    def __init__(self):
        # Initialize stop words and lemmatizer
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def preprocess_text(self, text):
        """
        Preprocesses text by removing stopwords, applying lowercase, and lemmatization.
        If for_bm25 is True, returns a list of tokens for BM25. Otherwise, returns a single string.
        """
        tokens = text.lower().split()  # Tokenize and convert to lowercase
        tokens = [word for word in tokens if word.isalnum() and word not in self.stop_words]  # Remove stop words and punctuation
        tokens = [self.lemmatizer.lemmatize(word) for word in tokens]  # Apply lemmatization only
        return tokens 

    def load_documents(self, file_path):
        """
        Loads CSV data using Pandas and preprocesses the `title` and `abstract` fields.
        Assumes the file has columns `pmid`, `title`, and `abstract`.
        """
        print(f"Loading documents from {file_path}")
        df = pd.read_csv(file_path)
        df = df.dropna(subset=['pmid', 'title', 'abstract'])  # Drop rows with missing values

        # Combine title and abstract, and preprocess
        df['raw_text'] = df['title'] + " " + df['abstract']
        df['preprocessed_text'] = df['raw_text'].apply(self.preprocess_text)
        
        return df

    def calculate_bm25_scores(self, query_tokens, documents):
        """
        Uses BM25 to calculate relevance scores between the query and all documents.
        """
        bm25 = BM25Okapi(list(documents['preprocessed_text']))
        relevance_scores = bm25.get_scores(query_tokens)
        return relevance_scores

    def get_top_snippets(self, query_tokens, top_documents, top_n=10):
        """
        Extracts and ranks snippets globally from the top documents using BM25.
        """
        snippets = []
        
        for _, doc in top_documents.iterrows():
            pmid, abstract = doc['pmid'], doc['raw_text']
            sentences = sent_tokenize(abstract)  # Split the abstract into sentences
            
            for sentence in sentences:
                preprocessed_sentence = self.preprocess_text(sentence)
                snippets.append({"text": sentence, "tokens": preprocessed_sentence, "source": pmid})
        
        # Combine all snippets into a single list for BM25
        snippet_texts = [snippet["tokens"] for snippet in snippets]
        bm25 = BM25Okapi(snippet_texts)
        snippet_scores = bm25.get_scores(query_tokens)
        
        # Add scores to snippets and sort them globally
        for i, snippet in enumerate(snippets):
            snippet["score"] = snippet_scores[i]
        
        top_snippets = sorted(snippets, key=lambda x: x["score"], reverse=True)[:top_n]
        return top_snippets

    def get_relevant_documents_and_snippets(self, query, file_path, top_n_docs=10, top_n_snippets=10):
        """
        Retrieves top N most relevant documents and globally ranks snippets from them.
        """
        # Load documents
        documents = self.load_documents(file_path)

        if len(documents) == 0:
            print("No valid documents found.")
            return [], []

        # Preprocess the query
        query_tokens = self.preprocess_text(query)

        # Calculate BM25 relevance scores
        relevance_scores = self.calculate_bm25_scores(query_tokens, documents)
        documents['score'] = relevance_scores

        # Get top N documents
        top_documents = documents.nlargest(top_n_docs, 'score')

        # Rank snippets globally from top documents
        top_snippets = self.get_top_snippets(query_tokens, top_documents, top_n_snippets)

        # Return both top documents and snippets
        return top_documents, top_snippets

In [14]:
file_path = "/Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv"

# Define the query
query = "Effects of interferon on viral infections"

# Initialize the BM25 model
bm25_model = BM25ModelWithPandas()

# Retrieve top documents and snippets
top_documents, top_snippets = bm25_model.get_relevant_documents_and_snippets(
    query=query,
    file_path=file_path,
    top_n_docs=10,
    top_n_snippets=10
)

# Print the top documents
print("\nTop Documents:")
for _, doc in top_documents.iterrows():
    print(f"PMC ID: {doc['pmid']}, Score: {doc['score']:.4f}")

# Print the top snippets
print("\nTop Snippets:")
for snippet in top_snippets:
    print(f"Source: {snippet['source']}")
    print(f"Snippet: {snippet['text']}")
    print(f"Score: {snippet['score']:.4f}\n")

Loading documents from /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv

Top Documents:
PMC ID: 2143, Score: 13.0717
PMC ID: 1126, Score: 9.4150
PMC ID: 2142, Score: 8.3898
PMC ID: 1990, Score: 6.2001
PMC ID: 938, Score: 6.1988
PMC ID: 884, Score: 5.9103
PMC ID: 1468, Score: 5.7332
PMC ID: 1628, Score: 5.7323
PMC ID: 1792, Score: 5.7181
PMC ID: 1629, Score: 5.5253

Top Snippets:
Source: 2143
Snippet: Rapid cooling and sudden freezing decreased the residual activities of interferons at pH 2 and 9 more than "normal" cooling, an effect not observed at pH 7.
Score: 4.3776

Source: 1792
Snippet: Effect of environmental pH on adenovirus-associated virus.
Score: 4.1220

Source: 2143
Snippet: The stabilizing effect of pH during heating on interferon in solution was greatest at low pH, such that pH 2 greater than pH 5 greater than pH 7 greater than or equal to pH 9; freeze-dried preparations of interferon were also more heat-stable at acid 

### BM25S- BM25 with Sparse Matrix

In [33]:
class BM25SDocumentRetriever:
    def __init__(self):
        """
        Initializes the BM25SnippetRetriever with the dataset file path.
        """
        self.df = None
        self.retriever = None
        self.corpus_tokens = None

    def load_data(self,file_path):
        """
        Loads the dataset and preprocesses the text.
        """
        print("Loading dataset...")
        self.df = pd.read_csv(file_path)
        self.df["text"] = self.df["title"] + " " + self.df["abstract"]
        self.corpus_tokens = tokenize(self.df["text"].tolist())

    def build_corpus_index(self):
        """
        Initializes the BM25 retriever and indexes the tokenized corpus.
        """
        print("Building corpus index...")
        self.retriever = BM25()
        self.retriever.index(self.corpus_tokens)

    def retrieve_top_documents(self, query, k=10):
        """
        Retrieves the top-k most relevant documents for the given query.
        """
        print("Retrieving top documents...")
        query_tokens = tokenize(query)
        docs, scores = self.retriever.retrieve(query_tokens, k=k)

        # Map document indices back to the dataset
        top_docs_indices = docs[0]
        top_docs_scores = scores[0]

        # Extract the top documents with their scores
        top_documents = [
            {"text": self.df.iloc[idx]["text"], "score": score, "pmid": self.df.iloc[idx]["pmid"]}
            for idx, score in zip(top_docs_indices, top_docs_scores)
        ]
        return top_documents

    def retrieve_top_snippets(self, query, top_documents, k=10):
        """
        Retrieves the top-k most relevant snippets globally from the given top documents.
        """
        print("Retrieving top snippets...")
        query_tokens = tokenize(query)

        # Consolidate all snippets with document IDs
        all_snippets = []
        for doc in top_documents:
            sentences = sent_tokenize(doc["text"])  # Split the document into sentences (snippets)
            for sentence in sentences:
                all_snippets.append({"text": sentence, "source": doc["pmid"]})

        # Tokenize all snippets
        snippet_tokens = tokenize([snippet["text"] for snippet in all_snippets])

        # Re-index the BM25 retriever with snippets
        self.retriever.index(snippet_tokens)

        # Retrieve the top k most relevant snippets globally
        snippet_docs, snippet_scores = self.retriever.retrieve(query_tokens, k=k)

        # Extract the top k snippets with scores and source document IDs
        top_snippets = [
            {
                "text": all_snippets[idx]["text"],
                "score": snippet_scores[0, i],
                "source": all_snippets[idx]["source"],
            }
            for i, idx in enumerate(snippet_docs[0])
        ]
        return top_snippets

    def get_relevant_documents_and_snippets(self, query,file_path, top_n_docs=10, top_n_snippets=10):
        """
        Main method to retrieve top documents and top snippets for a given query.
        """
        # Load the dataset and build the index
        self.load_data(file_path)
        self.build_corpus_index()

        # Retrieve top documents
        top_documents = self.retrieve_top_documents(query, k=top_n_docs)

        # Print top documents
        print("\nTop 10 Documents:")
        for doc in top_documents:
            print(f"PMC ID: {doc['pmid']}, Score: {doc['score']:.2f}")
            print(f"Text: {doc['text'][:200]}...")  # Print a snippet of the document text
            print("-" * 80)

        # Retrieve top snippets
        top_snippets = self.retrieve_top_snippets(query, top_documents, k=top_n_snippets)

        # Print top snippets
        print("\nTop 10 Snippets:")
        for snippet in top_snippets:
            print(f"Source Document: {snippet['source']}")
            print(f"Snippet: {snippet['text']}")
            print(f"Score: {snippet['score']:.2f}")
            print("-" * 80)

        return top_documents, top_snippets

In [34]:
file_path = "/Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv"
query = "Effects of interferon on viral infections"

bm25_model = BM25SDocumentRetriever()
top_documents, top_snippets = bm25_model.get_relevant_documents_and_snippets(query=query,file_path=file_path, top_n_docs=10, top_n_snippets=10)

Loading dataset...


                                                      

Building corpus index...


                                                             

Retrieving top documents...


                                                     


Top 10 Documents:
PMC ID: 2143, Score: 5.13
Text: The influence of physicochemical factors on the thermal inactivation of murine interferon. The degradation of biological activity of virus-induced murine interferon was determined in linear nonisother...
--------------------------------------------------------------------------------
PMC ID: 1990, Score: 4.91
Text: Correlation between molecular size and interferon- inducing activity of poly I:C. Electron microscopy showed that commerical poly I: C consisted of molecules varying in length from less than 0.05 nm t...
--------------------------------------------------------------------------------
PMC ID: 1126, Score: 3.54
Text: The specificity of heterophil antibodies in patients and healthy donors with no or minimal signs of infectious mononucleosis. Over several years sera were collected from 14 heterophil-positive student...
--------------------------------------------------------------------------------
PMC ID: 1604, Score: 2.76
Text

                                                            


Top 10 Snippets:
Source Document: 884
Snippet: Comparative study of virological infections in asthmatic and nonasthmatic children.
Score: 1.39
--------------------------------------------------------------------------------
Source Document: 331
Snippet: Thus, pneumococci exert several dose-dependent thromboplastic effects: (i) release of platelet thromboplastic substances; (ii) a direct thromboplastic effect; and (iii) release of polymorphonuclear coagulant.
Score: 1.36
--------------------------------------------------------------------------------
Source Document: 1604
Snippet: The significance of mosquito longevity and blood-feeding behaviour in the dynamics of arbovirus infections.
Score: 1.29
--------------------------------------------------------------------------------
Source Document: 1468
Snippet: The mixtures of viral fragments exhibited an increased deacetylase activity.
Score: 1.23
--------------------------------------------------------------------------------
Source Doc



## Semantic Retrieval Models

### Semantic IR Model- With Speed Ups

In [17]:
class SemanticRetrievalModelWithPandasOptimized:
    def __init__(self, model_name='sentence-transformers/paraphrase-MiniLM-L3-v2', use_gpu=True):
        """
        Initialize the semantic model using a pre-trained Sentence-BERT model.
        """
        device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
        self.embedding_model = SentenceTransformer(model_name, device=device)

    def preprocess_documents(self, file_path):
        """
        Loads and preprocesses documents using Pandas. Assumes the CSV file has columns `pmid`, `title`, and `abstract`.
        """
        print(f"Loading documents from {file_path}")
        df = pd.read_csv(file_path)
        df.dropna(subset=['pmid', 'title', 'abstract'], inplace=True)  # Drop rows with missing values

        # Combine title and abstract into a single text column
        df['text'] = df['title'] + " " + df['abstract']
        return df

    def encode_batch_text(self, texts):
        """
        Encodes a batch of texts into embeddings using the Sentence-BERT model.
        """
        embeddings = self.embedding_model.encode(texts, convert_to_tensor=True, batch_size=32)
        return embeddings.cpu()  # Ensure embeddings are moved to CPU

    def build_faiss_index(self, embeddings):
        """
        Builds a FAISS index for approximate nearest neighbor search.
        """
        embeddings_np = np.array(embeddings)  # Convert embeddings to NumPy array
        d = embeddings_np.shape[1]  # Dimension of embeddings
        index = faiss.IndexFlatL2(d)
        index.add(embeddings_np)  # Add embeddings to the index
        return index

    def search_faiss_index(self, index, query_embedding, top_k=10):
        """
        Searches the FAISS index for the top-k most similar embeddings.
        """
        query_embedding_np = query_embedding.cpu().numpy().reshape(1, -1)  # Move query embedding to CPU
        distances, indices = index.search(query_embedding_np, top_k)
        return distances[0], indices[0]

    def filter_documents(self, query, documents, embeddings, top_n_docs=10):
        """
        Filters documents using approximate semantic similarity.
        """
        query_embedding = self.encode_batch_text([query])
        index = self.build_faiss_index(embeddings)
        distances, indices = self.search_faiss_index(index, query_embedding, top_n_docs)
        
        top_documents = [
            {
                "pmc_id": documents.iloc[i]["pmid"],
                "text": documents.iloc[i]["text"],
                "score": 1 / (1 + distances[idx]),  # Inverse scaling to normalize distance
                "url": f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{documents.iloc[i]['pmid']}/",
            }
            for idx, i in enumerate(indices)
        ]
        return top_documents, query_embedding

    def rank_snippets(self, query_embedding, top_documents, top_n_snippets=10):
        """
        Ranks snippets globally based on semantic similarity.
        """
        snippets = []
        for doc in top_documents:
            sentences = sent_tokenize(doc["text"])  # Split the document into sentences
            snippet_embeddings = self.encode_batch_text(sentences)
            snippet_scores = cosine_similarity(query_embedding.cpu().numpy(), snippet_embeddings.cpu().numpy())[0]
            
            for i, sentence in enumerate(sentences):
                snippets.append({
                    "text": sentence,
                    "source": doc["pmc_id"],
                    "score": snippet_scores[i],
                })

        # Sort snippets globally by score
        top_snippets = sorted(snippets, key=lambda x: x["score"], reverse=True)[:top_n_snippets]
        return top_snippets

    def get_relevant_documents_and_snippets(self, query, file_path, top_n_docs=10, top_n_snippets=10):
        """
        Retrieves top N most relevant documents and globally ranks snippets from them.
        """
        # Load documents with Pandas
        df = self.preprocess_documents(file_path)

        if len(df) == 0:
            print("No valid documents found.")
            return [], []

        # Encode document embeddings in batches
        print("Encoding document embeddings...")
        embeddings = np.array([embedding.cpu().numpy() for embedding in self.encode_batch_text(df['text'].tolist())])

        # Filter top documents using semantic similarity
        print("Filtering top documents...")
        top_documents, query_embedding = self.filter_documents(query, df, embeddings, top_n_docs=top_n_docs)

        # Rank snippets globally
        print("Ranking snippets globally...")
        top_snippets = self.rank_snippets(query_embedding, top_documents, top_n_snippets=top_n_snippets)

        return top_documents, top_snippets

In [18]:
# Initialize the model
model = SemanticRetrievalModelWithPandasOptimized()

# Define the query and CSV file path
query = "Multiple sclerosis treatments using interferon"
file_path = "/Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv"

# Retrieve top documents and snippets
top_documents, top_snippets = model.get_relevant_documents_and_snippets(
    query=query,
    file_path=file_path,
    top_n_docs=10,
    top_n_snippets=10
)

# Print the top 10 documents
print("\nTop 10 Documents:")
for doc in top_documents:
    print(f"PMC ID: {doc['pmc_id']}, URL: {doc['url']}, Score: {doc['score']:.4f}")

# Print the top 10 snippets
print("\nTop 10 Snippets:")
for snippet in top_snippets:
    print(f"Source: https://www.ncbi.nlm.nih.gov/pmc/articles/{snippet['source']}/")
    print(f"Snippet: {snippet['text']}")
    print(f"Score: {snippet['score']:.4f}\n")

Loading documents from /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv
Encoding document embeddings...
Filtering top documents...
Ranking snippets globally...

Top 10 Documents:
PMC ID: 1295, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1295/, Score: 0.0398
PMC ID: 2143, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2143/, Score: 0.0397
PMC ID: 1628, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1628/, Score: 0.0389
PMC ID: 1990, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1990/, Score: 0.0379
PMC ID: 1996, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1996/, Score: 0.0379
PMC ID: 725, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC725/, Score: 0.0373
PMC ID: 1801, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1801/, Score: 0.0371
PMC ID: 2396, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2396/, Score: 0.0371
PMC ID: 1296, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1296/, Score: 0.0364
PMC

## Time Commplexity

In [32]:
import time

# Path to the CSV file
csv_file_path = "/Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv"

# Query to use for comparison
query = "Multiple sclerosis treatments using interferon"

# Number of documents and snippets to retrieve
top_n_docs = 10
top_n_snippets = 10

# List of models to compare
models = [
    {
        "name": "QuerySpecificTFIDFModelLogarithmic",
        "class": QuerySpecificTFIDFModelLogarithmic,
    },
    {
        "name": "QuerySpecificTFIDFModelRaw",
        "class": QuerySpecificTFIDFModelRaw,
    },
    {
        "name": "QuerySpecificTFIDFModelAugmented",
        "class": QuerySpecificTFIDFModelAugmented,
    },
    {
        "name": "BM25ModelWithPandas",
        "class": BM25ModelWithPandas,
    },
    {
        "name": "SemanticRetrievalModelWithPandasOptimized",
        "class": SemanticRetrievalModelWithPandasOptimized,
    },
    {
        "name": "BM25SparseMatrixVersion",
        "class": BM25SDocumentRetriever,
    },
]

# Function to measure execution time of a model
def measure_execution_time(model_class, csv_file_path, query, top_n_docs, top_n_snippets):
    model = model_class()  # Instantiate the model
    start_time = time.time()
    model.get_relevant_documents_and_snippets(
        query, csv_file_path, top_n_docs=top_n_docs, top_n_snippets=top_n_snippets
    )
    end_time = time.time()
    elapsed_time = end_time - start_time
    return elapsed_time

# Compare the models
results = []
for model_info in models:
    model_name = model_info["name"]
    model_class = model_info["class"]
    print(f"Evaluating {model_name}...")
    elapsed_time = measure_execution_time(
        model_class, csv_file_path, query, top_n_docs, top_n_snippets
    )
    results.append({"Model": model_name, "Time (seconds)": elapsed_time})

# Print results
print("\nComparison Results:")
for result in results:
    print(f"Model: {result['Model']}, Time Taken: {result['Time (seconds)']:.2f} seconds")

Evaluating QuerySpecificTFIDFModelLogarithmic...
Loading documents from /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv
Building TF-IDF model...
Calculating relevance scores...
Retrieving top documents...
Ranking snippets globally...
Evaluating QuerySpecificTFIDFModelRaw...
Loading documents from /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv
Building TF-IDF model...
Calculating relevance scores...
Retrieving top documents...
Ranking snippets globally...
Evaluating QuerySpecificTFIDFModelAugmented...
Loading documents from /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv
Building TF-IDF model...
Calculating relevance scores...
Retrieving top documents...
Ranking snippets globally...
Evaluating BM25ModelWithPandas...
Loading documents from /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_a

                                                      

Building corpus index...


                                                             

Retrieving top documents...


                                                     


Top 10 Documents:
PMC ID: 2143, Score: 6.41
Text: The influence of physicochemical factors on the thermal inactivation of murine interferon. The degradation of biological activity of virus-induced murine interferon was determined in linear nonisother...
--------------------------------------------------------------------------------
PMC ID: 1990, Score: 4.91
Text: Correlation between molecular size and interferon- inducing activity of poly I:C. Electron microscopy showed that commerical poly I: C consisted of molecules varying in length from less than 0.05 nm t...
--------------------------------------------------------------------------------
PMC ID: 537, Score: 2.77
Text: Multiple cyclic nucleotide phosphodiesterases in rat kidney. Using DEAE-cellulose chromatography and Agarose gel filtration we have partially purified a low Km cyclic adenosine monophosphate (AMP) pho...
--------------------------------------------------------------------------------
PMC ID: 159, Score: 2.60
Text: 

                                                            


Top 10 Snippets:
Source Document: 2143
Snippet: The degradation of biological activity of virus-induced murine interferon was determined in linear nonisothermal and multiple isothermal tests.
Score: 1.69
--------------------------------------------------------------------------------
Source Document: 1268
Snippet: The alterations of the RNA molecules due to the various treatments were monitored by sucrose density gradients.
Score: 1.44
--------------------------------------------------------------------------------
Source Document: 451
Snippet: Our studies suggest the possibility of using Limulus hemocyanin and other hemocyanins as structural homologs and analogs of more complex macromolecular arrays.
Score: 1.23
--------------------------------------------------------------------------------
Source Document: 377
Snippet: Comparative studies on multiple forms of cathepsin A.
Score: 1.18
--------------------------------------------------------------------------------
Source Document: 1

