## VSM Models

### VSM- TF-IDF method

In [27]:
import dask.dataframe as dd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
import nltk

# Download NLTK resources for stop-words and sentence tokenization
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')


class QuerySpecificTFIDFModel:
    def __init__(self):
        # Initialize stop words and lemmatizer
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.vectorizer = TfidfVectorizer(
            stop_words='english', smooth_idf=True, sublinear_tf=True,
            max_df=0.8, min_df=2  # Ignore overly common/rare terms
        )
    
    def preprocess_text(self, text):
        """
        Preprocesses text by lowercasing, removing stop words, and lemmatizing.
        """
        tokens = text.lower().split()
        tokens = [word for word in tokens if word.isalnum() and word not in self.stop_words]
        tokens = [self.lemmatizer.lemmatize(word) for word in tokens]
        return ' '.join(tokens)

    def load_csv_data(self, file_path):
        """
        Loads CSV data using Dask and extracts the necessary fields (pmid, title, abstract).
        Returns a Dask DataFrame for further processing.
        """
        print(f"Loading CSV file: {file_path}")
        data = dd.read_csv(file_path)

        # Filter rows with missing values in critical columns
        data = data.dropna(subset=["pmid", "title", "abstract"])

        # Combine title and abstract into a single column for raw text
        data["raw_text"] = data["title"] + " " + data["abstract"]

        # Preprocess the raw text and store it in a new column
        data["preprocessed_text"] = data["raw_text"].map(self.preprocess_text, meta=("raw_text", "str"))

        return data

    def calculate_relevance_with_tfidf(self, query, documents):
        """
        Uses TF-IDF to calculate relevance scores directly.
        """
        # Convert Dask DataFrame to a list for TF-IDF vectorization
        texts = documents["preprocessed_text"].compute().tolist()
        tfidf_matrix = self.vectorizer.fit_transform(texts)  # Keep sparse format

        # Transform query into the same TF-IDF vector space
        query_vector = self.vectorizer.transform([self.preprocess_text(query)])

        # Calculate cosine similarity
        relevance_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

        # Map relevance scores back to document indices
        doc_scores = {idx: relevance_scores[idx] for idx in range(len(texts))}
        return doc_scores

    def rank_snippets(self, query, top_documents, top_n_snippets=10):
        """
        Extracts and ranks snippets globally from the top documents based on relevance.
        """
        snippets = []
        query_vector = self.vectorizer.transform([self.preprocess_text(query)])

        for doc in top_documents:
            doc_id, raw_text = doc["pmc_id"], doc["raw_text"]  # Use raw text for snippets
            
            # Split raw text into sentences
            sentences = sent_tokenize(raw_text)
            
            # Preprocess each sentence for scoring but keep the raw version for display
            preprocessed_sentences = [self.preprocess_text(sentence) for sentence in sentences]
            sentence_vectors = self.vectorizer.transform(preprocessed_sentences)  # Transform preprocessed sentences
            
            # Compute cosine similarity
            snippet_scores = cosine_similarity(query_vector, sentence_vectors).flatten()

            # Collect all snippets from the document
            snippets.extend([
                {"text": sentences[i], "source": doc_id, "score": snippet_scores[i]}
                for i in range(len(sentences))
            ])

        # Sort all snippets globally
        top_snippets = sorted(snippets, key=lambda x: x["score"], reverse=True)[:top_n_snippets]
        return top_snippets

    def get_relevant_documents_and_snippets(self, query, file_path, top_n_docs=10, top_n_snippets=10):
        """
        Processes CSV file using Dask, scores documents based on relevance to the query, 
        and returns the top N most relevant documents and snippets.
        """
        # Load documents from the CSV file
        documents = self.load_csv_data(file_path)

        if len(documents) == 0:
            print("No valid documents found.")
            return [], []
        
        # Calculate relevance scores using optimized TF-IDF
        relevance_scores = self.calculate_relevance_with_tfidf(query, documents)
        
        # Get the top N relevant documents by sorting relevance scores
        sorted_docs = sorted(relevance_scores.items(), key=lambda x: x[1], reverse=True)[:top_n_docs]
        top_documents = [
            {
                "pmc_id": documents.loc[idx, "pmid"].compute().iloc[0],  # Extract the first value
                "text": documents.loc[idx, "preprocessed_text"].compute().iloc[0],  # Extract the first value
                "raw_text": documents.loc[idx, "raw_text"].compute().iloc[0],  # Extract the first value
                "score": score,
            }
            for idx, score in sorted_docs
        ]
        # Rank and retrieve top snippets
        top_snippets = self.rank_snippets(query, top_documents, top_n_snippets)

        return top_documents, top_snippets

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/onurcanmemis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/onurcanmemis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/onurcanmemis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [28]:
# Define the file path and query
file_path = "/Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv"
query = "Effects of interferon on viral infections"

# Initialize the model
model = QuerySpecificTFIDFModel()

# Retrieve top documents and snippets
top_documents,top_snippets = model.get_relevant_documents_and_snippets(
    query=query,
    file_path=file_path,
    top_n_docs=10,
    top_n_snippets=10
)

# Print results
print("\nTop Documents:")
for doc in top_documents:
    print(f"PMC ID: {doc['pmc_id']}, Score: {doc['score']:.4f}")

print("\nTop Snippets:")
for snippet in top_snippets:
    print(f"Source: {snippet['source']}")
    print(f"Snippet: {snippet['text']}")
    print(f"Score: {snippet['score']:.4f}\n")

Loading CSV file: /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv

Top Documents:
PMC ID: 2143, Score: 0.2842
PMC ID: 1126, Score: 0.1463
PMC ID: 2142, Score: 0.1234
PMC ID: 884, Score: 0.1220
PMC ID: 1990, Score: 0.1067
PMC ID: 938, Score: 0.1020
PMC ID: 1629, Score: 0.0997
PMC ID: 1792, Score: 0.0946
PMC ID: 670, Score: 0.0905
PMC ID: 2145, Score: 0.0900

Top Snippets:
Source: 2143
Snippet: Interferon heated to 80degree C could not be reactivated at 40degree C or 55degree C. Interferon of higher apparent molecular weight was more heat-stable than that with lower apparent molecular weight.
Score: 0.4360

Source: 2143
Snippet: The stabilizing effect of pH during heating on interferon in solution was greatest at low pH, such that pH 2 greater than pH 5 greater than pH 7 greater than or equal to pH 9; freeze-dried preparations of interferon were also more heat-stable at acid pH than at neutral pH.
Score: 0.4071

Source: 2143
Snippet

## Probabilistic Models

### BM25

In [59]:
import os
import dask.dataframe as dd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
import nltk
from rank_bm25 import BM25Okapi  # Import BM25 from rank_bm25 library

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')


class BM25ModelWithDask:
    def __init__(self):
        # Initialize stop words and lemmatizer
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def preprocess_text(self, text):
        """
        Preprocesses text by removing stopwords, applying lowercase, and lemmatization.
        If for_bm25 is True, returns a list of tokens for BM25. Otherwise, returns a single string.
        """
        tokens = text.lower().split()  # Tokenize and convert to lowercase
        tokens = [word for word in tokens if word.isalnum() and word not in self.stop_words]  # Remove stop words and punctuation
        tokens = [self.lemmatizer.lemmatize(word) for word in tokens]  # Apply lemmatization only
        return tokens 

    def load_documents(self, file_path):
        """
        Loads CSV data using Dask and preprocesses the `title` and `abstract` fields.
        Assumes the file has columns `pmid`, `title`, and `abstract`.
        """
        print(f"Loading documents from {file_path}")
        ddf = dd.read_csv(file_path)
        ddf = ddf.dropna(subset=['pmid', 'title', 'abstract'])  # Drop rows with missing values

        # Combine title and abstract, and preprocess
        ddf['raw_text'] = ddf['title'] + " " + ddf['abstract']
        ddf['preprocessed_text'] = ddf['raw_text'].map(self.preprocess_text, meta=('preprocessed_text', 'object'))
        
        return ddf

    def calculate_bm25_scores(self, query_tokens, documents):
        """
        Uses BM25 to calculate relevance scores between the query and all documents.
        """
        bm25 = BM25Okapi(list(documents['preprocessed_text']))
        relevance_scores = bm25.get_scores(query_tokens)
        return relevance_scores

    def get_top_snippets(self, query_tokens, top_documents, top_n=10):
        """
        Extracts and ranks snippets globally from the top documents using BM25.
        """
        snippets = []
        
        for _, doc in top_documents.iterrows():
            pmid, abstract = doc['pmid'], doc['raw_text']
            sentences = sent_tokenize(abstract)  # Split the abstract into sentences
            
            for sentence in sentences:
                preprocessed_sentence = self.preprocess_text(sentence)
                snippets.append({"text": sentence, "tokens": preprocessed_sentence, "source": pmid})
        
        # Combine all snippets into a single list for BM25
        snippet_texts = [snippet["tokens"] for snippet in snippets]
        bm25 = BM25Okapi(snippet_texts)
        snippet_scores = bm25.get_scores(query_tokens)
        
        # Add scores to snippets and sort them globally
        for i, snippet in enumerate(snippets):
            snippet["score"] = snippet_scores[i]
        
        top_snippets = sorted(snippets, key=lambda x: x["score"], reverse=True)[:top_n]
        return top_snippets

    def get_relevant_documents_and_snippets(self, query, file_path, top_n_docs=10, top_n_snippets=10):
        """
        Retrieves top N most relevant documents and globally ranks snippets from them.
        """
        # Load documents
        documents = self.load_documents(file_path).compute()  # Convert Dask DataFrame to pandas DataFrame

        if len(documents) == 0:
            print("No valid documents found.")
            return [], []

        # Preprocess the query
        query_tokens = self.preprocess_text(query)

        # Calculate BM25 relevance scores
        relevance_scores = self.calculate_bm25_scores(query_tokens, documents)
        documents['score'] = relevance_scores

        # Get top N documents
        top_documents = documents.nlargest(top_n_docs, 'score')

        # Rank snippets globally from top documents
        top_snippets = self.get_top_snippets(query_tokens, top_documents, top_n_snippets)

        # Return both top documents and snippets
        return top_documents, top_snippets


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/onurcanmemis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/onurcanmemis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/onurcanmemis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [60]:
file_path = "/Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv"

# Define the query
query = "Effects of interferon on viral infections"

# Initialize the BM25 model
bm25_model = BM25ModelWithDask()

# Retrieve top documents and snippets
top_documents, top_snippets = bm25_model.get_relevant_documents_and_snippets(
    query=query,
    file_path=file_path,
    top_n_docs=10,
    top_n_snippets=10
)

# Print the top documents
print("\nTop Documents:")
for _, doc in top_documents.iterrows():
    print(f"PMC ID: {doc['pmid']}, Score: {doc['score']:.4f}")

# Print the top snippets
print("\nTop Snippets:")
for snippet in top_snippets:
    print(f"Source: {snippet['source']}")
    print(f"Snippet: {snippet['text']}")
    print(f"Score: {snippet['score']:.4f}\n")

Loading documents from /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv

Top Documents:
PMC ID: 2143, Score: 13.0717
PMC ID: 1126, Score: 9.4150
PMC ID: 2142, Score: 8.3898
PMC ID: 1990, Score: 6.2001
PMC ID: 938, Score: 6.1988
PMC ID: 884, Score: 5.9103
PMC ID: 1468, Score: 5.7332
PMC ID: 1628, Score: 5.7323
PMC ID: 1792, Score: 5.7181
PMC ID: 1629, Score: 5.5253

Top Snippets:
Source: 2143
Snippet: Rapid cooling and sudden freezing decreased the residual activities of interferons at pH 2 and 9 more than "normal" cooling, an effect not observed at pH 7.
Score: 4.3776

Source: 1792
Snippet: Effect of environmental pH on adenovirus-associated virus.
Score: 4.1220

Source: 2143
Snippet: The stabilizing effect of pH during heating on interferon in solution was greatest at low pH, such that pH 2 greater than pH 5 greater than pH 7 greater than or equal to pH 9; freeze-dried preparations of interferon were also more heat-stable at acid 

## Semantic Retrieval Models

### Semantic IR Model- Normal (No optimization)

In [43]:
import dask.dataframe as dd
import numpy as np
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import nltk

# Download NLTK resources for sentence tokenization
nltk.download('punkt')


class SemanticRetrievalModelWithDask:
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """
        Initialize the semantic model using a pre-trained Sentence-BERT model.
        """
        self.embedding_model = SentenceTransformer(model_name)

    def preprocess_documents(self, file_path):
        """
        Loads CSV data using Dask and preprocesses the `title` and `abstract` fields.
        Assumes the file has columns `pmid`, `title`, and `abstract`.
        """
        print(f"Loading documents from {file_path}")
        ddf = dd.read_csv(file_path)
        ddf = ddf.dropna(subset=['pmid', 'title', 'abstract'])  # Drop rows with missing values

        # Combine title and abstract into a single column
        ddf['text'] = ddf['title'] + " " + ddf['abstract']
        return ddf

    def encode_text_batch(self, texts):
        """
        Encodes a batch of texts into embeddings using Sentence-BERT.
        """
        return self.embedding_model.encode(texts, convert_to_tensor=True, batch_size=32)

    def calculate_relevance(self, query_embedding, document_embeddings):
        """
        Calculates cosine similarity between the query embedding and document embeddings.
        """
        similarities = cosine_similarity(query_embedding, document_embeddings)
        return similarities[0]

    def rank_snippets(self, query_embedding, documents, top_n_snippets=10):
        """
        Extracts and ranks snippets globally based on semantic similarity to the query.
        """
        snippets = []
        
        for _, doc in documents.iterrows():
            pmid, text = doc['pmid'], doc['text']
            sentences = sent_tokenize(text)  # Split the document into sentences
            
            # Encode snippets and compute similarity
            snippet_embeddings = self.encode_text_batch(sentences).cpu().numpy()
            snippet_scores = cosine_similarity(query_embedding, snippet_embeddings)

            # Store snippets with their scores
            for i, sentence in enumerate(sentences):
                snippets.append({
                    "text": sentence,
                    "source": pmid,
                    "score": snippet_scores[0][i]
                })

        # Sort snippets globally by score
        top_snippets = sorted(snippets, key=lambda x: x["score"], reverse=True)[:top_n_snippets]
        return top_snippets

    def get_relevant_documents_and_snippets(self, query, file_path, top_n_docs=10, top_n_snippets=10):
        """
        Retrieves top N most relevant documents and globally ranks snippets from them.
        """
        # Load documents
        ddf = self.preprocess_documents(file_path).compute()  # Convert Dask DataFrame to pandas DataFrame

        if len(ddf) == 0:
            print("No valid documents found.")
            return [], []

        # Encode documents into embeddings
        print("Encoding document embeddings...")
        document_embeddings = np.array(
            [embedding.cpu().numpy() for embedding in self.encode_text_batch(ddf['text'].tolist())]
        )

        # Encode the query into an embedding
        print("Encoding query embedding...")
        query_embedding = self.encode_text_batch([query]).cpu().numpy()

        # Calculate relevance scores
        print("Calculating relevance scores...")
        relevance_scores = self.calculate_relevance(query_embedding, document_embeddings)

        # Add scores to the DataFrame
        ddf['score'] = relevance_scores

        # Get top N documents
        top_documents_df = ddf.nlargest(top_n_docs, 'score')
        top_documents = [
            {
                "pmc_id": row['pmid'],
                "text": row['text'],
                "score": row['score'],
                "url": f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{row['pmid']}/",
            }
            for _, row in top_documents_df.iterrows()
        ]

        # Rank snippets globally from top documents
        print("Ranking snippets globally...")
        top_snippets = self.rank_snippets(query_embedding, top_documents_df, top_n_snippets)

        # Return both top documents and snippets
        return top_documents, top_snippets

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/onurcanmemis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [44]:
# Initialize the model
model = SemanticRetrievalModelWithDask()

# Define the query and file path
query = "Multiple sclerosis treatments using interferon"
file_path = "/Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv"

# Get the top 10 documents and snippets
top_documents, top_snippets = model.get_relevant_documents_and_snippets(
    query=query,
    file_path=file_path,
    top_n_docs=10,
    top_n_snippets=10
)

# Print top documents
print("\nTop Documents:")
for doc in top_documents:
    print(f"PMC ID: {doc['pmc_id']}, Score: {doc['score']:.4f}, URL: {doc['url']}")

print("\nTop Snippets:")
for snippet in top_snippets:
    print(f"Source: {snippet['source']}")
    print(f"Snippet: {snippet['text']}")
    print(f"Score: {snippet['score']:.4f}\n")

Loading documents from /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv
Encoding document embeddings...
Encoding query embedding...
Calculating relevance scores...
Ranking snippets globally...

Top Documents:
PMC ID: 1990, Score: 0.3160, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1990/
PMC ID: 706, Score: 0.3121, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC706/
PMC ID: 1145, Score: 0.2733, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1145/
PMC ID: 978, Score: 0.2686, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC978/
PMC ID: 2021, Score: 0.2676, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2021/
PMC ID: 1960, Score: 0.2562, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1960/
PMC ID: 2143, Score: 0.2554, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2143/
PMC ID: 914, Score: 0.2519, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC914/
PMC ID: 2110, Score: 0.2515, URL: https://www.ncbi.nlm.nih.gov

### Semantic IR Model- With Speed Ups

In [61]:
import dask.dataframe as dd
import numpy as np
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import faiss  # For approximate nearest neighbors
import torch
import nltk

# Download NLTK resources for sentence tokenization
nltk.download('punkt')


class SemanticRetrievalModelWithDaskOptimized:
    def __init__(self, model_name='sentence-transformers/paraphrase-MiniLM-L3-v2', use_gpu=True):
        """
        Initialize the semantic model using a pre-trained Sentence-BERT model.
        """
        device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
        self.embedding_model = SentenceTransformer(model_name, device=device)

    def preprocess_documents(self, file_path):
        """
        Loads and preprocesses documents using Dask. Assumes the CSV file has columns `pmid`, `title`, and `abstract`.
        """
        print(f"Loading documents from {file_path}")
        ddf = dd.read_csv(file_path)
        ddf = ddf.dropna(subset=['pmid', 'title', 'abstract'])  # Drop rows with missing values

        # Combine title and abstract into a single text column
        ddf['text'] = ddf['title'] + " " + ddf['abstract']
        return ddf

    def encode_batch_text(self, texts):
        """
        Encodes a batch of texts into embeddings using the Sentence-BERT model.
        """
        embeddings = self.embedding_model.encode(texts, convert_to_tensor=True, batch_size=32)
        return embeddings.cpu()  # Ensure embeddings are moved to CPU

    def build_faiss_index(self, embeddings):
        """
        Builds a FAISS index for approximate nearest neighbor search.
        """
        embeddings_np = np.array(embeddings)  # Convert embeddings to NumPy array
        d = embeddings_np.shape[1]  # Dimension of embeddings
        index = faiss.IndexFlatL2(d)
        index.add(embeddings_np)  # Add embeddings to the index
        return index

    def search_faiss_index(self, index, query_embedding, top_k=10):
        """
        Searches the FAISS index for the top-k most similar embeddings.
        """
        query_embedding_np = query_embedding.cpu().numpy().reshape(1, -1)  # Move query embedding to CPU
        distances, indices = index.search(query_embedding_np, top_k)
        return distances[0], indices[0]

    def filter_documents(self, query, documents, embeddings, top_n_docs=10):
        """
        Filters documents using approximate semantic similarity.
        """
        query_embedding = self.encode_batch_text([query])
        index = self.build_faiss_index(embeddings)
        distances, indices = self.search_faiss_index(index, query_embedding, top_n_docs)
        
        top_documents = [
            {
                "pmc_id": documents.iloc[i]["pmid"],
                "text": documents.iloc[i]["text"],
                "score": 1 / (1 + distances[idx]),  # Inverse scaling to normalize distance
                "url": f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{documents.iloc[i]['pmid']}/",
            }
            for idx, i in enumerate(indices)
        ]
        return top_documents, query_embedding

    def rank_snippets(self, query_embedding, top_documents, top_n_snippets=10):
        """
        Ranks snippets globally based on semantic similarity.
        """
        snippets = []
        for doc in top_documents:
            sentences = sent_tokenize(doc["text"])  # Split the document into sentences
            snippet_embeddings = self.encode_batch_text(sentences)
            snippet_scores = cosine_similarity(query_embedding.cpu().numpy(), snippet_embeddings.cpu().numpy())[0]
            
            for i, sentence in enumerate(sentences):
                snippets.append({
                    "text": sentence,
                    "source": doc["pmc_id"],
                    "score": snippet_scores[i],
                })

        # Sort snippets globally by score
        top_snippets = sorted(snippets, key=lambda x: x["score"], reverse=True)[:top_n_snippets]
        return top_snippets

    def get_relevant_documents_and_snippets(self, query, file_path, top_n_docs=10, top_n_snippets=10):
        """
        Retrieves top N most relevant documents and globally ranks snippets from them.
        """
        # Load documents with Dask
        ddf = self.preprocess_documents(file_path).compute()  # Convert Dask DataFrame to pandas DataFrame

        if len(ddf) == 0:
            print("No valid documents found.")
            return [], []

        # Encode document embeddings in batches
        print("Encoding document embeddings...")
        embeddings = np.array([embedding.cpu().numpy() for embedding in self.encode_batch_text(ddf['text'].tolist())])

        # Filter top documents using semantic similarity
        print("Filtering top documents...")
        top_documents, query_embedding = self.filter_documents(query, ddf, embeddings, top_n_docs=top_n_docs)

        # Rank snippets globally
        print("Ranking snippets globally...")
        top_snippets = self.rank_snippets(query_embedding, top_documents, top_n_snippets=top_n_snippets)

        return top_documents, top_snippets

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/onurcanmemis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [62]:
# Initialize the model
model = SemanticRetrievalModelWithDaskOptimized()

# Define the query and CSV file path
query = "Multiple sclerosis treatments using interferon"
file_path = "/Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv"

# Retrieve top documents and snippets
top_documents, top_snippets = model.get_relevant_documents_and_snippets(
    query=query,
    file_path=file_path,
    top_n_docs=10,
    top_n_snippets=10
)

# Print the top 10 documents
print("\nTop 10 Documents:")
for doc in top_documents:
    print(f"PMC ID: {doc['pmc_id']}, URL: {doc['url']}, Score: {doc['score']:.4f}")

# Print the top 10 snippets
print("\nTop 10 Snippets:")
for snippet in top_snippets:
    print(f"Source: https://www.ncbi.nlm.nih.gov/pmc/articles/{snippet['source']}/")
    print(f"Snippet: {snippet['text']}")
    print(f"Score: {snippet['score']:.4f}\n")

Loading documents from /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv
Encoding document embeddings...
Filtering top documents...
Ranking snippets globally...

Top 10 Documents:
PMC ID: 1295, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1295/, Score: 0.0398
PMC ID: 2143, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2143/, Score: 0.0397
PMC ID: 1628, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1628/, Score: 0.0389
PMC ID: 1990, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1990/, Score: 0.0379
PMC ID: 1996, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1996/, Score: 0.0379
PMC ID: 725, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC725/, Score: 0.0373
PMC ID: 1801, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1801/, Score: 0.0371
PMC ID: 2396, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2396/, Score: 0.0371
PMC ID: 1296, URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1296/, Score: 0.0364
PMC

## Time Commplexity

In [63]:
import time

# Path to the CSV file
csv_file_path = "/Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv"

# Query to use for comparison
query = "Multiple sclerosis treatments using interferon"

# Number of documents and snippets to retrieve
top_n_docs = 10
top_n_snippets = 10

# List of models to compare
models = [
    {
        "name": "SemanticRetrievalModelWithDaskOptimized",
        "class": SemanticRetrievalModelWithDaskOptimized,
    },
    {
        "name": "SemanticRetrievalModelWithDask",
        "class": SemanticRetrievalModelWithDask,
    },
    {
        "name": "BM25ModelWithDask",
        "class": BM25ModelWithDask,
    },
    {
        "name": "QuerySpecificTFIDFModel",
        "class": QuerySpecificTFIDFModel,
    },
]

# Function to measure execution time of a model
def measure_execution_time(model_class, csv_file_path, query, top_n_docs, top_n_snippets):
    model = model_class()  # Instantiate the model
    start_time = time.time()
    model.get_relevant_documents_and_snippets(
        query, csv_file_path, top_n_docs=top_n_docs, top_n_snippets=top_n_snippets
    )
    end_time = time.time()
    elapsed_time = end_time - start_time
    return elapsed_time

# Compare the models
results = []
for model_info in models:
    model_name = model_info["name"]
    model_class = model_info["class"]
    print(f"Evaluating {model_name}...")
    elapsed_time = measure_execution_time(
        model_class, csv_file_path, query, top_n_docs, top_n_snippets
    )
    results.append({"Model": model_name, "Time (seconds)": elapsed_time})

# Print results
print("\nComparison Results:")
for result in results:
    print(f"Model: {result['Model']}, Time Taken: {result['Time (seconds)']:.2f} seconds")

Evaluating SemanticRetrievalModelWithDaskOptimized...
Loading documents from /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv
Encoding document embeddings...
Filtering top documents...
Ranking snippets globally...
Evaluating SemanticRetrievalModelWithDask...
Loading documents from /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv
Encoding document embeddings...
Filtering top documents...
Ranking snippets globally...
Evaluating BM25ModelWithDask...
Loading documents from /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv
Evaluating QuerySpecificTFIDFModel...
Loading CSV file: /Users/onurcanmemis/Desktop/Lectures/Information_Retrieval/Project/ir-bioasq/main_articles_head1000.csv

Comparison Results:
Model: SemanticRetrievalModelWithDaskOptimized, Time Taken: 4.01 seconds
Model: SemanticRetrievalModelWithDask, Time Taken: 4.01 se