In [10]:
import os
import json
import fitz  # PyMuPDF for image extraction
import pytesseract
import pandas as pd
from PIL import Image
from pathlib import Path

from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


RAW_DATA_DIR = "/content"
OUTPUT_DIR = "/content/processed"
os.makedirs(OUTPUT_DIR, exist_ok=True)


# --------------------------
# IMAGE EXTRACTION + OCR
# --------------------------

def extract_images_with_ocr(pdf_path):
    """Extract images from PDF using PyMuPDF + OCR."""
    pdf = fitz.open(pdf_path)
    results = []

    for page_num in range(len(pdf)):
        page = pdf[page_num]
        images = page.get_images(full=True)

        for img_idx, img in enumerate(images):
            xref = img[0]
            base = pdf.extract_image(xref)
            image_bytes = base["image"]

            img_name = f"{Path(pdf_path).stem}_p{page_num+1}_img{img_idx+1}.png"
            out_path = os.path.join(OUTPUT_DIR, img_name)

            with open(out_path, "wb") as f:
                f.write(image_bytes)

            pil_img = Image.open(out_path)
            ocr_text = pytesseract.image_to_string(pil_img)
            ocr_text = ocr_text.strip()

            results.append({
                "type": "image",
                "page": page_num + 1,
                "image_path": out_path,
                "ocr_text": ocr_text,
                "source_pdf": pdf_path
            })

    return results



# --------------------------
# TEXT LOADING (LANGCHAIN)
# --------------------------

def load_text_with_langchain(pdf_path):
    """Load PDF text using LangChain PyMuPDFLoader."""
    try:
        loader = PyMuPDFLoader(pdf_path)
        docs = loader.load()
    except:
        from langchain_community.document_loaders import PyPDFLoader
        loader = PyPDFLoader(pdf_path)
        docs = loader.load()

    return docs



# --------------------------
# CHUNKING
# --------------------------

def chunk_documents(docs, chunk_size=500, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " ", ""]
    )
    return text_splitter.split_documents(docs)



# --------------------------
# INGEST PIPELINE
# --------------------------

def ingest_pdf(pdf_path):
    print(f"\nProcessing PDF: {pdf_path}")

    # Load text
    docs = load_text_with_langchain(pdf_path)

    # Chunk text
    text_chunks = chunk_documents(docs)

    processed = [
        {
            "type": "text",
            "page": doc.metadata.get("page", None),
            "content": doc.page_content,
            "source_pdf": pdf_path
        }
        for doc in text_chunks
    ]

    # Extract images + OCR
    image_chunks = extract_images_with_ocr(pdf_path)
    processed.extend(image_chunks)

    return processed



# --------------------------
# MAIN EXECUTION
# --------------------------

def main():
    all_docs = []

    pdf_files = [
        os.path.join(RAW_DATA_DIR, f)
        for f in os.listdir(RAW_DATA_DIR)
        if f.lower().endswith(".pdf")
    ]

    print(f"Found {len(pdf_files)} PDFs in /content")

    for pdf in pdf_files:
        all_docs.extend(ingest_pdf(pdf))

    out_path = os.path.join(OUTPUT_DIR, "processed_chunks.json")
    with open(out_path, "w") as f:
        json.dump(all_docs, f, indent=4)

    print(f"\nSaved processed chunks → {out_path}")
    print(f"Total Chunks: {len(all_docs)}")


main()


Found 1 PDFs in /content

Processing PDF: /content/qatar_test_doc.pdf

Saved processed chunks → /content/processed/processed_chunks.json
Total Chunks: 505


In [11]:
# Information Retrieval evaluation script
# Uses the same models, metrics and evaluation manner as provided by the user.
# Assumes you have preprocessed/chunked JSON at: /content/processed/processed_chunks.json
# (This local path will be used as the file URL in downstream tooling.)
#
# Copy into Google Colab and run. It installs necessary libs, loads the chunked data,
# constructs documents + page-based multi-labels, computes embeddings with many models,
# evaluates IR metrics (MRR, NDCG, Precision/Recall/F1 and @5), and saves results.

# -----------------------
# Install dependencies
# -----------------------
!pip install -q sentence-transformers gensim tensorflow_hub scikit-learn pandas numpy tqdm
!pip install -q "tensorflow>=2.9.0"   # ensure TF for TF-Hub
!apt-get install -y -qq libsndfile1   # sometimes required for TF
# (If running on Colab, you may already have many packages installed.)

# -----------------------
# Script
# -----------------------
import os
import json
import shutil
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm import tqdm

# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# gensim (word vectors)
import gensim.downloader as api

# sentence-transformers
from sentence_transformers import SentenceTransformer

# tensorflow hub
import tensorflow_hub as hub

# torch for device handling
import torch

# -----------------------
# Configuration: local file path (this is the local path/url per dev instruction)
# -----------------------
# Path to the processed chunks JSON produced by the ingestion pipeline.
# (Tooling that converts local path -> URL will use this.)
PROCESSED_JSON_PATH = "/content/processed/processed_chunks.json"

# -----------------------
# Helper utilities
# -----------------------
def clear_tfhub_cache():
    """Clear TensorFlow Hub cache to force fresh download (useful if hub errors)."""
    cache_dir = os.path.join(os.environ.get('TMP', '/tmp'), 'tfhub_modules')
    if os.path.exists(cache_dir):
        try:
            shutil.rmtree(cache_dir)
            print(f"Cleared TensorFlow Hub cache at: {cache_dir}")
        except Exception as e:
            print(f"Could not clear TF-Hub cache: {e}")

# -----------------------
# Load preprocessed chunks
# -----------------------
if not os.path.exists(PROCESSED_JSON_PATH):
    raise FileNotFoundError(f"Processed JSON not found at: {PROCESSED_JSON_PATH}\n"
                            "Make sure ingestion step saved processed_chunks.json at this path.")

with open(PROCESSED_JSON_PATH, "r", encoding="utf-8") as fh:
    processed_chunks = json.load(fh)

# Build documents (text) list from processed chunks
# We'll prefer 'content' field; if missing, fallback to OCR or table_text.
documents = []
pages = []  # page number for each chunk (may be None)
sources = []  # source pdf path per chunk
for rec in processed_chunks:
    # Expect records with keys: type, page, content, image_path (optional), source_pdf
    content = rec.get("content", "")
    if (not content or str(content).strip() == "") and rec.get("type") == "image":
        # if OCR available
        content = rec.get("ocr_text", "") or rec.get("content", "")
    content = str(content).strip()
    if not content:
        # skip empty chunks
        continue
    documents.append(content)
    pages.append(rec.get("page", None))
    sources.append(rec.get("source_pdf", None))

n_docs = len(documents)
print(f"Loaded {n_docs} document chunks from: {PROCESSED_JSON_PATH}")

# -----------------------
# Build multi-label matrix using page numbers as labels
# Rationale: documents sharing same page are treated as relevant to each other.
# This provides a multi-label relevance signal consistent with "share ANY label".
# -----------------------
unique_pages = sorted(list({p for p in pages if p is not None}))
if len(unique_pages) == 0:
    # fallback: create single label for all documents (not ideal)
    unique_pages = [0]

page_to_label_idx = {p: i for i, p in enumerate(unique_pages)}
num_labels = len(unique_pages)
labels = np.zeros((n_docs, num_labels), dtype=int)
for i, p in enumerate(pages):
    if p is None:
        # leave zero vector
        continue
    labels[i, page_to_label_idx[p]] = 1

print(f"Created label matrix: documents={n_docs}, distinct_page_labels={num_labels}")

# -----------------------
# Models & Embedding preparation (following the manner from user's code)
# -----------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\nUsing device: {device}\n")

# Clear TF-Hub cache before loading USE if necessary
clear_tfhub_cache()

# Initialize classical / text models
tfidf_1gram = TfidfVectorizer(ngram_range=(1, 1))
tfidf_2gram = TfidfVectorizer(ngram_range=(1, 2))

# Try to load word vector models via gensim (may take time / not available)
word2vec = None
glove = None
fasttext = None
try:
    print("Attempting to load Word2Vec (Google News 300)... (may be large)")
    word2vec = api.load('word2vec-google-news-300')
    print("Word2Vec loaded.")
except Exception as e:
    print(f"Word2Vec not available: {e}")

try:
    print("Attempting to load GloVe (wiki-gigaword-300)...")
    glove = api.load('glove-wiki-gigaword-300')
    print("GloVe loaded.")
except Exception as e:
    print(f"GloVe not available: {e}")

try:
    print("Attempting to load FastText (wiki-news-subwords-300)...")
    fasttext = api.load('fasttext-wiki-news-subwords-300')
    print("FastText loaded.")
except Exception as e:
    print(f"FastText not available: {e}")

# Transformer / sentence models
print("\nLoading Sentence-Transformers models (this may take memory/time)...")
sbert_model = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2', device=str(device))
roberta_model = SentenceTransformer('sentence-transformers/all-distilroberta-v1', device=str(device))
labse_model = SentenceTransformer('sentence-transformers/LaBSE', device=str(device))
mpnet_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=str(device))
print("Transformer models loaded (SBERT, RoBERTa, LaBSE, MPNet).")

# Universal Sentence Encoder (TF-Hub)
print("\nLoading Universal Sentence Encoder (TF-Hub)...")
try:
    use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    print("USE loaded successfully.")
except Exception as e:
    print(f"USE failed to load on first try: {e}\nAttempting to clear cache and retry.")
    clear_tfhub_cache()
    use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    print("USE loaded after retry.")

# -----------------------
# Helper functions from user's manner
# -----------------------
def get_word_embeddings(model, texts):
    """Generate averaged word embeddings for each text using the provided keyed vector model."""
    embeddings = []
    for text in texts:
        words = text.split()
        word_vecs = []
        for word in words:
            try:
                if word.lower() in model:
                    word_vecs.append(model[word.lower()])
            except Exception:
                continue
        if word_vecs:
            embeddings.append(np.mean(word_vecs, axis=0))
        else:
            # vector_size attribute present on gensim KeyedVectors
            vec_size = getattr(model, 'vector_size', None)
            if vec_size is None:
                # fallback dimension 300
                vec_size = 300
            embeddings.append(np.zeros(vec_size, dtype=float))
    return np.array(embeddings)

# metric functions (copied/adapted from user's code)
def calculate_metrics(query_idx, similarities, labels, k=5):
    top_k_indices = np.argsort(similarities[query_idx])[::-1][1:k+1]
    query_labels = labels[query_idx]
    query_active_labels = np.where(query_labels == 1)[0]
    if len(query_active_labels) == 0:
        return {
            'mrr': 0.0,
            'ndcg': 0.0,
            'ndcg_at_k': 0.0,
            'precision': 0.0,
            'recall': 0.0,
            'f1': 0.0,
            'precision_at_k': 0.0,
            'recall_at_k': 0.0,
            'f1_at_k': 0.0
        }
    relevant_doc_indices = set()
    for label_idx in query_active_labels:
        relevant_docs = np.where(labels[:, label_idx] == 1)[0]
        relevant_doc_indices.update(relevant_docs)
    relevant_doc_indices.discard(query_idx)
    relevant_doc_indices = np.array(list(relevant_doc_indices))
    if len(relevant_doc_indices) == 0:
        return {
            'mrr': 0.0,
            'ndcg': 0.0,
            'ndcg_at_k': 0.0,
            'precision': 0.0,
            'recall': 0.0,
            'f1': 0.0,
            'precision_at_k': 0.0,
            'recall_at_k': 0.0,
            'f1_at_k': 0.0
        }
    all_retrieved_indices = np.argsort(similarities[query_idx])[::-1][1:]
    retrieved_relevant_at_k = len(set(top_k_indices) & set(relevant_doc_indices))
    precision_at_k = retrieved_relevant_at_k / k
    recall_at_k = retrieved_relevant_at_k / len(relevant_doc_indices)
    f1_at_k = 2 * precision_at_k * recall_at_k / (precision_at_k + recall_at_k) if (precision_at_k + recall_at_k) > 0 else 0.0
    num_retrieved = min(len(all_retrieved_indices), len(relevant_doc_indices) * 2)
    retrieved_docs = all_retrieved_indices[:num_retrieved]
    retrieved_relevant = len(set(retrieved_docs) & set(relevant_doc_indices))
    precision = retrieved_relevant / num_retrieved if num_retrieved > 0 else 0.0
    recall = retrieved_relevant / len(relevant_doc_indices)
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    mrr = 0.0
    for rank, doc_idx in enumerate(top_k_indices, 1):
        if doc_idx in relevant_doc_indices:
            mrr = 1.0 / rank
            break
    # NDCG@K
    dcg_at_k = 0.0
    for rank, doc_idx in enumerate(top_k_indices, 1):
        relevance = np.sum(labels[doc_idx] * query_labels)
        dcg_at_k += relevance / np.log2(rank + 1)
    all_relevances_excluding_query = []
    for doc_idx in range(len(labels)):
        if doc_idx != query_idx:
            relevance = np.sum(labels[doc_idx] * query_labels)
            all_relevances_excluding_query.append(relevance)
    ideal_relevances_at_k = sorted(all_relevances_excluding_query, reverse=True)[:k]
    idcg_at_k = sum(rel / np.log2(rank + 1) for rank, rel in enumerate(ideal_relevances_at_k, 1))
    ndcg_at_k = dcg_at_k / idcg_at_k if idcg_at_k > 0 else 0.0
    cutoff_overall = min(len(all_retrieved_indices), 100)
    dcg_overall = 0.0
    for rank, doc_idx in enumerate(all_retrieved_indices[:cutoff_overall], 1):
        relevance = np.sum(labels[doc_idx] * query_labels)
        dcg_overall += relevance / np.log2(rank + 1)
    ideal_relevances_overall = sorted(all_relevances_excluding_query, reverse=True)[:cutoff_overall]
    idcg_overall = sum(rel / np.log2(rank + 1) for rank, rel in enumerate(ideal_relevances_overall, 1))
    ndcg_overall = dcg_overall / idcg_overall if idcg_overall > 0 else 0.0
    return {
        'mrr': mrr,
        'ndcg': ndcg_overall,
        'ndcg_at_k': ndcg_at_k,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'precision_at_k': precision_at_k,
        'recall_at_k': recall_at_k,
        'f1_at_k': f1_at_k
    }

def evaluate_model(embeddings, model_name, labels, k=5):
    """Evaluate embeddings using IR metrics"""
    if isinstance(embeddings, np.ndarray):
        similarities = cosine_similarity(embeddings)
    else:
        # sparse matrix (TF-IDF)
        similarities = cosine_similarity(embeddings.toarray())
    all_metrics = []
    for query_idx in range(len(labels)):
        metrics = calculate_metrics(query_idx, similarities, labels, k)
        all_metrics.append(metrics)
    avg_metrics = {
        'model': model_name,
        'MRR': np.mean([m['mrr'] for m in all_metrics]),
        'NDCG': np.mean([m['ndcg'] for m in all_metrics]),
        'NDCG@5': np.mean([m['ndcg_at_k'] for m in all_metrics]),
        'Precision': np.mean([m['precision'] for m in all_metrics]),
        'Recall': np.mean([m['recall'] for m in all_metrics]),
        'F1': np.mean([m['f1'] for m in all_metrics]),
        'P@5': np.mean([m['precision_at_k'] for m in all_metrics]),
        'R@5': np.mean([m['recall_at_k'] for m in all_metrics]),
        'F1@5': np.mean([m['f1_at_k'] for m in all_metrics])
    }
    return avg_metrics

# -----------------------
# Run evaluations (same order and models as user's snippet)
# -----------------------
results = []
print("\n" + "="*80)
print("EVALUATING MODELS")
print("="*80)

# TF-IDF 1-gram
print("\nEvaluating TF-IDF (1-gram)...")
tfidf_1gram_emb = tfidf_1gram.fit_transform(documents)
results.append(evaluate_model(tfidf_1gram_emb, "TF-IDF (1-gram)", labels))

# TF-IDF 2-gram
print("Evaluating TF-IDF (2-gram)...")
tfidf_2gram_emb = tfidf_2gram.fit_transform(documents)
results.append(evaluate_model(tfidf_2gram_emb, "TF-IDF (2-gram)", labels))

# Word2Vec
if word2vec is not None:
    print("Evaluating Word2Vec...")
    w2v_emb = get_word_embeddings(word2vec, documents)
    results.append(evaluate_model(w2v_emb, "Word2Vec", labels))
    print("Evaluating TF-IDF + Word2Vec...")
    combined_emb = np.hstack([tfidf_1gram_emb.toarray(), w2v_emb])
    results.append(evaluate_model(combined_emb, "TF-IDF + Word2Vec", labels))
else:
    print("Skipping Word2Vec (not available).")

# GloVe
if glove is not None:
    print("Evaluating GloVe...")
    glove_emb = get_word_embeddings(glove, documents)
    results.append(evaluate_model(glove_emb, "GloVe", labels))
    print("Evaluating TF-IDF + GloVe...")
    combined_emb = np.hstack([tfidf_1gram_emb.toarray(), glove_emb])
    results.append(evaluate_model(combined_emb, "TF-IDF + GloVe", labels))
else:
    print("Skipping GloVe (not available).")

# FastText
if fasttext is not None:
    print("Evaluating FastText...")
    fasttext_emb = get_word_embeddings(fasttext, documents)
    results.append(evaluate_model(fasttext_emb, "FastText", labels))
    print("Evaluating TF-IDF + FastText...")
    combined_emb = np.hstack([tfidf_1gram_emb.toarray(), fasttext_emb])
    results.append(evaluate_model(combined_emb, "TF-IDF + FastText", labels))
else:
    print("Skipping FastText (not available).")

# Transformers: Sentence-BERT
print("Evaluating Sentence-BERT...")
sbert_embeddings = sbert_model.encode(documents, convert_to_numpy=True, show_progress_bar=True)
results.append(evaluate_model(sbert_embeddings, "Sentence-BERT", labels))

# Sentence-RoBERTa
print("Evaluating Sentence-RoBERTa...")
roberta_embeddings = roberta_model.encode(documents, convert_to_numpy=True, show_progress_bar=True)
results.append(evaluate_model(roberta_embeddings, "Sentence-RoBERTa", labels))

# LaBSE
print("Evaluating LaBSE...")
labse_embeddings = labse_model.encode(documents, convert_to_numpy=True, show_progress_bar=True)
results.append(evaluate_model(labse_embeddings, "LaBSE", labels))

# USE
print("Evaluating USE...")
use_embeddings = np.array(use_model(documents))
results.append(evaluate_model(use_embeddings, "USE", labels))

# MPNet
print("Evaluating MPNet...")
mpnet_embeddings = mpnet_model.encode(documents, convert_to_numpy=True, show_progress_bar=True)
results.append(evaluate_model(mpnet_embeddings, "MPNet", labels))

# Free GPU memory if available
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# -----------------------
# Display & Save results
# -----------------------
results_df = pd.DataFrame(results)
print("\n" + "="*80)
print("INFORMATION RETRIEVAL EVALUATION RESULTS")
print("="*80)
print(results_df.round(4))
results_df.to_csv('/content/ir_evaluation_results.csv', index=False)
print("\nResults saved to: /content/ir_evaluation_results.csv")

# Top performers by metric
metrics_to_rank = ['MRR', 'NDCG', 'NDCG@5', 'Precision', 'Recall', 'F1']
for metric in metrics_to_rank:
    print("\n" + "="*40)
    print(f"TOP MODELS BY {metric}")
    print("="*40)
    try:
        top_models = results_df.nlargest(5, metric)[['model', metric]]
        print(top_models.to_string(index=False))
    except Exception as e:
        print(f"Could not rank by {metric}: {e}")

print("\nDone.")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
[?25hLoaded 497 document chunks from: /content/processed/processed_chunks.json
Created label matrix: documents=497, distinct_page_labels=78

Using device: cuda

Attempting to load Word2Vec (Google News 300)... (may be large)
Word2Vec loaded.
Attempting to load GloVe (wiki-gigaword-300)...
GloVe loaded.
Attempting to load FastText (wiki-news-subwords-300)...

Loading Sentence-Transformers models (this may take memory/time)...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Transformer models loaded (SBERT, RoBERTa, LaBSE, MPNet).

Loading Universal Sentence Encoder (TF-Hub)...
USE loaded successfully.

EVALUATING MODELS

Evaluating TF-IDF (1-gram)...
Evaluating TF-IDF (2-gram)...
Evaluating Word2Vec...
Evaluating TF-IDF + Word2Vec...
Evaluating GloVe...
Evaluating TF-IDF + GloVe...
Skipping FastText (not available).
Evaluating Sentence-BERT...


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Evaluating Sentence-RoBERTa...


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Evaluating LaBSE...


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Evaluating USE...
Evaluating MPNet...


Batches:   0%|          | 0/16 [00:00<?, ?it/s]


INFORMATION RETRIEVAL EVALUATION RESULTS
                model     MRR    NDCG  NDCG@5  Precision  Recall      F1  \
0     TF-IDF (1-gram)  0.5021  0.4534  0.2969     0.1679  0.3358  0.2239   
1     TF-IDF (2-gram)  0.4994  0.4524  0.2984     0.1706  0.3413  0.2275   
2            Word2Vec  0.4055  0.3770  0.2311     0.1303  0.2606  0.1737   
3   TF-IDF + Word2Vec  0.5024  0.4544  0.2982     0.1743  0.3486  0.2324   
4               GloVe  0.3541  0.3289  0.1936     0.1062  0.2124  0.1416   
5      TF-IDF + GloVe  0.4079  0.3597  0.2252     0.1201  0.2401  0.1601   
6       Sentence-BERT  0.3994  0.3632  0.2272     0.1260  0.2520  0.1680   
7    Sentence-RoBERTa  0.4023  0.3722  0.2265     0.1307  0.2614  0.1743   
8               LaBSE  0.4155  0.3970  0.2462     0.1401  0.2803  0.1868   
9                 USE  0.3828  0.3472  0.2147     0.1156  0.2312  0.1541   
10              MPNet  0.4387  0.3724  0.2440     0.1335  0.2671  0.1780   

       P@5     R@5    F1@5  
0   0.2443  0.20