In [29]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Custom tokenizer instead of lambda
def whitespace_tokenizer(text):
    return text.split()

# Load preprocessed clinical trials
df = pd.read_csv("clinical_trials_cleaned.csv")

# Join tokens back into strings
df['joined'] = df['tokens'].apply(eval).apply(lambda tokens: ' '.join(tokens))

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer(
    lowercase=False,
    tokenizer=whitespace_tokenizer,
    preprocessor=None,
    token_pattern=None,
    max_df=0.85,
    min_df=5,
)

# Fit and transform
tfidf_matrix = vectorizer.fit_transform(df['joined'])

# Save vectorizer, matrix, and metadata
joblib.dump(vectorizer, "clinical_trials_vectorizer.pkl")
joblib.dump(tfidf_matrix, "clinical_trials_tfidf_matrix.pkl")
df[['doc_id', 'original_text']].to_csv("clinical_trials_metadata.csv", index=False)

print("✅ TF-IDF index built and saved.")

✅ TF-IDF index built and saved.


In [9]:
# ✅ build_bm25_clinical_trials.py
import pandas as pd
import joblib
from rank_bm25 import BM25Okapi

# Load cleaned clinical trials data
df = pd.read_csv("clinical_trials_cleaned.csv")

# Prepare corpus: list of token lists
corpus = df['tokens'].apply(eval).tolist()

# Build BM25 index
bm25 = BM25Okapi(corpus)

# Save index and metadata
joblib.dump(bm25, "clinical_trials_bm25_model.pkl")
joblib.dump(corpus, "clinical_trials_bm25_corpus.pkl")
df[['doc_id', 'original_text']].to_csv("clinical_trials_metadata.csv", index=False)

print("✅ BM25 index for Clinical Trials built and saved.")

✅ BM25 index for Clinical Trials built and saved.


In [10]:
import pandas as pd
import joblib
from rank_bm25 import BM25Okapi
import textwrap

# Load data
with open("clinical_trials_bm25_corpus.pkl", "rb") as f:
    corpus = joblib.load(f)
with open("clinical_trials_bm25_model.pkl", "rb") as f:
    bm25 = joblib.load(f)
df_meta = pd.read_csv("clinical_trials_metadata.csv")

# Helper to show results
def run_query(query, top_k=5):
    tokens = query.lower().split()
    scores = bm25.get_scores(tokens)
    top_idx = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
    results = []
    for i in top_idx:
        doc_id = df_meta.iloc[i]['doc_id']
        text = df_meta.iloc[i]['original_text']
        results.append((doc_id, scores[i], text))
    return results

# Define test queries
queries = {
    "TREATMENT SAFETY": [
        "What are the side effects of the vaccine?",
        "Is the treatment well tolerated by patients?",
        "Any serious adverse events reported during the trial?"
    ],
    "PATIENT ELIGIBILITY": [
        "Who can participate in the trial?",
        "Eligibility criteria for patients with lung cancer",
        "Am I eligible for this study?"
    ],
    "TRIAL DESIGN": [
        "How is the study structured?",
        "What is the primary endpoint of the trial?",
        "How long does the clinical trial last?"
    ]
}

# Run and print results
for section, qs in queries.items():
    print("="*30)
    print(f"🧪 {section}")
    print("="*30)
    for query in qs:
        print(f"\n🔍 Query: {query}\n")
        results = run_query(query)
        for doc_id, score, text in results:
            print(f"🔹 Doc ID: {doc_id}  (Score: {score:.4f})")
            print(textwrap.fill(text, width=100))
            print()fana

🧪 TREATMENT SAFETY

🔍 Query: What are the side effects of the vaccine?

🔹 Doc ID: NCT00359671  (Score: 18.1472)
Treatment With MK6592 and an Anti-cancer Drug in Patients With Advanced Solid Tumors (6592-001)
A study to evaluate safety and tolerability of MK6592 in combination with an anti-cancer       drug
in adult patients with advanced solid tumors.                          Inclusion Criteria:
-  Patients with advanced solid tumors (metastatic or local) unresponsive to standard
therapy, progressed on standard therapy, or no standard therapy exists. No limit to              the
number of prior treatment regimens            -  Patients may be fully active without physical
restrictions, ambulatory with              restrictions on strenuous physical activity, or
ambulatory and capable of self-care              but not work activities (i.e., Eastern Cooperative
Oncology Group performance status              of greater than or equal to 2)            -
Demonstrates adequate organ function 

In [14]:
# build_bert_embeddings_clinical.py

import pandas as pd
import joblib
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np

# Load preprocessed data
df = pd.read_csv("clinical_trials_cleaned.csv")
df['joined'] = df['tokens'].apply(eval).apply(lambda tokens: ' '.join(tokens))

# Load a compact, efficient BERT model
model = SentenceTransformer("all-MiniLM-L6-v2")  # ~384D embeddings

# Encode documents in batches
batch_size = 256
all_embeddings = []
for i in tqdm(range(0, len(df), batch_size)):
    batch_texts = df['joined'].iloc[i:i+batch_size].tolist()
    embeddings = model.encode(batch_texts, show_progress_bar=False, convert_to_numpy=True)
    all_embeddings.append(embeddings)

# Stack all batches into one array
all_embeddings = np.vstack(all_embeddings)

# Save embeddings and doc IDs
joblib.dump(all_embeddings, "clinical_trials_bert_embeddings.pkl")
df[['doc_id', 'original_text']].to_csv("clinical_trials_metadata.csv", index=False)

print("✅ BERT embeddings for Clinical Trials built and saved.")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 942/942 [3:59:35<00:00, 15.26s/it]


✅ BERT embeddings for Clinical Trials built and saved.


In [15]:
import pandas as pd
import joblib
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import textwrap

# Load precomputed embeddings and metadata
doc_embeddings = joblib.load("clinical_trials_bert_embeddings.pkl")
df_meta = pd.read_csv("clinical_trials_metadata.csv")

# Load the same BERT model used during building
model = SentenceTransformer('all-MiniLM-L6-v2')

# Example queries
queries = [
    "What are the side effects of the vaccine?",
    "Who can participate in the trial?",
    "What is the design of the clinical trial?"
]

# Number of results to retrieve per query
TOP_K = 5

for query in queries:
    print("=" * 60)
    print(f"🔍 Query: {query}\n")

    # Encode query
    query_embedding = model.encode([query])

    # Compute cosine similarities
    scores = cosine_similarity(query_embedding, doc_embeddings)[0]

    # Get top-k indices
    top_k_idx = np.argsort(scores)[::-1][:TOP_K]

    for idx in top_k_idx:
        doc_id = df_meta.iloc[idx]['doc_id']
        text = df_meta.iloc[idx]['original_text']
        score = scores[idx]
        print(f"🔹 Doc ID: {doc_id}  (Score: {score:.4f})\n{textwrap.fill(text, width=100)}\n")

🔍 Query: What are the side effects of the vaccine?

🔹 Doc ID: NCT02117570  (Score: 0.4750)
A Study To Investigate A Clostridium Difficile Vaccine In Healthy Adults Aged 50 to 85 Years, Who
Will Each Receive 3 Doses Of Vaccine.             This study will investigate a Clostridium
difficile vaccine in healthy adults aged 50 to 85       years, who will each receive 3 doses of
vaccine. The study will assess the safety and       tolerability of the vaccine, and also look at
the subjects' immune response to the vaccine.                          Inclusion Criteria:
Healthy male and female subjects aged 50 to 85 years          Exclusion Criteria:          Proven or
suspected prior episode of Clostridium difficile associated diarrhea.          Unstable chronic
medical condition or disease requiring significant change in therapy or         hospitalization for
worsening disease within 8 weeks before receipt of study vaccine.          Any contraindication to
vaccination or vaccine components, inc

In [2]:
import pandas as pd
import joblib
import numpy as np
from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi
import textwrap

# Load preprocessed corpus (tokenized)
with open("clinical_trials_bm25_corpus.pkl", "rb") as f:
    corpus = joblib.load(f)

# Load BM25 model
with open("clinical_trials_bm25_model.pkl", "rb") as f:
    bm25 = joblib.load(f)

# Load original texts
df_meta = pd.read_csv("clinical_trials_metadata.csv")
doc_id_to_text = dict(zip(df_meta['doc_id'], df_meta['original_text']))

# Load BERT model and document embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
doc_embeddings = joblib.load("clinical_trials_bert_embeddings.pkl")

# Map from doc_id to index (to locate BERT embeddings)
doc_id_to_index = {doc_id: idx for idx, doc_id in enumerate(df_meta['doc_id'])}

# Example queries
queries = [
    "What are the side effects of the vaccine?",
    "Who can participate in the trial?",
    "What is the trial design for this cancer study?",
]

# Parameters
bm25_top_k = 100   # How many BM25 docs to consider
final_top_k = 5    # Final results to show after BERT reranking

for query in queries:
    print("=" * 40)
    print(f"🔍 Query: {query}\n")

    # Step 1: BM25 retrieval
    tokenized_query = query.lower().split()
    bm25_scores = bm25.get_scores(tokenized_query)
    top_indices = np.argsort(bm25_scores)[::-1][:bm25_top_k]
    top_doc_ids = [df_meta.iloc[idx]['doc_id'] for idx in top_indices]

    # Step 2: BERT rerank on top BM25 docs
    top_texts = [doc_id_to_text[doc_id] for doc_id in top_doc_ids]
    top_indices_for_bert = [doc_id_to_index[doc_id] for doc_id in top_doc_ids]
    candidate_embeddings = doc_embeddings[top_indices_for_bert]

    query_embedding = model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_embedding, candidate_embeddings)[0].cpu().numpy()

    reranked = sorted(zip(top_doc_ids, cos_scores), key=lambda x: x[1], reverse=True)[:final_top_k]

    for doc_id, score in reranked:
        wrapped_text = textwrap.fill(doc_id_to_text[doc_id], 100)
        print(f"🔹 Doc ID: {doc_id}  (Score: {score:.4f})\n{wrapped_text}\n")

print("✅ Hybrid BM25 + BERT reranking done.")

🔍 Query: What are the side effects of the vaccine?

🔹 Doc ID: NCT02996201  (Score: 0.2683)
Electronic Patient Reporting of Side Effects to Chemotherapy: A Cluster Randomized Controlled Trial
The aim of this study is to determine whether the use of breast cancer patients' own
electronic reporting of side effects to chemotherapy in a treatment setting has an impact on
the handling of side effects and on the number of hospitalizations, febrile neutropenia and
dose adjustments. We are using the Patient-Reported Outcomes version of the Common       Terminology
Criteria for Adverse Events (PRO-CTCAE) for the patients' reporting of side       effects.
Gender •Female: only female participants are being studied          Minimum age          •18 years
Maximum age •N/A          Accepts Healthy Volunteers          •No          Eligibility Criteria
Inclusion criteria            -  Breast cancer patients starting adjuvant chemotherapy in the period
November 1, 2015              - September 1, 2016 i

In [3]:
import pandas as pd
import json
from collections import defaultdict

# Load preprocessed dataset
df = pd.read_csv("clinical_trials_cleaned.csv")
df['tokens'] = df['tokens'].apply(eval)

# Build inverted index
inverted_index = defaultdict(set)

for idx, row in df.iterrows():
    doc_id = row['doc_id']
    tokens = row['tokens']
    for token in set(tokens):  # Use set to avoid duplicates in the same doc
        inverted_index[token].add(doc_id)

# Convert sets to sorted lists for JSON serialization
inverted_index = {term: sorted(list(doc_ids)) for term, doc_ids in inverted_index.items()}

# Save inverted index
with open("clinical_trials_inverted_index.json", "w", encoding="utf-8") as f:
    json.dump(inverted_index, f, indent=2)

print("✅ Inverted index for Clinical Trials built and saved.")

✅ Inverted index for Clinical Trials built and saved.


In [4]:
import json
import pandas as pd
from collections import Counter
import textwrap

# Load inverted index and metadata
with open("clinical_trials_inverted_index.json", "r", encoding="utf-8") as f:
    inverted_index = json.load(f)

df_meta = pd.read_csv("clinical_trials_metadata.csv")
doc_map = dict(zip(df_meta['doc_id'], df_meta['original_text']))

def search(query, top_k=5):
    terms = query.lower().split()
    doc_scores = Counter()

    for term in terms:
        doc_ids = inverted_index.get(term, [])
        for doc_id in doc_ids:
            doc_scores[doc_id] += 1

    top_docs = doc_scores.most_common(top_k)
    return [(doc_id, doc_map.get(doc_id, ""), score) for doc_id, score in top_docs]

# Example queries
queries = [
    "vaccine side effects",
    "eligibility criteria cancer trial",
    "trial design phase 3"
]

for query in queries:
    print("="*80)
    print(f"🔍 Query: {query}")
    results = search(query)
    for doc_id, text, score in results:
        print(f"\n🔹 Doc ID: {doc_id}  (Score: {score})")
        print(textwrap.fill(text, 100))

🔍 Query: vaccine side effects

🔹 Doc ID: NCT00000683  (Score: 2)
A Phase I Multicenter, Randomized, Double-Blind Trial to Evaluate the Safety and Immunogenicity of
Recombinant Vaccinia Virus Expressing the Envelope Glycoproteins of Human Immunodeficiency Virus
Evaluation of the safety and immunogenicity (immunological reactivity) of HIVAC-1e vaccine.       An
additional goal is to determine which dose level of vaccine might be most effective.       Specific
questions to be addressed in this part of the study include: Are there adverse       reactions to
gp160 vaccine when given to vaccinees previously immunized with a       vaccinia-recombinant? Does
gp160 vaccination of prior HIVAC-1e vaccine result in stimulation       of neutralizing antibody and
other humoral immune responses? Does vaccination with gp160       enhance the development of cell-
mediated immune responses in HIVAC-1e vaccinees? Is the       magnitude of immune response to gp160
booster immunization greater following pr

In [2]:
import ir_datasets

dataset = ir_datasets.load("clinicaltrials/2017/trec-pm-2017")

# Check what's available
print("Provides:", dataset)
print("\nFirst few queries:")
for q in dataset.queries_iter():
    print(q)
    break

print("\nFirst few qrels:")
for qrel in dataset.qrels_iter():
    print(qrel)
    break

Provides: Dataset(id='clinicaltrials/2017/trec-pm-2017', provides=['docs', 'queries', 'qrels'])

First few queries:


[INFO] [starting] https://trec.nist.gov/data/precmed/topics2017.xml
[INFO] [finished] https://trec.nist.gov/data/precmed/topics2017.xml: [00:00] [5.66kB] [?B/s]
                                                                          

TrecPm2017Query(query_id='1', disease='Liposarcoma', gene='CDK4 Amplification', demographic='38-year-old male', other='GERD')

First few qrels:


[INFO] [starting] https://trec.nist.gov/data/precmed/qrels-final-trials.txt
[INFO] [finished] https://trec.nist.gov/data/precmed/qrels-final-trials.txt: [00:00] [244kB] [735kB/s]
                                                                                    

TrecQrel(query_id='1', doc_id='NCT00001188', relevance=0, iteration='0')




In [3]:
import ir_datasets

dataset = ir_datasets.load("clinicaltrials/2017/trec-pm-2017")
qrels = dataset.qrels_iter()

# Save to TREC-compatible QREL file
with open("clinical_trials_qrels.txt", "w", encoding="utf-8") as f:
    for qrel in qrels:
        f.write(f"{qrel.query_id} 0 {qrel.doc_id} {qrel.relevance}\n")

print("✅ Saved QRELs to clinical_trials_qrels.txt")

✅ Saved QRELs to clinical_trials_qrels.txt


In [2]:
import pytrec_eval
import pandas as pd

# === Load QRELs ===
qrels = {}
with open("clinical_trials_qrels.txt", "r", encoding="utf-8") as f:
    for line in f:
        query_id, _, doc_id, relevance = line.strip().split()
        qrels.setdefault(query_id, {})[doc_id] = int(relevance)

# === Load TREC-style results ===
def load_run(file_path):
    run = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            qid, _, docid, rank, score, _ = line.strip().split()
            run.setdefault(qid, {})[docid] = float(score)
    return run

# === Load each run ===
runs = {
    "TF-IDF": load_run("clinical_trials_tfidf_results.txt"),
    "BM25": load_run("clinical_trials_bm25_results.txt"),
    "BERT": load_run("clinical_trials_bert_results.txt"),
    "Hybrid": load_run("clinical_trials_hybrid_results.txt"),
}

# === Define the correct metric names for pytrec_eval ===
metrics = {'map', 'recip_rank', 'P_10', 'recall_1000'}  # <- Fix here

evaluator = pytrec_eval.RelevanceEvaluator(qrels, metrics)

# === Evaluate and print results ===
for name, run in runs.items():
    results = evaluator.evaluate(run)

    avg_metrics = {
        metric: sum(query_scores[metric] for query_scores in results.values()) / len(results)
        for metric in metrics
    }

    print(f"\n📊 Results for {name}")
    print(f"  MAP:         {avg_metrics['map']:.4f}")
    print(f"  MRR:         {avg_metrics['recip_rank']:.4f}")
    print(f"  Recall@1000: {avg_metrics['recall_1000']:.4f}")
    print(f"  Precision@10:{avg_metrics['P_10']:.4f}")


📊 Results for TF-IDF
  MAP:         0.0000
  MRR:         0.0000
  Recall@1000: 0.0000
  Precision@10:0.0000

📊 Results for BM25
  MAP:         0.0000
  MRR:         0.0000
  Recall@1000: 0.0000
  Precision@10:0.0000

📊 Results for BERT
  MAP:         0.0000
  MRR:         0.0000
  Recall@1000: 0.0000
  Precision@10:0.0000

📊 Results for Hybrid
  MAP:         0.0000
  MRR:         0.0000
  Recall@1000: 0.0000
  Precision@10:0.0000


In [2]:
import ir_datasets

dataset = ir_datasets.load("clinicaltrials/2017/trec-pm-2017")

with open("clinical_trials_queries.txt", "w", encoding="utf-8") as f:
    for query in dataset.queries_iter():
        # Concatenate fields to form the full query string
        text = f"{query.disease} {query.gene} {query.demographic}".strip()
        f.write(f"{query.query_id}\t{text}\n")

print("✅ Saved real clinical_trials_queries.txt from TREC PM 2017")

✅ Saved real clinical_trials_queries.txt from TREC PM 2017


In [5]:
import pandas as pd
import joblib
from sklearn.metrics.pairwise import cosine_similarity

# 🔧 Define custom tokenizer used during TF-IDF indexing
def whitespace_tokenizer(text):
    return text.split()

# 📂 Load metadata and TF-IDF components
df_meta = pd.read_csv("clinical_trials_metadata.csv")
tfidf_vectorizer = joblib.load("clinical_trials_vectorizer.pkl")
tfidf_matrix = joblib.load("clinical_trials_tfidf_matrix.pkl")

# 🔍 Load official queries
queries = pd.read_csv("clinical_trials_queries.txt", sep='\t', names=["query_id", "text"])

# 💾 Write ranked results to file
with open("clinical_trials_tfidf_results.txt", "w", encoding="utf-8") as f:
    for _, row in queries.iterrows():
        query_id, text = row["query_id"], row["text"]
        query_vec = tfidf_vectorizer.transform([text])
        scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
        top_indices = scores.argsort()[::-1][:1000]
        for rank, idx in enumerate(top_indices, start=1):
            doc_id = df_meta.iloc[idx]["doc_id"]
            f.write(f"{query_id} Q0 {doc_id} {rank} {scores[idx]:.4f} TFIDF\n")

print("✅ TF-IDF results saved to clinical_trials_tfidf_results.txt")

✅ TF-IDF results saved to clinical_trials_tfidf_results.txt


In [6]:
import pandas as pd
import joblib
from rank_bm25 import BM25Okapi

# Load data
df_meta = pd.read_csv("clinical_trials_metadata.csv")
queries = pd.read_csv("clinical_trials_queries.txt", sep="\t", names=["query_id", "text"])

# Load BM25 model and corpus
with open("clinical_trials_bm25_corpus.pkl", "rb") as f:
    corpus = joblib.load(f)
with open("clinical_trials_bm25_model.pkl", "rb") as f:
    bm25 = joblib.load(f)

doc_ids = df_meta['doc_id'].tolist()

with open("clinical_trials_bm25_results.txt", "w", encoding="utf-8") as f:
    for _, row in queries.iterrows():
        query_id, query_text = row['query_id'], row['text']
        tokenized_query = query_text.lower().split()
        scores = bm25.get_scores(tokenized_query)
        ranked_indices = scores.argsort()[::-1]

        for rank, idx in enumerate(ranked_indices[:1000]):
            f.write(f"{query_id} Q0 {doc_ids[idx]} {rank+1} {scores[idx]:.4f} BM25\n")

print("✅ BM25 results saved to clinical_trials_bm25_results.txt")

✅ BM25 results saved to clinical_trials_bm25_results.txt


In [7]:
import pandas as pd
import numpy as np
import joblib
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load data
df_meta = pd.read_csv("clinical_trials_metadata.csv")
queries = pd.read_csv("clinical_trials_queries.txt", sep="\t", names=["query_id", "text"])
doc_embeddings = joblib.load("clinical_trials_bert_embeddings.pkl")

# Load BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

with open("clinical_trials_bert_results.txt", "w", encoding="utf-8") as f:
    for _, row in queries.iterrows():
        query_id, query_text = row['query_id'], row['text']
        query_embedding = model.encode([query_text])
        similarities = cosine_similarity(query_embedding, doc_embeddings)[0]
        ranked_indices = np.argsort(similarities)[::-1]

        for rank, idx in enumerate(ranked_indices[:1000]):
            f.write(f"{query_id} Q0 {df_meta['doc_id'].iloc[idx]} {rank+1} {similarities[idx]:.4f} BERT\n")

print("✅ BERT results saved to clinical_trials_bert_results.txt")

✅ BERT results saved to clinical_trials_bert_results.txt


In [8]:
import pandas as pd
import numpy as np
import joblib
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from sklearn.preprocessing import minmax_scale

# Load data
df_meta = pd.read_csv("clinical_trials_metadata.csv")
queries = pd.read_csv("clinical_trials_queries.txt", sep="\t", names=["query_id", "text"])

# Load BM25
with open("clinical_trials_bm25_corpus.pkl", "rb") as f:
    corpus = joblib.load(f)
with open("clinical_trials_bm25_model.pkl", "rb") as f:
    bm25 = joblib.load(f)

# Load BERT
doc_embeddings = joblib.load("clinical_trials_bert_embeddings.pkl")
model = SentenceTransformer('all-MiniLM-L6-v2')

doc_ids = df_meta['doc_id'].tolist()

with open("clinical_trials_hybrid_results.txt", "w", encoding="utf-8") as f:
    for _, row in queries.iterrows():
        query_id, query_text = row['query_id'], row['text']
        tokenized_query = query_text.lower().split()
        query_embedding = model.encode([query_text])

        bm25_scores = bm25.get_scores(tokenized_query)
        cosine_scores = np.dot(doc_embeddings, query_embedding[0])

        # Normalize scores
        bm25_norm = minmax_scale(bm25_scores)
        cosine_norm = minmax_scale(cosine_scores)

        hybrid_scores = 0.5 * bm25_norm + 0.5 * cosine_norm
        ranked_indices = np.argsort(hybrid_scores)[::-1]

        for rank, idx in enumerate(ranked_indices[:1000]):
            f.write(f"{query_id} Q0 {doc_ids[idx]} {rank+1} {hybrid_scores[idx]:.4f} Hybrid\n")

print("✅ Hybrid results saved to clinical_trials_hybrid_results.txt")

✅ Hybrid results saved to clinical_trials_hybrid_results.txt


In [9]:
import pytrec_eval
import pandas as pd

# === Load QRELs ===
qrels = {}
with open("clinical_trials_qrels.txt", "r", encoding="utf-8") as f:
    for line in f:
        query_id, _, doc_id, relevance = line.strip().split()
        qrels.setdefault(query_id, {})[doc_id] = int(relevance)

# === Load TREC-style results ===
def load_run(file_path):
    run = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            qid, _, docid, rank, score, _ = line.strip().split()
            run.setdefault(qid, {})[docid] = float(score)
    return run

# === Load each run ===
runs = {
    "TF-IDF": load_run("clinical_trials_tfidf_results.txt"),
    "BM25": load_run("clinical_trials_bm25_results.txt"),
    "BERT": load_run("clinical_trials_bert_results.txt"),
    "Hybrid": load_run("clinical_trials_hybrid_results.txt"),
}

# === Define the correct metric names for pytrec_eval ===
metrics = {'map', 'recip_rank', 'P_10', 'recall_1000'}  # <- Fix here

evaluator = pytrec_eval.RelevanceEvaluator(qrels, metrics)

# === Evaluate and print results ===
for name, run in runs.items():
    results = evaluator.evaluate(run)

    avg_metrics = {
        metric: sum(query_scores[metric] for query_scores in results.values()) / len(results)
        for metric in metrics
    }

    print(f"\n📊 Results for {name}")
    print(f"  MAP:         {avg_metrics['map']:.4f}")
    print(f"  MRR:         {avg_metrics['recip_rank']:.4f}")
    print(f"  Recall@1000: {avg_metrics['recall_1000']:.4f}")
    print(f"  Precision@10:{avg_metrics['P_10']:.4f}")


📊 Results for TF-IDF
  MAP:         0.0021
  MRR:         0.0085
  Recall@1000: 0.0441
  Precision@10:0.0000

📊 Results for BM25
  MAP:         0.2169
  MRR:         0.5537
  Recall@1000: 0.6865
  Precision@10:0.3138

📊 Results for BERT
  MAP:         0.0567
  MRR:         0.3763
  Recall@1000: 0.4365
  Precision@10:0.1207

📊 Results for Hybrid
  MAP:         0.1859
  MRR:         0.6060
  Recall@1000: 0.6965
  Precision@10:0.3034
