In [6]:
# Clone and compile trec_eval
!git clone https://github.com/usnistgov/trec_eval.git
!cd trec_eval && make


Cloning into 'trec_eval'...
remote: Enumerating objects: 1147, done.[K
remote: Counting objects: 100% (332/332), done.[K
remote: Compressing objects: 100% (100/100), done.[K
remote: Total 1147 (delta 264), reused 277 (delta 226), pack-reused 815 (from 1)[K
Receiving objects: 100% (1147/1147), 764.18 KiB | 9.67 MiB/s, done.
Resolving deltas: 100% (769/769), done.
gcc -g -I.  -Wall -Wno-macro-redefined -DVERSIONID=\"10.0-rc2\"  -o trec_eval trec_eval.c formats.c meas_init.c meas_acc.c meas_avg.c meas_print_single.c meas_print_final.c gain_init.c get_qrels.c get_trec_results.c get_prefs.c get_qrels_prefs.c get_qrels_jg.c form_res_rels.c form_res_rels_jg.c form_prefs_counts.c utility_pool.c get_zscores.c convert_zscores.c measures.c  m_map.c m_P.c m_num_q.c m_num_ret.c m_num_rel.c m_num_rel_ret.c m_gm_map.c m_Rprec.c m_recip_rank.c m_bpref.c m_iprec_at_recall.c m_recall.c m_Rprec_mult.c m_utility.c m_11pt_avg.c m_ndcg.c m_ndcg_cut.c m_Rndcg.c m_ndcg_rel.c m_binG.c m_G.c m_rel_P.c m_suc

In [18]:
import pandas as pd
import os

# Define columns and original filenames
columns = ["query_id", "Q0", "doc_id", "rank", "score", "tag"]
result_files = {
    "Title + Text": ("Results_title_text.txt", True),
    "MiniLM Rerank": ("Results_miniLM.txt", False),
    "USELM Rerank": ("Results_useLM.txt", False),
    "doc2vec Rerank": ("Results_doc2vec.txt", False)
}

# Clean and save
for label, (original_file, has_header) in result_files.items():
    cleaned_file = f"/content/cleaned_{original_file}"
    try:
        df = pd.read_csv(
            f"/content/{original_file}",
            sep=r"\s+",
            header=0 if has_header else None,
            names=columns
        )
        df["query_id"] = df["query_id"].astype(str).str.replace("Q0-", "", regex=False)
        df.to_csv(cleaned_file, sep=" ", index=False, header=False)
        print(f"Cleaned {label} → {cleaned_file}")
    except Exception as e:
        print(f"Failed to process {original_file}: {e}")

# Evaluate with trec_eval
for label, (original_file, _) in result_files.items():
    cleaned_file = f"cleaned_{original_file}"
    print(f"\n{label}")
    if os.path.exists(f"/content/{cleaned_file}"):
        !cd trec_eval && ./trec_eval /content/test.qrel /content/{cleaned_file}
    else:
        print(f"File /content/{cleaned_file} not found. Skipping evaluation.")


Cleaned Title + Text → /content/cleaned_Results_title_text.txt
Cleaned MiniLM Rerank → /content/cleaned_Results_miniLM.txt
Cleaned USELM Rerank → /content/cleaned_Results_useLM.txt
Cleaned doc2vec Rerank → /content/cleaned_Results_doc2vec.txt

Title + Text
runid                 	all	a06e8ec5-c2ec-443b-8cc5-142febf16054
num_q                 	all	300
num_ret               	all	29933
num_rel               	all	339
num_rel_ret           	all	304
map                   	all	0.4842
gm_map                	all	0.1139
Rprec                 	all	0.3679
bpref                 	all	0.8949
recip_rank            	all	0.4957
iprec_at_recall_0.00  	all	0.4959
iprec_at_recall_0.10  	all	0.4959
iprec_at_recall_0.20  	all	0.4959
iprec_at_recall_0.30  	all	0.4958
iprec_at_recall_0.40  	all	0.4936
iprec_at_recall_0.50  	all	0.4915
iprec_at_recall_0.60  	all	0.4915
iprec_at_recall_0.70  	all	0.4894
iprec_at_recall_0.80  	all	0.4771
iprec_at_recall_0.90  	all	0.4733
iprec_at_recall_1.00  	all	0.4733
P_5      

In [13]:
def print_top10(df, query_id):
    """Print the ten best-ranked document ids for one query.

    Args:
        df: DataFrame with at least the columns "query_id", "rank" and "doc_id".
        query_id: Query to look up. It is compared as a string, so ``1`` and
            ``"1"`` select the same rows whether the "query_id" column holds
            integers or strings.

    Prints a header line followed by up to ten numbered doc ids ordered by
    ascending "rank". Only the header is printed when no row matches.
    """
    print(f"\nTop 10 documents for Query ID {query_id}:")
    # Compare on a string view of the column: pandas parses numeric ids as
    # integers by default, and an int column never equals str(query_id).
    matches = df[df["query_id"].astype(str) == str(query_id)]
    top10 = matches.sort_values("rank").head(10)
    for position, doc_id in enumerate(top10["doc_id"], 1):
        print(f"{position}. {doc_id}")


# MiniLM reranked results
# The separator is a raw string so the regex backslash is not treated as an
# (invalid) string escape, which newer Python versions warn about.
miniLM_df = pd.read_csv('/content/Results_miniLM.txt', sep=r'\s+', header=None,
                        names=["query_id", "Q0", "doc_id", "rank", "score", "tag"])
# Query ids are held as strings for the lookups below.
miniLM_df['query_id'] = miniLM_df['query_id'].astype(str)

print_top10(miniLM_df, 1)
print_top10(miniLM_df, 3)



Top 10 documents for Query ID 1:
1. 10786948
2. 35008773
3. 16287725
4. 7581911
5. 32001951
6. 42421723
7. 10342807
8. 825728
9. 4430962
10. 680949

Top 10 documents for Query ID 3:
1. 4414547
2. 12271486
3. 4632921
4. 23389795
5. 4378885
6. 19058822
7. 10145528
8. 14717500
9. 3823862
10. 32181055
