### Imports

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score
from sklearn.metrics import cohen_kappa_score


# Load Corpus & Queries

In [None]:
# Read the corpus and the queries, one entry per line of each file.
# Entries keep their trailing newline characters.
with open('../data/corpus.txt', mode='r', encoding='utf-8') as corpus_file:
    corpus = list(corpus_file)

with open('../data/queries.txt', mode='r', encoding='utf-8') as queries_file:
    queries = list(queries_file)

# TF-IDF Vectorization

In [None]:
# Fit a TF-IDF model (library default settings) on the corpus lines. The fitted
# ``vectorizer`` is kept at module level because get_top_k_documents reuses it
# to transform each query with the same vocabulary.
vectorizer = TfidfVectorizer()
doc_vectors = vectorizer.fit_transform(corpus)

# Cosine Similarity for Top-10 Documents

In [None]:
def get_top_k_documents(query, doc_vectors, k=10):
    """Return the ``k`` documents most similar to ``query``, best first.

    The query is transformed with the module-level ``vectorizer``, which must
    already be fitted when this function is called.

    Args:
        query: Raw query text.
        doc_vectors: Document matrix handed to ``cosine_similarity``
            (presumably the output of ``vectorizer.fit_transform`` -- confirm
            at the call site).
        k: Maximum number of documents to return.

    Returns:
        A tuple ``(indices, scores)`` of NumPy arrays ordered by descending
        cosine similarity. Both arrays are empty when ``k <= 0``; they hold
        fewer than ``k`` entries when there are fewer than ``k`` documents.
    """
    if k <= 0:
        # Without this guard ``[-0:]`` would select the *whole* array and a
        # negative k would produce an arbitrary slice of the ranking.
        return np.empty(0, dtype=np.intp), np.empty(0, dtype=float)

    query_vec = vectorizer.transform([query])
    cosine_sim = cosine_similarity(query_vec, doc_vectors).flatten()
    # argsort is ascending: take the last k positions, then reverse them.
    top_k_indices = cosine_sim.argsort()[-k:][::-1]
    return top_k_indices, cosine_sim[top_k_indices]

# Evaluate Precision@K and MAP

In [None]:
def precision_at_k(relevant_docs, retrieved_docs, k):
    """Return Precision@k for a single query.

    Args:
        relevant_docs: Iterable of document ids judged relevant.
        retrieved_docs: Ranked sequence of retrieved document ids, best first.
        k: Cut-off rank; must be a positive integer.

    Returns:
        The number of distinct relevant ids among the first ``k`` retrieved
        ids, divided by ``k``. The denominator is always ``k``, even when
        fewer than ``k`` documents were retrieved.

    Raises:
        ValueError: If ``k`` is not positive (a zero cut-off would divide by
            zero and a negative one would yield a negative score).
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k!r}")
    relevant = set(relevant_docs)
    hits = relevant.intersection(retrieved_docs[:k])
    return len(hits) / k

def mean_average_precision(queries_relevant_docs, queries_retrieved_docs):
    """Return the mean of the per-query average precision (MAP).

    Args:
        queries_relevant_docs: One collection of relevant document ids per
            query.
        queries_retrieved_docs: One ranked sequence of retrieved document ids
            per query, best first, in the same query order.

    Returns:
        The mean average precision. A query with no relevant documents
        contributes 0, and 0.0 is returned when there are no queries at all.
        The two inputs are paired with ``zip``, so surplus entries in the
        longer one are ignored.
    """
    ap_scores = []
    for rel_docs, ret_docs in zip(queries_relevant_docs, queries_retrieved_docs):
        relevant = set(rel_docs)  # O(1) membership tests in the inner loop
        if not relevant:
            ap_scores.append(0.0)
            continue

        already_hit = set()
        hits = 0
        sum_precisions = 0.0
        for rank, doc in enumerate(ret_docs, start=1):
            # Count each relevant document once, so a ranking that repeats an
            # id cannot push the average precision above 1.
            if doc in relevant and doc not in already_hit:
                already_hit.add(doc)
                hits += 1
                sum_precisions += hits / rank
        ap_scores.append(sum_precisions / len(relevant))

    if not ap_scores:
        # np.mean([]) would return NaN and emit a RuntimeWarning.
        return 0.0
    return np.mean(ap_scores)

# Placeholder relevance judgments for the demo (replace with actual relevance judgments).
# Entry i lists the relevant document indices for query i.
relevant_docs_list = [[0, 3, 5], [2, 7], [1, 4, 6]]

# Rank the corpus for every query and keep only the top-10 document indices.
retrieved_docs_list = [
    get_top_k_documents(query, doc_vectors, k=10)[0].tolist()
    for query in queries
]

# Report the mean Precision@k over all queries at each cut-off.
for k in (5, 6, 10):
    precisions = []
    for rel, ret in zip(relevant_docs_list, retrieved_docs_list):
        precisions.append(precision_at_k(rel, ret, k))
    print(f"Precision@{k}: {np.mean(precisions):.3f}")

# Report MAP over the same rankings.
map_score = mean_average_precision(relevant_docs_list, retrieved_docs_list)
print(f"Mean Average Precision (MAP): {map_score:.3f}")

# Compute MRR (Mean Reciprocal Rank)

In [None]:
def compute_mrr(relevant_docs_list, retrieved_docs_list):
    """Return the Mean Reciprocal Rank (MRR) over all queries.

    Args:
        relevant_docs_list: One collection of relevant document ids per query.
        retrieved_docs_list: One ranked sequence of retrieved document ids per
            query, best first, in the same query order.

    Returns:
        The mean of ``1 / rank`` of the first relevant document in each
        ranking (ranks start at 1). A query whose ranking contains no relevant
        document contributes 0, and 0.0 is returned when there are no queries.
        The two inputs are paired with ``zip``, so surplus entries in the
        longer one are ignored.
    """
    rr_scores = []
    for rel_docs, ret_docs in zip(relevant_docs_list, retrieved_docs_list):
        relevant = set(rel_docs)  # O(1) membership tests
        # Reciprocal rank of the first relevant hit, or 0 if there is none.
        rr = next(
            (1 / rank for rank, doc in enumerate(ret_docs, start=1) if doc in relevant),
            0,
        )
        rr_scores.append(rr)

    if not rr_scores:
        # np.mean([]) would return NaN and emit a RuntimeWarning.
        return 0.0
    return np.mean(rr_scores)

# Score the module-level rankings against the relevance judgments and print
# the result rounded to three decimals.
mrr = compute_mrr(relevant_docs_list, retrieved_docs_list)
print(f"Mean Reciprocal Rank (MRR): {mrr:.3f}")

# Inter-Annotator Agreement (Cohen's Kappa)

In [None]:
# Example annotations (dummy data). Columns: doc_id, label
annotator1 = pd.read_csv('../annotations/annotator1.csv')
annotator2 = pd.read_csv('../annotations/annotator2.csv')

# Pair the two annotators' labels by doc_id rather than by row position, so
# that files listing the documents in a different order are still compared
# document-for-document. Only documents labelled by both annotators are kept
# (inner join); validate='one_to_one' raises if a doc_id is repeated in
# either file instead of silently multiplying rows.
paired_labels = annotator1.merge(
    annotator2,
    on='doc_id',
    suffixes=('_a1', '_a2'),
    validate='one_to_one',
)

kappa = cohen_kappa_score(paired_labels['label_a1'], paired_labels['label_a2'])
print(f"Cohen's Kappa: {kappa:.3f}")

# Cell 8: Discussion on Kappa Score
# Bands are checked from the highest lower bound downwards; the first bound
# that kappa reaches names the agreement level. Anything below 0.40 keeps the
# default label.
agreement_level = "Poor Agreement"
for lower_bound, level_name in (
    (0.80, "Very Good Agreement"),
    (0.60, "Substantial Agreement"),
    (0.40, "Moderate Agreement"),
):
    if kappa >= lower_bound:
        agreement_level = level_name
        break

print(f"Agreement Level: {agreement_level}")
print("To improve Kappa, we can refine annotation guidelines, conduct calibrations, and resolve ambiguous definitions.")