**Imports , NLKT Setup and Query Tokenizer (From Part 1)**


In [1]:
import pandas as pd
import numpy as np
import json
import math
from collections import defaultdict, Counter
import re, unicodedata
import os
import sys 

# --- NLTK Components for Tokenization ---
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
# Asume que NLTK ya está instalado y descargado (como en tu Notebook de Parte 1)

_STEM = PorterStemmer()
_STOP = set(stopwords.words("english"))
_PUNCT = re.compile(r"[^\w\s]+", re.UNICODE)

def _norm(s: str) -> str:
    """Normalization utility from Part 1 (used for categorical/numeric fields)."""
    if not isinstance(s, str): return ""
    s = unicodedata.normalize("NFKC", s).lower()
    s = re.sub(r"[^\w\s]+", " ", s)
    return re.sub(r"\s+", " ", s).strip()

def build_terms(text: str) -> list[str]:
    """Applies Part 1 preprocessing to the query (tokenize, stem, filter)."""
    if not isinstance(text, str): return []
    s = unicodedata.normalize("NFKC", text.lower())
    s = _PUNCT.sub(" ", s)
    toks = [t for t in s.split() if t not in _STOP]
    toks = [_STEM.stem(t) for t in toks]
    # Retiene caracteres individuales no numéricos (fix para H&M)
    return [t for t in toks if not t.isdigit()]

**Inverted Index Building**

In [2]:
class InvertedIndex:
    """Stores DF, Posting List (pid, tf), and Document Length (L_d)."""
    def __init__(self):
        # Index: { term: { 'df': int, 'postings': { pid: tf } } }
        self.index = defaultdict(lambda: {'df': 0, 'postings': {}})
        # L_d: { pid: L_d } (Euclidean norm for VSM/Cosine Similarity)
        self.doc_lengths = {}
        self.num_docs = 0

    def add_document(self, doc_id: str, tokens: list[str]):
        """Calculates TF and L_d, and adds terms to postings."""
        tf_counts = Counter(tokens)
        
        # L_d (Euclidean norm of the raw term vector)
        L_d = math.sqrt(sum(tf_counts[term]**2 for term in tf_counts))
        self.doc_lengths[doc_id] = L_d
        
        # Update postings list and document frequency (df)
        for term, tf in tf_counts.items():
            if doc_id not in self.index[term]['postings']:
                self.index[term]['df'] += 1
            self.index[term]['postings'][doc_id] = tf
        
        self.num_docs += 1

    def build_from_dataframe(self, df: pd.DataFrame):
        """Builds index from Part 1 processed data."""
        self.num_docs = len(df)
        
        for index, row in df.iterrows():
            pid = row['pid']
            # Accessing the pre-processed token columns
            tokens = row['title_tokens'] + row['desc_tokens'] + row.get('details_tokens', [])
            self.add_document(pid, tokens)
        
        print(f"Index built with {self.num_docs} documents and {len(self.index)} terms.")
    
    def get_term_stats(self, term):
        """Retrieves statistics for a given term."""
        return self.index.get(term, {'df': 0, 'postings': {}})

**TF-IDF Ranking and Retrieval**

In [3]:
def calculate_tfidf_weight(tf, df, N):
    """Calculates the W_t,d or W_t,q TF-IDF weight."""
    # TF Component: 1 + log(tf) (Log-frequency weighting)
    tf_comp = 1 + math.log10(tf) if tf > 0 else 0
    # IDF Component: log(N/df)
    idf_comp = math.log10(N / df) if df > 0 else 0
    return tf_comp * idf_comp

def ranked_search(query: str, index: InvertedIndex) -> list[tuple[str, float]]:
    """
    Retrieves documents using strict AND logic and ranks them using Cosine Similarity.
    """
    N = index.num_docs
    # Use the existing tokenizer for query
    query_tokens = build_terms(query) 
    if not query_tokens: return []
    
    # --- 1. Retrieval (Conjunctive AND Logic) ---
    doc_sets = []
    for term in query_tokens:
        postings = index.get_term_stats(term)['postings']
        if not postings:
            return [] # Empty result if any term is missing
        doc_sets.append(set(postings.keys()))
        
    retrieved_pids = list(set.intersection(*doc_sets))
    
    # --- 2. Ranking (Vector Space Model: Cosine Similarity) ---
    scores = defaultdict(float)
    q_tf_counts = Counter(query_tokens)
    
    for pid in retrieved_pids:
        score = 0
        L_d = index.doc_lengths.get(pid, 1.0)
        
        # Calculate Dot Product: sum(W_t,q * W_t,d)
        for term in query_tokens:
            term_stats = index.get_term_stats(term)
            df = term_stats['df']
            
            # W_t,d (Document weight)
            tf_d = term_stats['postings'].get(pid, 0)
            W_t_d = calculate_tfidf_weight(tf_d, df, N)
            
            # W_t,q (Query weight)
            tf_q = q_tf_counts[term]
            W_t_q = calculate_tfidf_weight(tf_q, df, N)

            score += W_t_q * W_t_d
            
        # Cosine Similarity (Score = Dot Product / L_d)
        scores[pid] = score / L_d
        
    # --- 3. Sort Results ---
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)

**Evaluation Metrics**

In [4]:
def get_relevance_labels(query_id, retrieved_pids, df_labels):
    """Returns ranked binary relevance scores (1/0) and R_total."""
    relevant_labels_df = df_labels[df_labels['query_id'] == query_id]
    relevant_pids = set(relevant_labels_df[relevant_labels_df['relevance'] == 1]['pid'])
    R_total = len(relevant_pids)
    relevance_scores = [1 if pid in relevant_pids else 0 for pid in retrieved_pids]
    return relevance_scores, R_total

# --- Standard Cutoff Metrics ---
def precision_at_k(rel_scores, k):
    """P@K (Required i)"""
    if k == 0 or not rel_scores: return 0.0
    k = min(k, len(rel_scores))
    return sum(rel_scores[:k]) / k

def recall_at_k(rel_scores, R_total, k):
    """R@K (Required ii)"""
    if R_total == 0: return 0.0
    k = min(k, len(rel_scores))
    return sum(rel_scores[:k]) / R_total

def f1_score_at_k(P_at_k, R_at_k):
    """F1-Score@K (Required iv)"""
    if P_at_k + R_at_k == 0: return 0.0
    return 2 * P_at_k * R_at_k / (P_at_k + R_at_k)

# --- Ranking Metrics ---
def average_precision_at_k(rel_scores, k):
    """AP@K (Required iii)"""
    if not rel_scores: return 0.0
    k = min(k, len(rel_scores))
    sum_of_precisions = 0.0
    num_relevant = 0
    for i in range(k):
        if rel_scores[i] == 1:
            num_relevant += 1
            sum_of_precisions += num_relevant / (i + 1)
    return sum_of_precisions / num_relevant if num_relevant > 0 else 0.0

def mean_average_precision(aps_list):
    """MAP (Required v)"""
    return sum(aps_list) / len(aps_list) if aps_list else 0.0

def mean_reciprocal_rank(rel_scores: list):
    """MRR (Required vi)"""
    for i, rel in enumerate(rel_scores):
        if rel == 1:
            return 1.0 / (i + 1)
    return 0.0

def ndcg_at_k(rel_scores, k):
    """NDCG@K (Required vii)"""
    k = min(k, len(rel_scores))
    
    # DCG (Actual Ranking)
    dcg = sum(rel_scores[i] / math.log2(i + 2) for i in range(k))
    
    # IDCG (Ideal Ranking)
    ideal_scores = sorted(rel_scores, reverse=True)
    idcg = sum(ideal_scores[i] / math.log2(i + 2) for i in range(k))
        
    return dcg / idcg if idcg > 0.0 else 0.0

**Execution - Build Index and Define Queries**

In [5]:
# --- A. Setup ---
INDEX_FILE = 'data/index/inverted_index.json'
PROCESSED_DATA_FILE = 'data/processed/products_clean.parquet'
LABELS_FILE = 'data/raw/validation_labels.csv'

os.makedirs(os.path.dirname(INDEX_FILE), exist_ok=True)

# 1. Load ground truth labels
df_labels = pd.read_csv(LABELS_FILE)

# --- B. Index Construction ---
print("--- 1. BUILDING INVERTED INDEX ---")

# Check for processed file existence 
if not os.path.exists(PROCESSED_DATA_FILE):
    print(f"Error: Processed data not found at {PROCESSED_DATA_FILE}. Run Part 1 script.")
    sys.exit()

df_clean = pd.read_parquet(PROCESSED_DATA_FILE)

INDEX = InvertedIndex()
INDEX.build_from_dataframe(df_clean)

# Save index 
with open(INDEX_FILE, 'w') as f:
    json.dump({'index': dict(INDEX.index), 'doc_lengths': dict(INDEX.doc_lengths)}, f)

print(f"Index saved to {INDEX_FILE}. Documents: {INDEX.num_docs}")

# --- C. Query Definitions (Rubric: Propose test queries - 1 point) ---
QUERY_1 = "women full sleeve sweatshirt cotton"
QUERY_2 = "men slim jeans blue"

# New test queries (5 new queries)
NEW_QUERIES_LIST = [
    {"query_id": 3, "query": "long sleeve denim jacket blue"}, 
    {"query_id": 4, "query": "reeb shoe sport white"},       
    {"query_id": 5, "query": "cheap men polo shirt black"},   
    {"query_id": 6, "query": "tight fit short skirt women"},  
    {"query_id": 7, "query": "low price formal trouser"},     
]
ALL_QUERIES = [
    {'query_id': 1, 'query': QUERY_1},
    {'query_id': 2, 'query': QUERY_2}
] + NEW_QUERIES_LIST


FileNotFoundError: [Errno 2] No such file or directory: 'data/raw/validation_labels.csv'

**Evaluation 1 & 2 Results**

In [7]:
print("\n--- 2. RUNNING RANKED SEARCH AND EVALUATION ---")

RESULTS = []
AP_SCORES = [] 

for q_info in ALL_QUERIES:
    query_id = q_info['query_id']
    query_text = q_info['query']
    
    # Run the TF-IDF Ranked Search (AND retrieval)
    ranked_pids_scores = ranked_search(query_text, INDEX)
    retrieved_pids = [pid for pid, score in ranked_pids_scores]

    # --- Get Relevance Labels ---
    relevance_scores, R_total = get_relevance_labels(query_id, retrieved_pids, df_labels)

    # --- Calculate Metrics (Cutoff K=10) ---
    k = 10 
    
    P_k = precision_at_k(relevance_scores, k)
    R_k = recall_at_k(relevance_scores, R_total, k)
    F1_k = f1_score_at_k(P_k, R_k)
    AP_k = average_precision_at_k(relevance_scores, k)
    MRR_score = mean_reciprocal_rank(relevance_scores)
    NDCG_k = ndcg_at_k(relevance_scores, k)
    
    AP_SCORES.append(AP_k)
    
    # Collect results
    RESULTS.append({
        'Query ID': query_id,
        'Query Text': query_text,
        'R_Total': R_total,
        'Retrieved': len(retrieved_pids),
        'P@10': round(P_k, 3),
        'R@10': round(R_k, 3),
        'F1@10': round(F1_k, 3),
        'AP@10': round(AP_k, 3),
        'MRR': round(MRR_score, 3),
        'NDCG@10': round(NDCG_k, 3),
    })

# --- D. Final Results and MAP Calculation ---
df_results = pd.DataFrame(RESULTS)
MAP_score = mean_average_precision(AP_SCORES)

print("\n--- 3. FINAL EVALUATION METRICS (Rounded to 3 decimals) ---")
# The 'display' function is often used in Jupyter Notebooks
# If run outside, uncomment: print(df_results)
try:
    from IPython.display import display
    display(df_results)
except ImportError:
    print(df_results)

print(f"\nMean Average Precision (MAP) across all {len(ALL_QUERIES)} queries: {round(MAP_score, 3)}")


--- 2. RUNNING RANKED SEARCH AND EVALUATION ---


NameError: name 'ALL_QUERIES' is not defined

**Ground Truth for New Queries**

In [8]:

# The assignment requires you to manually define the ground truth for Q3-Q7 
# and update your validation_labels.csv file. 

# ACTION REQUIRED:
# 1. Inspect the PIDs retrieved by the search engine for Queries 3 through 7.
# 2. For those PIDs, manually judge relevance (1 or 0).
# 3. Add these new relevance judgments to your data/raw/validation_labels.csv file.
# 4. Include a detailed table of your manual judgments in your final PDF report.

# Example:
# Query 3: 'long sleeve denim jacket blue'
# - PID_12345: Relevant (1) because it is a denim jacket.
# - PID_67890: Not Relevant (0) because it is a denim dress.
