**Imports, NLTK Setup and Query Tokenizer (From Part 1)**


In [1]:
import pandas as pd
import numpy as np
import json
import math
from collections import defaultdict, Counter
import re, unicodedata
import os
import sys
from pathlib import Path
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from typing import List, Tuple, Dict

# --- NLTK Setup ---
# Probe the stopword corpus; if the lookup fails (LookupError), download it
# so that the module-level constants below can be built.
try:
    stopwords.words("english")
except LookupError:
    import nltk
    nltk.download("stopwords")

# Shared, module-level helpers used by build_terms():
_STEM = PorterStemmer()                        # stemmer applied to every kept token
_STOP = set(stopwords.words("english"))        # set for O(1) stopword membership tests
# One or more consecutive characters that are neither word characters nor whitespace.
_PUNCT = re.compile(r"[^\w\s]+", re.UNICODE)

def build_terms(text: str) -> List[str]:
    """Turn raw text into a list of normalized, stemmed terms.

    Pipeline: lowercase and NFKC-normalize, replace punctuation runs with a
    space, split on whitespace, drop English stopwords, stem each remaining
    word, and finally discard stems that are a single character or consist
    only of digits. Non-string input yields an empty list.
    """
    if not isinstance(text, str):
        return []

    normalized = unicodedata.normalize("NFKC", text.lower())
    without_punct = _PUNCT.sub(" ", normalized)

    terms: List[str] = []
    for word in without_punct.split():
        if word in _STOP:
            continue
        stem = _STEM.stem(word)
        # Length/digit filtering is applied to the stem, not the raw word.
        if len(stem) > 1 and not stem.isdigit():
            terms.append(stem)
    return terms



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


**Inverted Index Building**

In [2]:
class InvertedIndex:
    """In-memory inverted index holding the statistics needed for TF-IDF ranking.

    For every term it stores the document frequency and a posting map
    ``pid -> raw term frequency``; for every document it stores a length used
    to normalize scores.
    """

    # DataFrame columns that may hold a document's token sequence.
    _TOKEN_COLUMNS = ('title_tokens', 'desc_tokens', 'details_tokens')

    def __init__(self):
        # term -> {'df': number of documents containing the term,
        #          'postings': {pid: raw term frequency in that document}}
        self.index: Dict[str, Dict] = defaultdict(lambda: {'df': 0, 'postings': {}})
        # pid -> Euclidean norm of the document's raw term-frequency vector
        # (1.0 for documents without tokens, so it is always safe to divide by).
        self.doc_lengths: Dict[str, float] = {}
        # Number of rows in the DataFrame the index was built from.
        self.num_docs: int = 0

    @staticmethod
    def _as_token_list(value) -> list:
        """Return ``value`` as a plain list of tokens, or ``[]`` if it is not a sequence.

        List-valued columns loaded from Parquet typically arrive as
        ``numpy.ndarray`` rather than ``list``, so arrays (and tuples) are
        accepted as well; anything else (``None``, NaN, strings, ...) is ignored.
        """
        if isinstance(value, np.ndarray):
            return value.tolist()
        if isinstance(value, (list, tuple)):
            return list(value)
        return []

    def build_from_dataframe(self, df: pd.DataFrame):
        """Build the index from the Part 1 processed data.

        Args:
            df: DataFrame with a ``pid`` column and any of the token columns
                ``title_tokens``, ``desc_tokens``, ``details_tokens``. Missing
                token columns and non-sequence cell values are skipped.

        Raises:
            KeyError: if ``df`` has no ``pid`` column (and at least one row).
        """
        total = len(df)
        print(f"Building index from {total} documents...")

        token_columns = [col for col in self._TOKEN_COLUMNS if col in df.columns]

        # Progress is tracked by row position: the DataFrame index labels may be
        # non-integer or non-sequential, so they cannot be used for ``% 1000``.
        for position, (_, row) in enumerate(df.iterrows()):
            if position % 1000 == 0:
                print(f"  Processed {position}/{total} documents...")

            pid = str(row['pid'])

            # Combine all token columns into one bag of tokens.
            tokens = []
            for col in token_columns:
                tokens.extend(self._as_token_list(row[col]))

            # Raw term frequencies and the Euclidean length of that tf vector.
            tf_counts = Counter(tokens)
            L_d = math.sqrt(sum(tf ** 2 for tf in tf_counts.values()))
            self.doc_lengths[pid] = L_d if L_d > 0 else 1.0

            # Update postings; df only grows the first time a pid is seen for
            # a term, so a repeated pid overwrites its tf without inflating df.
            for term, tf in tf_counts.items():
                entry = self.index[term]
                if pid not in entry['postings']:
                    entry['df'] += 1
                entry['postings'][pid] = tf

        self.num_docs = total
        print(f"Index built: {self.num_docs} documents, {len(self.index)} unique terms.")

    def get_term_stats(self, term):
        """Return ``{'df': ..., 'postings': {...}}`` for ``term``.

        Uses ``dict.get`` so that looking up an unknown term returns an empty
        entry without inserting it into the underlying ``defaultdict``.
        """
        return self.index.get(term, {'df': 0, 'postings': {}})

**TF-IDF Ranking and Retrieval**

In [3]:
def calculate_tfidf_weight(tf, df, N):
    """Return the log-scaled TF-IDF weight ``(1 + log10(tf)) * log10(N / df)``.

    The TF factor is 0 when ``tf`` is not positive and the IDF factor is 0
    when ``df`` is not positive, so either condition yields a weight of 0.
    """
    if tf > 0:
        tf_comp = 1 + math.log10(tf)
    else:
        tf_comp = 0

    if df > 0:
        idf_comp = math.log10(N / df)
    else:
        idf_comp = 0

    return tf_comp * idf_comp


def ranked_search(query: str, index: InvertedIndex) -> List[Tuple[str, float]]:
    """Retrieve documents matching ALL query terms and rank them by TF-IDF score.

    The query is tokenized with ``build_terms``. A document is a candidate only
    if it appears in the postings of every distinct query term (strict AND).
    Each candidate is scored as the dot product of the query and document
    TF-IDF weights over the distinct query terms, divided by the document
    length stored in the index. The query vector is not length-normalized;
    that factor is identical for every document and does not affect the order.

    Args:
        query: Raw query text.
        index: A built ``InvertedIndex``.

    Returns:
        ``(pid, score)`` pairs sorted by descending score; ties are broken by
        ascending pid so the ranking is reproducible across runs. Empty when
        the query has no usable terms, a term is absent from the index, or no
        document contains every term.
    """
    # Count each distinct term once; its multiplicity enters through tf_q only.
    # (Iterating the raw token list would add a repeated term's contribution
    # once per occurrence on top of the tf_q weighting.)
    q_tf_counts = Counter(build_terms(query))
    if not q_tf_counts:
        return []

    N = index.num_docs

    # Fetch statistics once per term (not once per document) while narrowing
    # the candidate set to the intersection of all posting lists.
    term_info = []  # (postings, df, query-side weight) per distinct term
    candidates = None
    for term, tf_q in q_tf_counts.items():
        term_stats = index.get_term_stats(term)
        df = term_stats['df']
        if df == 0:
            return []  # Term not in index, so the AND query has no results
        postings = term_stats['postings']
        if candidates is None:
            candidates = set(postings)
        else:
            candidates = candidates.intersection(postings)
        if not candidates:
            return []
        term_info.append((postings, df, calculate_tfidf_weight(tf_q, df, N)))

    scores = {}
    for pid in candidates:
        dot = 0.0
        for postings, df, w_t_q in term_info:
            w_t_d = calculate_tfidf_weight(postings.get(pid, 0), df, N)
            dot += w_t_q * w_t_d
        scores[pid] = dot / index.doc_lengths.get(pid, 1.0)

    # Secondary key on pid: set iteration order is not stable between runs.
    return sorted(scores.items(), key=lambda item: (-item[1], item[0]))

**Evaluation Metrics**

In [4]:
def get_relevance_labels(query_id, retrieved_pids, df_labels):
    """Map a ranked pid list to binary relevance labels for one query.

    Returns a tuple ``(relevance_scores, R_total)`` where ``relevance_scores``
    has one 1/0 entry per retrieved pid (in ranking order) and ``R_total`` is
    the number of distinct pids labelled relevant (``relevance == 1``) for
    ``query_id`` in ``df_labels``. Label pids are compared as strings.
    """
    is_relevant_row = (df_labels['query_id'] == query_id) & (df_labels['relevance'] == 1)
    relevant_pids = set(df_labels.loc[is_relevant_row, 'pid'].astype(str))

    relevance_scores = []
    for pid in retrieved_pids:
        relevance_scores.append(1 if pid in relevant_pids else 0)

    return relevance_scores, len(relevant_pids)


# --- Evaluation Metrics ---
def precision_at_k(rel_scores, k):
    """Fraction of relevant results among the top ``k`` of ``rel_scores``.

    When fewer than ``k`` results exist, the cutoff (and the denominator) is
    the number of results available. Returns 0.0 for ``k == 0`` or no results.
    """
    if k != 0 and rel_scores:
        cutoff = min(k, len(rel_scores))
        hits = sum(rel_scores[:cutoff])
        return hits / cutoff
    return 0.0

def recall_at_k(rel_scores, R_total, k):
    """Share of the ``R_total`` relevant documents found in the top ``k`` results.

    Returns 0.0 when ``R_total`` is 0.
    """
    if R_total == 0:
        return 0.0
    cutoff = min(k, len(rel_scores))
    hits = sum(rel_scores[:cutoff])
    return hits / R_total

def f1_score_at_k(P_at_k, R_at_k):
    """Harmonic mean of precision and recall; 0.0 when their sum is 0."""
    denominator = P_at_k + R_at_k
    if denominator == 0:
        return 0.0
    numerator = 2 * P_at_k * R_at_k
    return numerator / denominator

def average_precision_at_k(rel_scores, k):
    """Average of precision values taken at each relevant rank within the top ``k``.

    The average is over the relevant results actually found in the top ``k``
    (not over all relevant documents). Returns 0.0 when there are no results
    or no relevant result in the top ``k``.
    """
    if not rel_scores:
        return 0.0

    cutoff = min(k, len(rel_scores))
    hits = 0
    precision_total = 0.0
    for rank, rel in zip(range(1, cutoff + 1), rel_scores):
        if rel != 1:
            continue
        hits += 1
        precision_total += hits / rank

    if hits > 0:
        return precision_total / hits
    return 0.0

def mean_average_precision(aps_list):
    """Arithmetic mean of per-query average-precision values; 0.0 if empty."""
    if not aps_list:
        return 0.0
    return sum(aps_list) / len(aps_list)

def mean_reciprocal_rank(rel_scores):
    """Reciprocal of the 1-based rank of the first relevant result.

    Operates on a single ranked list; returns 0.0 when nothing is relevant.
    """
    first_hit_rank = next(
        (rank for rank, rel in enumerate(rel_scores, start=1) if rel == 1),
        None,
    )
    if first_hit_rank is None:
        return 0.0
    return 1.0 / first_hit_rank

def ndcg_at_k(rel_scores, k):
    """Normalized DCG over the top ``k`` results with a ``log2(rank + 1)`` discount.

    The ideal ordering is obtained by sorting the given ``rel_scores`` in
    descending order. Returns 0.0 when the ideal DCG is 0.
    """
    cutoff = min(k, len(rel_scores))
    positions = range(cutoff)

    def _dcg(gains):
        # Position i (0-based) is discounted by log2(i + 2).
        return sum(gains[i] / math.log2(i + 2) for i in positions)

    dcg = _dcg(rel_scores)
    idcg = _dcg(sorted(rel_scores, reverse=True))
    if idcg > 0.0:
        return dcg / idcg
    return 0.0

**Execution - Build Index and Define Queries**

In [7]:
# Resolve the project root: the current working directory, or its parent if
# only the parent contains a 'data' folder. Exits if neither does.
current_dir = Path.cwd()
PROJECT_ROOT = current_dir

# Look for 'data' folder in current or parent directories
if not (PROJECT_ROOT / 'data').exists():
    if (PROJECT_ROOT.parent / 'data').exists():
        PROJECT_ROOT = PROJECT_ROOT.parent
    else:
        print("ERROR: Cannot find 'data' directory. Ensure you're running from project root.")
        sys.exit(1)

# Input files read by this cell and the output location of the serialized index.
PROCESSED_DATA_FILE = PROJECT_ROOT / 'data' / 'processed' / 'products_clean.parquet'
LABELS_FILE = PROJECT_ROOT / 'data' / 'raw' / 'validation_labels.csv'
INDEX_DIR = PROJECT_ROOT / 'data' / 'index'
INDEX_FILE = INDEX_DIR / 'inverted_index.json'

# Create index directory if it doesn't exist
# NOTE: this runs before the input files are checked, so the directory is
# created even when the cell later exits because data is missing.
INDEX_DIR.mkdir(parents=True, exist_ok=True)

print(f"Project root: {PROJECT_ROOT}")
print(f"Looking for data at: {PROCESSED_DATA_FILE}")

# --- Load Data ---
if not PROCESSED_DATA_FILE.exists():
    print(f"ERROR: Processed data not found at {PROCESSED_DATA_FILE}")
    print("Run Part 1 preprocessing first!")
    sys.exit(1)

print("\n--- LOADING DATA ---")
df_clean = pd.read_parquet(PROCESSED_DATA_FILE)
print(f"Loaded {len(df_clean)} products")
print(f"Columns: {list(df_clean.columns)}")

# Load labels
if not LABELS_FILE.exists():
    print(f"ERROR: Labels file not found at {LABELS_FILE}")
    sys.exit(1)

df_labels = pd.read_csv(LABELS_FILE)
print(f"Loaded {len(df_labels)} relevance labels")

# --- Build Index ---
print("\n--- BUILDING INVERTED INDEX ---")
INDEX = InvertedIndex()
INDEX.build_from_dataframe(df_clean)

# Save index
# The defaultdict of term entries is converted to plain dicts for JSON;
# doc lengths and the document count are stored alongside it.
print(f"\nSaving index to {INDEX_FILE}...")
with open(INDEX_FILE, 'w') as f:
    json.dump({
        'index': {k: dict(v) for k, v in INDEX.index.items()},
        'doc_lengths': INDEX.doc_lengths,
        'num_docs': INDEX.num_docs
    }, f)
print("Index saved successfully!")

# --- Define Queries ---
# Each entry pairs a query_id (matched against the 'query_id' column of the
# labels file during evaluation) with the raw query text.
ALL_QUERIES = [
    {'query_id': 1, 'query': 'women full sleeve sweatshirt cotton'},
    {'query_id': 2, 'query': 'men slim jeans blue'},
    {'query_id': 3, 'query': 'long sleeve denim jacket blue'},
    {'query_id': 4, 'query': 'reeb shoe sport white'},
    {'query_id': 5, 'query': 'cheap men polo shirt black'},
    {'query_id': 6, 'query': 'tight fit short skirt women'},
    {'query_id': 7, 'query': 'low price formal trouser'},
]

Project root: /content
Looking for data at: /content/data/processed/products_clean.parquet
ERROR: Processed data not found at /content/data/processed/products_clean.parquet
Run Part 1 preprocessing first!


SystemExit: 1

**Evaluation 1 & 2 Results**

In [None]:
# Run every query in ALL_QUERIES through ranked_search, compute the ranking
# metrics at k = 10 against df_labels, print a summary table and save it as CSV.
print("\n--- RUNNING EVALUATION ---")
RESULTS = []     # one dict of rounded metrics per query (rows of the results table)
AP_SCORES = []   # unrounded AP@k per query, averaged into MAP below

for q_info in ALL_QUERIES:
    query_id = q_info['query_id']
    query_text = q_info['query']

    print(f"\nQuery {query_id}: '{query_text}'")

    # Search
    ranked_results = ranked_search(query_text, INDEX)
    retrieved_pids = [pid for pid, score in ranked_results]

    print(f"  Retrieved {len(retrieved_pids)} documents")

    # Get relevance labels
    relevance_scores, R_total = get_relevance_labels(query_id, retrieved_pids, df_labels)

    # Calculate metrics
    # NOTE: the cutoff is fixed at 10 here and the column names below
    # ('P@10', ...) hard-code the same value; change both together.
    k = 10
    P_k = precision_at_k(relevance_scores, k)
    R_k = recall_at_k(relevance_scores, R_total, k)
    F1_k = f1_score_at_k(P_k, R_k)
    AP_k = average_precision_at_k(relevance_scores, k)
    # Reciprocal rank is computed over the full ranked list, not just the top k.
    MRR_score = mean_reciprocal_rank(relevance_scores)
    NDCG_k = ndcg_at_k(relevance_scores, k)

    AP_SCORES.append(AP_k)

    RESULTS.append({
        'Query ID': query_id,
        'Query': query_text,
        'R_Total': R_total,
        'Retrieved': len(retrieved_pids),
        'P@10': round(P_k, 3),
        'R@10': round(R_k, 3),
        'F1@10': round(F1_k, 3),
        'AP@10': round(AP_k, 3),
        'MRR': round(MRR_score, 3),
        'NDCG@10': round(NDCG_k, 3),
    })

# --- Display Results ---
print("\n" + "="*80)
print("EVALUATION RESULTS")
print("="*80)
df_results = pd.DataFrame(RESULTS)
print(df_results.to_string(index=False))

# MAP is the mean of the per-query AP@k values collected above.
MAP_score = mean_average_precision(AP_SCORES)
print(f"\nMean Average Precision (MAP): {round(MAP_score, 3)}")

# Save results
results_file = PROJECT_ROOT / 'data' / 'results' / 'evaluation_results.csv'
results_file.parent.mkdir(parents=True, exist_ok=True)
df_results.to_csv(results_file, index=False)
print(f"\nResults saved to {results_file}")

**Ground Truth for New Queries**

In [None]:

# The assignment requires you to manually define the ground truth for Q3-Q7
# and update your validation_labels.csv file.

# ACTION REQUIRED:
# 1. Inspect the PIDs retrieved by the search engine for Queries 3 through 7.
# 2. For those PIDs, manually judge relevance (1 or 0).
# 3. Add these new relevance judgments to your data/raw/validation_labels.csv file.
# 4. Include a detailed table of your manual judgments in your final PDF report.

# Example:
# Query 3: 'long sleeve denim jacket blue'
# - PID_12345: Relevant (1) because it is a denim jacket.
# - PID_67890: Not Relevant (0) because it is a denim dress.
