<a href="https://colab.research.google.com/github/TalissaMoura/AmazonC4DatasetAugmented/blob/main/3_pipeline_for_calculate_the_ranking_metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 3 - Pipeline for measure the ranking metrics for queries

# 3.1 - Imports



In [1]:
import os, json, re, torch, pathlib, time, shutil
from huggingface_hub import login,snapshot_download, upload_file, HfApi,hf_hub_download
from huggingface_hub.hf_api import HfApi
from huggingface_hub.errors import EntryNotFoundError
import transformers
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from google.colab import userdata
import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds
import pyarrow.parquet as pq
import pyarrow.json as pj
from pydantic import BaseModel,ValidationError
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from typing import List, Tuple, Optional, Dict
import torch
from datetime import datetime
import random
import tqdm

# Configura√ß√µes
hf_token = userdata.get('HF_TOKEN')
hf_api = HfApi(token=hf_token)

# 3.2 - Functions


## 3.2.1 - NDCG functions

In [2]:
def dcg_at_k(relevances: List[float], k: int) -> float:
    """
    Calcula o Discounted Cumulative Gain at k.

    Args:
        relevances: Lista de relev√¢ncias (1 para relevante, 0 para n√£o relevante)
        k: Posi√ß√£o at√© onde calcular o DCG

    Returns:
        DCG@k value
    """
    relevances = np.array(relevances[:k])
    if relevances.size == 0:
        return 0.0

    # DCG = sum(rel_i / log2(i + 1)) for i in [0, k)
    discounts = np.log2(np.arange(2, relevances.size + 2))
    return np.sum(relevances / discounts)


def ndcg_at_k(relevances: List[float], k: int) -> float:
    """
    Calcula o Normalized Discounted Cumulative Gain at k.

    Args:
        relevances: Lista de relev√¢ncias ordenada pelo ranking do modelo
        k: Posi√ß√£o at√© onde calcular o NDCG

    Returns:
        NDCG@k value (entre 0 e 1)
    """
    dcg = dcg_at_k(relevances, k)

    # Ideal DCG: ordenar as relev√¢ncias em ordem decrescente
    ideal_relevances = sorted(relevances, reverse=True)
    idcg = dcg_at_k(ideal_relevances, k)

    if idcg == 0:
        return 0.0

    return dcg / idcg


def sample_candidate_items(
    ground_truth_item: Dict,
    all_items: List[Dict],
    n_in_domain: int = 50,
    random_seed: int = 42
) -> List[Dict]:
    """
    Amostra itens candidatos seguindo a metodologia do paper BLAIR.

    Args:
        ground_truth_item: Item verdadeiro (ground truth)
        all_items: Lista com todos os itens dispon√≠veis
        n_in_domain: N√∫mero de itens do mesmo dom√≠nio para amostrar
        random_seed: Seed para reprodutibilidade

    Returns:
        Lista com 100 itens candidatos (1 ground truth + 99 distractors)
    """
    rng = random.Random(random_seed)

    gt_category = ground_truth_item['category']
    gt_item_id = ground_truth_item['item_id']

    # Separar itens por dom√≠nio
    in_domain_items = [
        item for item in all_items
        if item['category'] == gt_category and item['item_id'] != gt_item_id
    ]

    out_domain_items = [
        item for item in all_items
        if item['category'] != gt_category
    ]

    # Amostrar 50 itens do mesmo dom√≠nio (excluindo o ground truth)
    n_in_domain_to_sample = min(n_in_domain, len(in_domain_items))
    sampled_in_domain = rng.sample(in_domain_items, n_in_domain_to_sample)

    # Amostrar os itens restantes de outros dom√≠nios
    n_out_domain_to_sample = 99 - n_in_domain_to_sample  # 100 total - 1 ground truth
    n_out_domain_to_sample = min(n_out_domain_to_sample, len(out_domain_items))
    sampled_out_domain = rng.sample(out_domain_items, n_out_domain_to_sample)

    # Combinar todos os candidatos
    candidates = [ground_truth_item] + sampled_in_domain + sampled_out_domain

    return candidates


def calculate_ndcg_for_query(
    query_embedding: np.ndarray,
    ground_truth_item_id: str,
    candidate_items: List[Dict],
    item_embeddings: Dict[str, np.ndarray],
    k: int = 100
) -> float:
    """
    Calcula NDCG@k para uma √∫nica query.

    Args:
        query_embedding: Embedding da query
        ground_truth_item_id: ID do item ground truth
        candidate_items: Lista de itens candidatos
        item_embeddings: Dicion√°rio mapeando item_id -> embedding
        k: Posi√ß√£o k para calcular NDCG

    Returns:
        NDCG@k value
    """
    # Calcular similaridade (cosine similarity) entre query e cada item
    scores = []
    item_ids = []

    for item in candidate_items:
        item_id = item['item_id']
        if item_id not in item_embeddings:
            continue

        item_emb = item_embeddings[item_id]

        # Cosine similarity
        similarity = np.dot(query_embedding, item_emb) / (
            np.linalg.norm(query_embedding) * np.linalg.norm(item_emb)
        )

        scores.append(similarity)
        item_ids.append(item_id)

    # Ordenar itens por score (decrescente)
    sorted_indices = np.argsort(scores)[::-1]
    sorted_item_ids = [item_ids[i] for i in sorted_indices]

    # Criar lista de relev√¢ncias (1 para ground truth, 0 para outros)
    relevances = [1.0 if item_id == ground_truth_item_id else 0.0
                  for item_id in sorted_item_ids]

    # Calcular NDCG@k
    return ndcg_at_k(relevances, k)

def evaluate_amazon_c4(
    queries: List[Dict],
    item_metadata: List[Dict],
    query_embeddings: Dict[str, np.ndarray],
    item_embeddings: Dict[str, np.ndarray],
    k: int = 100,
    random_seed: int = 42
) -> Dict[str, float]:
    """
    Avalia o modelo completo no dataset Amazon-C4.

    Args:
        queries: Lista de queries com formato:
                 [{'query_id': ..., 'query_text': ..., 'item_id': ..., 'category': ...}, ...]
        item_metadata: Lista de metadados dos itens do sampled_item_metadata_1M.jsonl
        query_embeddings: Dicion√°rio mapeando query_id -> embedding
        item_embeddings: Dicion√°rio mapeando item_id -> embedding
        k: Posi√ß√£o k para calcular NDCG (padr√£o: 100)
        random_seed: Seed para reprodutibilidade

    Returns:
        Dicion√°rio com m√©tricas:
        - 'ndcg@100_overall': NDCG@100 geral
        - 'ndcg@100_per_category': NDCG@100 por categoria
        - 'num_queries': N√∫mero total de queries avaliadas
    """
    ndcg_scores = []
    category_scores = {}

    for query in queries:
        query_id = query['query_id']
        gt_item_id = query['item_id']
        category = query['category']

        # Verificar se temos embeddings para esta query
        if query_id not in query_embeddings:
            continue

        # Encontrar o item ground truth nos metadados
        gt_item = None
        for item in item_metadata:
            if item['item_id'] == gt_item_id:
                gt_item = item
                break

        if gt_item is None:
            continue

        # Amostrar candidatos
        candidates = sample_candidate_items(
            gt_item,
            item_metadata,
            n_in_domain=50,
            random_seed=random_seed
        )

        # Calcular NDCG para esta query
        ndcg = calculate_ndcg_for_query(
            query_embeddings[query_id],
            gt_item_id,
            candidates,
            item_embeddings,
            k=k
        )

        ndcg_scores.append(ndcg)

        # Acumular por categoria
        if category not in category_scores:
            category_scores[category] = []
        category_scores[category].append(ndcg)

    # Calcular m√©dias
    results = {
        'ndcg@100_overall': np.mean(ndcg_scores) if ndcg_scores else 0.0,
        'ndcg@100_per_category': {
            cat: np.mean(scores) for cat, scores in category_scores.items()
        },
        'num_queries': len(ndcg_scores)
    }

    return results

## 3.2.2 - Load the files

In [3]:
def load_amazonc4_table() -> pa.Table:
    """
    Carrega o dataset Amazon-C4 do Hugging Face e retorna como PyArrow Table.

    Returns:
        PyArrow Table com o dataset Amazon-C4
    """
    print("Carregando Amazon-C4 do Hugging Face...")
    dataset = load_dataset('McAuley-Lab/Amazon-C4')['test']

    # Converter para PyArrow Table
    amazonc4_table = dataset.data.table

    print(f"Amazon-C4 carregado: {len(amazonc4_table)} queries")
    print(f"Colunas: {amazonc4_table.column_names}")
    print(f"Schema: {amazonc4_table.schema}")

    return amazonc4_table


def load_item_metadata_table() -> pa.Table:
    """
    Carrega os metadados dos itens (sampled_item_metadata_1M.jsonl)
    do Hugging Face e retorna como PyArrow Table.

    Returns:
        PyArrow Table com os metadados dos itens
    """
    print("Baixando sampled_item_metadata_1M.jsonl do Hugging Face...")
    filepath = hf_hub_download(
        repo_id='McAuley-Lab/Amazon-C4',
        filename='sampled_item_metadata_1M.jsonl',
        repo_type='dataset'
    )

    print(f"Arquivo baixado em: {filepath}")
    print("Carregando metadados dos itens...")

    # Carregar usando PyArrow JSON reader (mais eficiente)
    item_metadata_table = pj.read_json(filepath)

    print(f"Metadados carregados: {len(item_metadata_table)} itens")
    print(f"Colunas: {item_metadata_table.column_names}")
    print(f"Schema: {item_metadata_table.schema}")

    return item_metadata_table


def join_amazonc4_with_metadata_arrow(
    amazonc4_table: pa.Table,
    item_metadata_table: pa.Table,
    query_item_col: str ="item_id",
    metadata_item_col: str = 'item_id',
    metadata_item_category:str="category"
) -> pa.Table:
    """
    Faz o join entre o Amazon-C4 e os metadados dos itens usando PyArrow.
    Apenas adiciona a coluna 'metadata' do item_metadata_table.

    Args:
        amazonc4_table: Table com as queries do Amazon-C4
        item_metadata_table: Table com os metadados dos itens
        query_item_col: Nome da coluna no amazonc4_table que cont√©m o item_id
        metadata_item_col: Nome da coluna no item_metadata_table que cont√©m o item_id,
        metatada_item_category: Nome da categoria do item definido nos metadados.

    Returns:
        PyArrow Table com as queries e a coluna metadata dos itens
    """
    # Selecionar apenas item_id e metadata
    item_metadata_filtered = item_metadata_table.select([metadata_item_col, 'metadata'])

    # Renomear a coluna metadata para item_metadata
    item_metadata_filtered = item_metadata_filtered.rename_columns([metadata_item_col, 'item_metadata'])

    # Salvar categoria do item
    item_category_filtered = item_metadata_table.select([metadata_item_col,'category'])

    # Fazer o left join
    merged_table = amazonc4_table.join(
        item_metadata_filtered,
        keys=query_item_col,
        right_keys=metadata_item_col,
        join_type='left outer'
    )

    final_table = merged_table.join(
        item_category_filtered,
        keys=query_item_col,
        right_keys=metadata_item_col,
        join_type = "left outer")

    return final_table


def prepare_amazonc4_with_metadata_arrow(
    extract_title_desc: bool = False,
    output_path: Optional[str] = None
) -> pa.Table:
    """
    Fun√ß√£o completa que carrega Amazon-C4 e metadados do Hugging Face
    e une apenas a coluna 'metadata' usando PyArrow.

    Args:
        extract_title_desc: Se True, separa metadata em title e description (deprecated)
        output_path: Se fornecido, salva o resultado em Parquet

    Returns:
        PyArrow Table completo com queries e coluna item_metadata
    """
    # Carregar Amazon-C4
    amazonc4_table = load_amazonc4_table()

    print()

    # Carregar metadados dos itens
    metadata_table = load_item_metadata_table()

    # Fazer o join (agora s√≥ pega a coluna metadata)
    print("\nFazendo join entre queries e metadados...")
    result_table = join_amazonc4_with_metadata_arrow(amazonc4_table, metadata_table)

    # Estat√≠sticas
    print(f"\nResultado do join:")
    print(f"  - Total de queries: {len(result_table)}")

    if 'item_metadata' in result_table.column_names:
        n_matched = pc.sum(pc.invert(pc.is_null(result_table['item_metadata']))).as_py()
        print(f"  - Queries com metadata: {n_matched}")
        print(f"  - Queries sem metadata: {len(result_table) - n_matched}")

    if 'category' in result_table.column_names:
        n_category = pc.sum(pc.invert(pc.is_null(result_table['category']))).as_py()
        print(f"  - Queries com categoria: {n_category}")

        print(f"\nCategorias encontradas:")
        # Value counts em PyArrow
        category_counts = pc.value_counts(result_table['category'])

        print(f"\nCategorias encontradas:")
        # Value counts em PyArrow - converter para pandas temporariamente
        category_df = result_table.select(['category']).to_pandas()
        category_counts = category_df['category'].value_counts().head(10)

        for cat, count in category_counts.items():
            print(f"  {cat}: {count}")



    print(f"\nColunas finais: {result_table.column_names}")

    # Salvar se necess√°rio
    if output_path:
        print(f"\nSalvando resultado em {output_path}...")
        pq.write_table(result_table, output_path)
        print("Arquivo salvo com sucesso!")

    return result_table


def filter_by_category_arrow(
    table: pa.Table,
    category: str,
    category_col: str = 'category'
) -> pa.Table:
    """
    Filtra a tabela por uma categoria espec√≠fica.

    Args:
        table: PyArrow Table completo
        category: Nome da categoria para filtrar
        category_col: Nome da coluna que cont√©m a categoria

    Returns:
        PyArrow Table filtrado
    """
    if category_col not in table.column_names:
        print(f"AVISO: Coluna '{category_col}' n√£o encontrada na tabela")
        return table

    # Criar m√°scara de filtro
    mask = pc.equal(table[category_col], category)
    filtered_table = table.filter(mask)

    print(f"Categoria '{category}': {len(filtered_table)} queries")
    return filtered_table


def get_unique_categories_arrow(
    table: pa.Table,
    category_col: str = 'category'
) -> list:
    """
    Retorna lista de categorias √∫nicas na tabela.

    Args:
        table: PyArrow Table
        category_col: Nome da coluna de categoria

    Returns:
        Lista de categorias √∫nicas
    """
    if category_col not in table.column_names:
        return []

    unique_cats = pc.unique(table[category_col])
    # Remover nulls
    unique_cats = pc.drop_null(unique_cats)

    return unique_cats.to_pylist()


def load_blair_model(model_name: str = "hyp1231/blair-roberta-base"):
    """
    Carrega o modelo BLAIR e o tokenizer.

    Args:
        model_name: Nome do modelo no Hugging Face

    Returns:
        Tupla (tokenizer, model)
    """
    print(f"Carregando modelo {model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # Mover para GPU se dispon√≠vel
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    print(f"Modelo carregado em: {device}")
    return tokenizer, model, device


def generate_embeddings_batch(
    texts: List[str],
    tokenizer,
    model,
    device,
    batch_size: int = 32,
    max_length: int = 512
) -> torch.Tensor:
    """
    Gera embeddings para uma lista de textos em batches.

    Args:
        texts: Lista de textos para gerar embeddings
        tokenizer: Tokenizer do BLAIR
        model: Modelo BLAIR
        device: Device (cpu ou cuda)
        batch_size: Tamanho do batch
        max_length: Comprimento m√°ximo dos tokens

    Returns:
        Tensor com embeddings normalizados (N x embedding_dim)
    """
    all_embeddings = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]

        # Tokenizar
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

        # Mover para device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Gerar embeddings
        with torch.no_grad():
            embeddings = model(**inputs, return_dict=True).last_hidden_state[:, 0]
            # Normalizar
            embeddings = embeddings / embeddings.norm(dim=1, keepdim=True)

        all_embeddings.append(embeddings.cpu())

    # Concatenar todos os embeddings
    return torch.cat(all_embeddings, dim=0)


def add_embeddings_to_table(
    table: pa.Table,
    query_col: str = 'query',
    item_col: str = 'item_metadata',
    model_name: str = "hyp1231/blair-roberta-base",
    batch_size: int = 32,
    max_length: int = 512
) -> pa.Table:
    """
    Adiciona colunas de embeddings (query_embedding e item_embedding) √† tabela.

    Args:
        table: PyArrow Table com queries e metadados
        query_col: Nome da coluna com o texto da query
        item_col: Nome da coluna com o metadata do item
        model_name: Nome do modelo BLAIR
        batch_size: Tamanho do batch para processamento
        max_length: Comprimento m√°ximo dos tokens

    Returns:
        PyArrow Table com colunas query_embedding e item_embedding adicionadas
    """
    # Carregar modelo
    tokenizer, model, device = load_blair_model(model_name)

    # Converter colunas para listas
    print(f"\nExtraindo textos das colunas '{query_col}' e '{item_col}'...")
    queries = table[query_col].to_pylist()
    items = table[item_col].to_pylist()

    # Filtrar valores None/null
    print("Tratando valores nulos...")
    queries_clean = [q if q is not None else "" for q in queries]
    items_clean = [i if i is not None else "" for i in items]

    # Gerar embeddings para queries
    print(f"\nGerando embeddings para {len(queries_clean)} queries...")
    query_embeddings = generate_embeddings_batch(
        queries_clean,
        tokenizer,
        model,
        device,
        batch_size=batch_size,
        max_length=max_length
    )
    print(f"Query embeddings shape: {query_embeddings.shape}")

    # Gerar embeddings para items
    print(f"\nGerando embeddings para {len(items_clean)} items...")
    item_embeddings = generate_embeddings_batch(
        items_clean,
        tokenizer,
        model,
        device,
        batch_size=batch_size,
        max_length=max_length
    )
    print(f"Item embeddings shape: {item_embeddings.shape}")

    # Converter embeddings para listas Python (para PyArrow)
    query_embeddings_list = query_embeddings.numpy().tolist()
    item_embeddings_list = item_embeddings.numpy().tolist()

    # Criar arrays PyArrow com tipo list
    embedding_dim = query_embeddings.shape[1]
    embedding_type = pa.list_(pa.float32(), embedding_dim)

    query_emb_array = pa.array(query_embeddings_list, type=embedding_type)
    item_emb_array = pa.array(item_embeddings_list, type=embedding_type)

    # Adicionar colunas √† tabela
    print("\nAdicionando colunas de embeddings √† tabela...")
    table = table.append_column('query_embedding', query_emb_array)
    table = table.append_column('item_embedding', item_emb_array)

    print(f"Colunas finais: {table.column_names}")

    return table


def prepare_amazonc4_with_embeddings(
    model_name: str = "hyp1231/blair-roberta-base",
    batch_size: int = 32,
    output_path: Optional[str] = None
) -> pa.Table:
    """
    Pipeline completo: carrega Amazon-C4, adiciona metadados e gera embeddings.

    Args:
        model_name: Nome do modelo BLAIR
        batch_size: Tamanho do batch para gera√ß√£o de embeddings
        output_path: Se fornecido, salva o resultado em Parquet

    Returns:
        PyArrow Table com queries, metadados e embeddings
    """
    # Carregar e unir com metadados
    table = prepare_amazonc4_with_metadata_arrow()

    # Adicionar embeddings
    print("\n" + "="*60)
    print("GERANDO EMBEDDINGS")
    print("="*60)

    table = add_embeddings_to_table(
        table,
        query_col='query',
        item_col='item_metadata',
        model_name=model_name,
        batch_size=batch_size
    )

    # Salvar se necess√°rio
    if output_path:
        print(f"\nSalvando resultado em {output_path}...")
        pq.write_table(table, output_path)
        print("Arquivo salvo com sucesso!")

    return table


In [38]:
def dcg_at_k(relevances: List[float], k: int) -> float:
    """
    Calcula o Discounted Cumulative Gain at k.

    Args:
        relevances: Lista de relev√¢ncias (1 para relevante, 0 para n√£o relevante)
        k: Posi√ß√£o at√© onde calcular o DCG

    Returns:
        DCG@k value
    """
    relevances = np.array(relevances[:k])
    if relevances.size == 0:
        return 0.0

    # DCG = sum(rel_i / log2(i + 1)) for i in [0, k)
    discounts = np.log2(np.arange(2, relevances.size + 2))
    return np.sum(relevances / discounts)


def ndcg_at_k(relevances: List[float], k: int) -> float:
    """
    Calcula o Normalized Discounted Cumulative Gain at k.

    Args:
        relevances: Lista de relev√¢ncias ordenada pelo ranking do modelo
        k: Posi√ß√£o at√© onde calcular o NDCG

    Returns:
        NDCG@k value (entre 0 e 1)
    """
    dcg = dcg_at_k(relevances, k)

    # Ideal DCG: ordenar as relev√¢ncias em ordem decrescente
    ideal_relevances = sorted(relevances, reverse=True)
    idcg = dcg_at_k(ideal_relevances, k)

    if idcg == 0:
        return 0.0

    return dcg / idcg


def sample_candidates_for_query(
    query_idx: int,
    target_category: str,
    table: pa.Table,
    in_domain_indices: List[int],
    out_domain_indices: List[int],
    n_in_domain: int = 50,
    n_out_domain: int = 50,
    random_seed: Optional[int] = None
) -> List[int]:
    """
    Sampling GT-safe:
    - Ground-truth item SEMPRE inclu√≠do
    - (n_in_domain - 1) outros in-domain
    - n_out_domain out-domain
    """

    if random_seed is not None:
        rng = random.Random(random_seed)
    else:
        rng = random

    # üîë Ground-truth item
    gt_idx = query_idx

    # In-domain excluindo o GT
    other_in_domain = [idx for idx in in_domain_indices if idx != gt_idx]

    n_in_to_sample = min(n_in_domain - 1, len(other_in_domain))
    sampled_in_domain = rng.sample(other_in_domain, n_in_to_sample)

    # Out-domain
    n_out_to_sample = min(n_out_domain, len(out_domain_indices))
    sampled_out_domain = rng.sample(out_domain_indices, n_out_to_sample)

    # Pool final
    candidate_indices = [gt_idx] + sampled_in_domain + sampled_out_domain

    return candidate_indices




def calculate_ndcg_for_category_sample(
    table_with_embeddings: pa.Table,
    category: str,
    n_samples: int = 200,
    k: int = 100,
    random_seed: int = 42
) -> Dict:
    """
    Calcula NDCG@k seguindo exatamente o setup do paper BLAIR:
    - In-domain items: relev√¢ncia = 1
    - Out-domain items: relev√¢ncia = 0
    """

    rng = random.Random(random_seed)

    print(f"\n{'='*60}")
    print(f"Avaliando categoria (BLAIR setup): {category}")
    print(f"{'='*60}")

    # √çndices in-domain (mesma categoria)
    category_mask = pc.equal(table_with_embeddings['category'], category)
    in_domain_indices = pc.indices_nonzero(category_mask).to_pylist()

    # √çndices out-domain
    other_mask = pc.not_equal(table_with_embeddings['category'], category)
    out_domain_indices = pc.indices_nonzero(other_mask).to_pylist()

    print(f"In-domain queries: {len(in_domain_indices)}")
    print(f"Out-domain queries: {len(out_domain_indices)}")

    # Amostrar queries da categoria
    n_to_sample = min(n_samples, len(in_domain_indices))
    sampled_query_indices = rng.sample(in_domain_indices, n_to_sample)

    # Extrair embeddings
    query_embeddings = np.array(table_with_embeddings['query_embedding'].to_pylist())
    item_embeddings = np.array(table_with_embeddings['item_embedding'].to_pylist())

    ndcg_scores = []

    print("\nCalculando NDCG@k...")
    for query_idx in sampled_query_indices:

        # Amostrar candidatos (50 in-domain + 50 out-domain)
        candidate_indices = sample_candidates_for_query(
            query_idx=query_idx,
            target_category=category,
            table=table_with_embeddings,
            in_domain_indices=in_domain_indices,
            out_domain_indices=out_domain_indices,
            n_in_domain=50,
            n_out_domain=50,
            random_seed=random_seed + query_idx
        )

        if len(candidate_indices) == 0:
            continue

        # Embedding da query
        query_emb = query_embeddings[query_idx]

        # Embeddings dos candidatos
        candidate_item_embs = item_embeddings[candidate_indices]

        # Similaridade cosseno (embeddings j√° normalizados)
        similarities = np.dot(candidate_item_embs, query_emb)

        # Ranking
        ranked_positions = np.argsort(similarities)[::-1]

        # Relev√¢ncia BLAIR:
        # 1 se item √© in-domain, 0 se out-domain
        relevances = [
            1.0 if candidate_indices[pos] in in_domain_indices else 0.0
            for pos in ranked_positions
        ]

        ndcg = ndcg_at_k(relevances, k)
        ndcg_scores.append(ndcg)

    avg_ndcg = float(np.mean(ndcg_scores)) if ndcg_scores else 0.0

    print(f"\n{'='*60}")
    print(f"Resultados ({category})")
    print(f"NDCG@{k}: {avg_ndcg:.4f}")
    print(f"Queries avaliadas: {len(ndcg_scores)}")
    print(f"{'='*60}")

    return {
        "category": category,
        f"ndcg@{k}": avg_ndcg,
        "n_queries": len(ndcg_scores),
        "ndcg_scores": ndcg_scores,
        "min_ndcg": float(np.min(ndcg_scores)) if ndcg_scores else 0.0,
        "max_ndcg": float(np.max(ndcg_scores)) if ndcg_scores else 0.0,
        "std_ndcg": float(np.std(ndcg_scores)) if ndcg_scores else 0.0,
    }


def evaluate_all_categories(
    table_with_embeddings: pa.Table,
    n_samples_per_category: int = 200,
    k: int = 100,
    random_seed: int = 42,
    categories_to_eval: Optional[List[str]] = None
) -> Dict:
    """
    Avalia NDCG@k para todas (ou algumas) categorias.

    Args:
        table_with_embeddings: Table com embeddings
        n_samples_per_category: N√∫mero de queries para amostrar por categoria
        k: K para NDCG@k
        random_seed: Seed para reprodutibilidade
        categories_to_eval: Lista de categorias para avaliar (None = todas)

    Returns:
        Dicion√°rio com resultados por categoria e m√©dia geral
    """
    # Obter categorias √∫nicas
    if categories_to_eval is None:
        categories = get_unique_categories_arrow(table_with_embeddings)
    else:
        categories = categories_to_eval

    print(f"\n{'='*60}")
    print(f"AVALIA√á√ÉO DE {len(categories)} CATEGORIAS")
    print(f"{'='*60}")
    print(f"Categorias: {categories}")

    results = {}
    all_ndcg_scores = []

    for category in categories:
        cat_results = calculate_ndcg_for_category_sample(
            table_with_embeddings=table_with_embeddings,
            category=category,
            n_samples=n_samples_per_category,
            k=k,
            random_seed=random_seed
        )

        results[category] = cat_results
        all_ndcg_scores.append(cat_results[f'ndcg@{k}'])

    # Calcular m√©dia geral
    overall_ndcg = np.mean(all_ndcg_scores) if all_ndcg_scores else 0.0

    print(f"\n{'='*60}")
    print(f"RESULTADOS FINAIS")
    print(f"{'='*60}")
    print(f"NDCG@{k} m√©dio geral: {overall_ndcg:.4f}")
    print(f"\nPor categoria:")
    for cat, res in results.items():
        print(f"  {cat}: {res[f'ndcg@{k}']:.4f}")
    print(f"{'='*60}")

    results['overall'] = {
        f'ndcg@{k}': overall_ndcg,
        'n_categories': len(categories)
    }

    return results


In [5]:
# Testando a logica
result_table = prepare_amazonc4_with_metadata_arrow(
    output_path='amazonc4_with_metadata.parquet'
)

# Ver as primeiras linhas
print("\nPrimeiras 3 queries completas:")
cols_to_show = ['user_id', 'parent_asin', 'category', 'item_metadata']
available_cols = [c for c in cols_to_show if c in result_table.column_names]
print(result_table.select(available_cols).slice(0, 3).to_pandas())


# # Salvar categoria espec√≠fica
pq.write_table(result_table, 'amazonc4_with_embeddings.parquet')

Carregando Amazon-C4 do Hugging Face...


README.md: 0.00B [00:00, ?B/s]

test.csv:   0%|          | 0.00/12.5M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/21223 [00:00<?, ? examples/s]

Amazon-C4 carregado: 21223 queries
Colunas: ['qid', 'query', 'item_id', 'user_id', 'ori_rating', 'ori_review']
Schema: qid: int64
query: string
item_id: string
user_id: string
ori_rating: int64
ori_review: string
-- schema metadata --
huggingface: '{"info": {"features": {"qid": {"dtype": "int64", "_type": "' + 263

Baixando sampled_item_metadata_1M.jsonl do Hugging Face...


sampled_item_metadata_1M.jsonl:   0%|          | 0.00/643M [00:00<?, ?B/s]

Arquivo baixado em: /root/.cache/huggingface/hub/datasets--McAuley-Lab--Amazon-C4/snapshots/39322697749a88d179f88d322a2fe4765b655c98/sampled_item_metadata_1M.jsonl
Carregando metadados dos itens...
Metadados carregados: 1058417 itens
Colunas: ['item_id', 'category', 'metadata']
Schema: item_id: string
category: string
metadata: string

Fazendo join entre queries e metadados...

Resultado do join:
  - Total de queries: 21223
  - Queries com metadata: 21223
  - Queries sem metadata: 0
  - Queries com categoria: 21223

Categorias encontradas:

Categorias encontradas:
  Home: 3416
  Clothing: 2867
  Electronics: 1546
  Tools: 1445
  Household: 1328
  Care: 1314
  Kindle: 1112
  Pet: 927
  Garden: 841
  Automotive: 833

Colunas finais: ['qid', 'query', 'item_id', 'user_id', 'ori_rating', 'ori_review', 'item_metadata', 'category']

Salvando resultado em amazonc4_with_metadata.parquet...
Arquivo salvo com sucesso!

Primeiras 3 queries completas:
                        user_id category  \
0  

In [6]:
# Adicionar apenas embeddings
table_with_embeddings = add_embeddings_to_table(
    result_table,
    query_col='query',
    item_col='item_metadata',
    batch_size=32
)

# Salvar
pq.write_table(table_with_embeddings, 'amazonc4_games_with_embeddings.parquet')

Carregando modelo hyp1231/blair-roberta-base...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Modelo carregado em: cuda

Extraindo textos das colunas 'query' e 'item_metadata'...
Tratando valores nulos...

Gerando embeddings para 21223 queries...
Query embeddings shape: torch.Size([21223, 768])

Gerando embeddings para 21223 items...
Item embeddings shape: torch.Size([21223, 768])

Adicionando colunas de embeddings √† tabela...
Colunas finais: ['qid', 'query', 'item_id', 'user_id', 'ori_rating', 'ori_review', 'item_metadata', 'category', 'query_embedding', 'item_embedding']


## 3.2.3 - Calculate NDCG for queries in amazon c4

In [7]:
# Avaliar uma categoria espec√≠fica

results = calculate_ndcg_for_category_sample(
    table_with_embeddings=table_with_embeddings,
    category='Electronics',
    n_samples=500,
    k=100,
    random_seed=42
)


Avaliando categoria (BLAIR setup): Electronics
In-domain queries: 1546
Out-domain queries: 19677

Calculando NDCG@k...

Resultados (Electronics)
NDCG@100: 0.8687
Queries avaliadas: 500


## 3.2.4 - Calculate hard negative queries for category

In [8]:
def load_hard_negative_queries(
    category: str,
    base_path: str = "/content/data/datasets/procesed/hard_negative_queries/Qwen/Qwen2.5-3B-Instruct"
) -> pa.Table:
    """
    Carrega o dataframe de hard negative queries para uma categoria.

    Args:
        category: Nome da categoria
        base_path: Caminho base onde est√£o os arquivos

    Returns:
        PyArrow Table com hard negative queries
    """
    filepath = f"{base_path}/{category}_part_0.parquet"
    print(f"Carregando hard negative queries de: {filepath}")

    table = pq.read_table(filepath)
    print(f"Hard negative queries carregadas: {len(table)} queries")
    print(f"Colunas: {table.column_names}")

    return table

def filter_valid_hard_negatives(
    hard_neg_table: pa.Table,
    amazonc4_with_metadata: pa.Table,
    tokenizer,
    model,
    device,
    n_candidate_items: int = 49,
    batch_size: int = 32,
    random_seed: int = 42
) -> pa.Table:
    """
    Filtra hard negative queries v√°lidas, mantendo apenas aquelas onde
    o modelo rankeia um item diferente do original em 1¬∫ lugar.

    Args:
        hard_neg_table: Table com hard negative queries (deve ter coluna 'item_id')
        amazonc4_with_metadata: Table do Amazon-C4 com metadados
        tokenizer: Tokenizer do BLAIR
        model: Modelo BLAIR
        device: Device (cpu/cuda)
        n_candidate_items: N√∫mero de itens candidatos para comparar (al√©m do original)
        batch_size: Tamanho do batch para embeddings
        random_seed: Seed para reprodutibilidade

    Returns:
        PyArrow Table filtrado com colunas:
        - ground_true_item_id (original item_id)
        - hard_negative_query
        - category
        - ori_semi_positive_review
    """
    rng = random.Random(random_seed)

    print(f"\n{'='*60}")
    print(f"Filtrando hard negatives v√°lidas")
    print(f"{'='*60}")

    # Converter para pandas para facilitar opera√ß√µes
    hard_neg_df = hard_neg_table.to_pandas()
    amazonc4_df = amazonc4_with_metadata.to_pandas()

    print(f"Total de hard negative queries: {len(hard_neg_df)}")
    print(f"Total de itens no Amazon-C4: {len(amazonc4_df)}")

    # Fazer join com Amazon-C4 para obter metadados e categoria dos itens originais
    print("\nFazendo join com Amazon-C4...")
    hard_neg_with_metadata = hard_neg_df.merge(
        amazonc4_df[['item_id', 'category', 'item_metadata']],
        left_on='item_id',
        right_on='item_id',
        how='left'
    )

    # Remover registros sem match
    hard_neg_with_metadata = hard_neg_with_metadata.dropna(subset=['category', 'item_metadata'])
    print(f"Queries ap√≥s join: {len(hard_neg_with_metadata)}")

    # Listas para armazenar queries v√°lidas
    valid_queries = []
    valid_item_ids = []
    valid_categories = []
    valid_ori_reviews = []

    print("\nProcessando hard negative queries...")

    for idx, row in hard_neg_with_metadata.iterrows():
        original_item_id = row['item_id']
        hard_query = row['hard_negative_query']
        ori_semi_positive = row['ori_semi_positive_review']
        item_category = row['category']

        # Verificar se temos a query
        if pd.isna(hard_query) or hard_query == "":
            continue

        # Criar candidate pool: filtrar itens da mesma categoria
        category_items = amazonc4_df[amazonc4_df['category'] == item_category]

        # Amostrar n_candidate_items itens da categoria (diferentes do original)
        available_items = category_items[category_items['item_id'] != original_item_id]

        if len(available_items) < n_candidate_items:
            # N√£o temos itens suficientes, pular
            continue

        sampled_items = available_items.sample(n=n_candidate_items, random_state=random_seed + idx)

        # Adicionar o item original aos candidatos
        original_item_row = category_items[category_items['item_id'] == original_item_id]

        if len(original_item_row) == 0:
            # Item original n√£o encontrado, pular
            continue

        # Combinar: item original + itens amostrados = candidate pool (50 itens)
        candidate_pool = pd.concat([original_item_row, sampled_items], ignore_index=True)

        # Obter metadados dos candidatos
        candidate_metadatas = candidate_pool['item_metadata'].tolist()
        candidate_item_ids = candidate_pool['item_id'].tolist()

        # Gerar embeddings para a hard query
        query_inputs = tokenizer(
            [hard_query],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )
        query_inputs = {k: v.to(device) for k, v in query_inputs.items()}

        with torch.no_grad():
            query_emb = model(**query_inputs, return_dict=True).last_hidden_state[:, 0]
            query_emb = query_emb / query_emb.norm(dim=1, keepdim=True)
            query_emb = query_emb.cpu().numpy()[0]

        # Gerar embeddings para os itens candidatos
        item_embeddings = generate_embeddings_batch(
            candidate_metadatas,
            tokenizer,
            model,
            device,
            batch_size=batch_size
        )
        item_embeddings = item_embeddings.numpy()

        # Calcular cosine similarity
        similarities = np.dot(item_embeddings, query_emb)

        # Encontrar o item com maior similaridade (rank 1)
        top_item_idx = np.argmax(similarities)
        top_item_id = candidate_item_ids[top_item_idx]

        # Verificar se o top item √© DIFERENTE do original (hard negative v√°lida)
        if top_item_id != original_item_id:
            # Esta √© uma hard negative v√°lida!
            valid_queries.append(hard_query)
            valid_item_ids.append(original_item_id)
            valid_categories.append(item_category)
            valid_ori_reviews.append(ori_semi_positive)

    print(f"\n{'='*60}")
    print(f"Resultados da filtragem:")
    print(f"  Total processadas: {len(hard_neg_with_metadata)}")
    print(f"  Hard negatives v√°lidas: {len(valid_queries)}")
    if len(hard_neg_with_metadata) > 0:
        print(f"  Taxa de valida√ß√£o: {len(valid_queries)/len(hard_neg_with_metadata)*100:.2f}%")
    print(f"{'='*60}")

    # Criar nova tabela com queries v√°lidas
    valid_table = pa.table({
        'ground_true_item_id': valid_item_ids,
        'hard_negative_query': valid_queries,
        'category': valid_categories,
        'ori_semi_positive_review': valid_ori_reviews
    })

    return valid_table


def process_hard_negatives_for_category(
    category: str,
    amazonc4_with_metadata: pa.Table,
    model_name: str = "hyp1231/blair-roberta-base",
    hard_neg_base_path: str = "/content/data/datasets/procesed/hard_negative_queries/Qwen/Qwen2.5-3B-Instruct",
    output_path: Optional[str] = None,
    n_candidate_items: int = 49,
    batch_size: int = 32,
    random_seed: int = 42
) -> pa.Table:
    """
    Pipeline completo para processar hard negatives de uma categoria.

    Args:
        category: Nome da categoria
        amazonc4_with_metadata: Table do Amazon-C4 com metadados
        model_name: Nome do modelo BLAIR
        hard_neg_base_path: Caminho base dos arquivos de hard negatives
        output_path: Caminho para salvar resultado (opcional)
        n_candidate_items: N√∫mero de itens candidatos
        batch_size: Tamanho do batch
        random_seed: Seed para reprodutibilidade

    Returns:
        PyArrow Table com hard negatives v√°lidas
    """
    # Carregar hard negative queries
    hard_neg_table = load_hard_negative_queries(category, hard_neg_base_path)

    # Carregar modelo
    tokenizer, model, device = load_blair_model(model_name)

    # Filtrar queries v√°lidas
    valid_table = filter_valid_hard_negatives(
        hard_neg_table=hard_neg_table,
        amazonc4_with_metadata=amazonc4_with_metadata,
        tokenizer=tokenizer,
        model=model,
        device=device,
        n_candidate_items=n_candidate_items,
        batch_size=batch_size,
        random_seed=random_seed
    )

    # Salvar se necess√°rio
    if output_path:
        print(f"\nSalvando resultado em {output_path}...")
        pq.write_table(valid_table, output_path)
        print("Arquivo salvo com sucesso!")

    return valid_table

In [9]:
dfs_hard_neg = []
categories = ["Automotive","Electronics","Beauty_and_Personal_Care","Books","Clothing_Shoes_and_Jewelry"]
for cat in categories:
    print(f"Processando categoria {cat}...")
    hf_hub_download(
        filename=f"datasets/procesed/hard_negative_queries/Qwen/Qwen2.5-3B-Instruct/{cat}_part_0.parquet",
        repo_id="Talissa/AmazonC4Augmented",
        repo_type="dataset",
        local_dir="/content/data",
        token=hf_token)
    valid_hard_neg = process_hard_negatives_for_category(
        category=f'{cat}',
        amazonc4_with_metadata=result_table,
        output_path=f'valid_hard_negatives_{cat}.parquet'
    )

    dfs_hard_neg.append(valid_hard_neg)

Processando categoria Automotive...


datasets/procesed/hard_negative_queries/(‚Ä¶):   0%|          | 0.00/493k [00:00<?, ?B/s]

Carregando hard negative queries de: /content/data/datasets/procesed/hard_negative_queries/Qwen/Qwen2.5-3B-Instruct/Automotive_part_0.parquet
Hard negative queries carregadas: 1075 queries
Colunas: ['item_id', 'ori_semi_positive_review', 'ori_positive_review', 'positive_points', 'negative_points', 'hard_negative_query']
Carregando modelo hyp1231/blair-roberta-base...
Modelo carregado em: cuda

Filtrando hard negatives v√°lidas
Total de hard negative queries: 1075
Total de itens no Amazon-C4: 21223

Fazendo join com Amazon-C4...
Queries ap√≥s join: 1102

Processando hard negative queries...

Resultados da filtragem:
  Total processadas: 1102
  Hard negatives v√°lidas: 384
  Taxa de valida√ß√£o: 34.85%

Salvando resultado em valid_hard_negatives_Automotive.parquet...
Arquivo salvo com sucesso!
Processando categoria Electronics...


datasets/procesed/hard_negative_queries/(‚Ä¶):   0%|          | 0.00/264k [00:00<?, ?B/s]

Carregando hard negative queries de: /content/data/datasets/procesed/hard_negative_queries/Qwen/Qwen2.5-3B-Instruct/Electronics_part_0.parquet
Hard negative queries carregadas: 585 queries
Colunas: ['item_id', 'ori_semi_positive_review', 'ori_positive_review', 'positive_points', 'negative_points', 'hard_negative_query']
Carregando modelo hyp1231/blair-roberta-base...
Modelo carregado em: cuda

Filtrando hard negatives v√°lidas
Total de hard negative queries: 585
Total de itens no Amazon-C4: 21223

Fazendo join com Amazon-C4...
Queries ap√≥s join: 615

Processando hard negative queries...

Resultados da filtragem:
  Total processadas: 615
  Hard negatives v√°lidas: 206
  Taxa de valida√ß√£o: 33.50%

Salvando resultado em valid_hard_negatives_Electronics.parquet...
Arquivo salvo com sucesso!
Processando categoria Beauty_and_Personal_Care...


datasets/procesed/hard_negative_queries/(‚Ä¶):   0%|          | 0.00/205k [00:00<?, ?B/s]

Carregando hard negative queries de: /content/data/datasets/procesed/hard_negative_queries/Qwen/Qwen2.5-3B-Instruct/Beauty_and_Personal_Care_part_0.parquet
Hard negative queries carregadas: 498 queries
Colunas: ['item_id', 'ori_semi_positive_review', 'ori_positive_review', 'positive_points', 'negative_points', 'hard_negative_query']
Carregando modelo hyp1231/blair-roberta-base...
Modelo carregado em: cuda

Filtrando hard negatives v√°lidas
Total de hard negative queries: 498
Total de itens no Amazon-C4: 21223

Fazendo join com Amazon-C4...
Queries ap√≥s join: 534

Processando hard negative queries...

Resultados da filtragem:
  Total processadas: 534
  Hard negatives v√°lidas: 177
  Taxa de valida√ß√£o: 33.15%

Salvando resultado em valid_hard_negatives_Beauty_and_Personal_Care.parquet...
Arquivo salvo com sucesso!
Processando categoria Books...


datasets/procesed/hard_negative_queries/(‚Ä¶):   0%|          | 0.00/583k [00:00<?, ?B/s]

Carregando hard negative queries de: /content/data/datasets/procesed/hard_negative_queries/Qwen/Qwen2.5-3B-Instruct/Books_part_0.parquet
Hard negative queries carregadas: 1037 queries
Colunas: ['item_id', 'ori_semi_positive_review', 'ori_positive_review', 'positive_points', 'negative_points', 'hard_negative_query']
Carregando modelo hyp1231/blair-roberta-base...
Modelo carregado em: cuda

Filtrando hard negatives v√°lidas
Total de hard negative queries: 1037
Total de itens no Amazon-C4: 21223

Fazendo join com Amazon-C4...
Queries ap√≥s join: 1049

Processando hard negative queries...

Resultados da filtragem:
  Total processadas: 1049
  Hard negatives v√°lidas: 453
  Taxa de valida√ß√£o: 43.18%

Salvando resultado em valid_hard_negatives_Books.parquet...
Arquivo salvo com sucesso!
Processando categoria Clothing_Shoes_and_Jewelry...


datasets/procesed/hard_negative_queries/(‚Ä¶):   0%|          | 0.00/135k [00:00<?, ?B/s]

Carregando hard negative queries de: /content/data/datasets/procesed/hard_negative_queries/Qwen/Qwen2.5-3B-Instruct/Clothing_Shoes_and_Jewelry_part_0.parquet
Hard negative queries carregadas: 378 queries
Colunas: ['item_id', 'ori_semi_positive_review', 'ori_positive_review', 'positive_points', 'negative_points', 'hard_negative_query']
Carregando modelo hyp1231/blair-roberta-base...
Modelo carregado em: cuda

Filtrando hard negatives v√°lidas
Total de hard negative queries: 378
Total de itens no Amazon-C4: 21223

Fazendo join com Amazon-C4...
Queries ap√≥s join: 381

Processando hard negative queries...

Resultados da filtragem:
  Total processadas: 381
  Hard negatives v√°lidas: 176
  Taxa de valida√ß√£o: 46.19%

Salvando resultado em valid_hard_negatives_Clothing_Shoes_and_Jewelry.parquet...
Arquivo salvo com sucesso!


## 3.2.5 - NDCG with hard negative

In [10]:
def merge_hard_negatives_with_amazonc4(
    valid_hard_negatives: pa.Table,
    amazonc4_with_metadata: pa.Table
) -> pa.Table:
    """
    Appends hard negative queries as new rows to Amazon-C4,
    keeping schema alignment and staying Arrow-native.
    """

    # Build a lookup: item_id -> item_metadata
    asin_to_metadata = dict(
        zip(
            amazonc4_with_metadata["item_id"].to_pylist(),
            amazonc4_with_metadata["item_metadata"].to_pylist()
        )
    )

    # Extract hard negative columns
    hn_queries = valid_hard_negatives["hard_negative_query"].to_pylist()
    hn_item_ids = valid_hard_negatives["ground_true_item_id"].to_pylist()
    hn_categories = valid_hard_negatives["category"].to_pylist()

    # Build new rows aligned with Amazon-C4 schema
    new_rows = {
        "query": [],
        "item_id": [],
        "category": [],
        "item_metadata": []
    }

    for q, item_id, cat in zip(hn_queries, hn_item_ids, hn_categories):
        metadata = asin_to_metadata.get(item_id)
        if metadata is None:
            continue  # skip if item not found

        new_rows["query"].append(q)
        new_rows["item_id"].append(item_id)
        new_rows["category"].append(cat)
        new_rows["item_metadata"].append(metadata)

    # Create Arrow table for hard negatives
    hard_neg_table = pa.table(new_rows)

    # Concatenate (no schema mismatch)
    combined_table = pa.concat_tables(
        [amazonc4_with_metadata, hard_neg_table],
        promote=True
    )

    return combined_table


In [11]:
def prepare_combined_dataset_for_evaluation(
    valid_hard_negatives: pa.Table,
    amazonc4_with_metadata: pa.Table,
    model_name: str = "hyp1231/blair-roberta-base",
    batch_size: int = 32,
    output_path: Optional[str] = None
) -> pa.Table:
    combined_table = merge_hard_negatives_with_amazonc4(
        valid_hard_negatives,
        amazonc4_with_metadata
    )

    combined_with_emb = add_embeddings_to_table(
        combined_table,
        query_col="query",
        item_col="item_metadata",
        model_name=model_name,
        batch_size=batch_size
    )

    if output_path:
        pq.write_table(combined_with_emb, output_path)

    return combined_with_emb

In [12]:
def evaluate_hard_negatives_impact(
    combined_table_with_embeddings: pa.Table,
    category: str,
    n_samples: int = 1000,
    k: int = 100,
    random_seed: int = 42
) -> Dict:
    print("\nAvalia√ß√£o ORIGINAL")
    original_results = calculate_ndcg_for_category_sample(
        combined_table_with_embeddings,
        category=category,
        n_samples=n_samples,
        k=k,
        random_seed=random_seed
    )

    print("\nAvalia√ß√£o com HARD NEGATIVES")
    hard_neg_results = calculate_ndcg_for_category_sample(
        combined_table_with_embeddings,
        category=category,
        n_samples=n_samples,
        k=k,
        random_seed=random_seed + 1
    )

    drop = (
        original_results[f"ndcg@{k}"] -
        hard_neg_results[f"ndcg@{k}"]
    )

    return {
        f"original_ndcg@{k}": original_results[f"ndcg@{k}"],
        f"hard_negative_ndcg@{k}": hard_neg_results[f"ndcg@{k}"],
        f"ndcg_drop_percentage": 100 * drop / max(original_results[f"ndcg@{k}"], 1e-6)
    }


In [44]:
# 3. Mesclar hard negatives com Amazon-C4
k_eval = 10
combined_table = merge_hard_negatives_with_amazonc4(
  valid_hard_negatives=valid_hard_neg,
  amazonc4_with_metadata=result_table
)

# 4. Adicionar embeddings ao dataset combinado
combined_with_emb = prepare_combined_dataset_for_evaluation(
  valid_hard_negatives=valid_hard_neg,
  amazonc4_with_metadata=result_table,
  output_path='combined_dataset.parquet'
)
result_table = f"/content/amazonc4_with_metadata.parquet"
result_table = pq.read_table(result_table)
# 5. Avaliar impacto das hard negatives
cat_list = ['Automotive','Electronics','Beauty_and_Personal_Care','Books','Clothing_Shoes_and_Jewelry']
cat_list_treated = ['Automotive','Electronics','Care','Books','Clothing']
for cat,cat_treated in zip(cat_list,cat_list_treated):

  valid_hard_neg_path = f"/content/valid_hard_negatives_{cat}.parquet"
  valid_hard_neg = pq.read_table(valid_hard_neg_path)


  combined_table = merge_hard_negatives_with_amazonc4(
  valid_hard_negatives=valid_hard_neg,
  amazonc4_with_metadata=result_table
  )


  combined_with_emb = prepare_combined_dataset_for_evaluation(
    valid_hard_negatives=valid_hard_neg,
    amazonc4_with_metadata=result_table,
    output_path='combined_dataset.parquet'
  )

  results = evaluate_hard_negatives_impact(
    combined_table_with_embeddings=combined_with_emb,
    category=f"{cat_treated}",
    n_samples=500,
    k = k_eval
  )

print(f"Calculo do NDCG@{k_eval}")
print(f"NDCG Original: {results[f'original_ndcg@{k_eval}']:.4f}")
print(f"NDCG Hard Neg: {results[f'hard_negative_ndcg@{k_eval}']:.4f}")
print(f"Queda: {results['ndcg_drop_percentage']:.2f}%")

  combined_table = merge_hard_negatives_with_amazonc4(
  combined_table = merge_hard_negatives_with_amazonc4(


Carregando modelo hyp1231/blair-roberta-base...
Modelo carregado em: cuda

Extraindo textos das colunas 'query' e 'item_metadata'...
Tratando valores nulos...

Gerando embeddings para 21399 queries...
Query embeddings shape: torch.Size([21399, 768])

Gerando embeddings para 21399 items...
Item embeddings shape: torch.Size([21399, 768])

Adicionando colunas de embeddings √† tabela...
Colunas finais: ['qid', 'query', 'item_id', 'user_id', 'ori_rating', 'ori_review', 'item_metadata', 'category', 'query_embedding', 'item_embedding']


  combined_table = merge_hard_negatives_with_amazonc4(


Carregando modelo hyp1231/blair-roberta-base...
Modelo carregado em: cuda

Extraindo textos das colunas 'query' e 'item_metadata'...
Tratando valores nulos...

Gerando embeddings para 21607 queries...
Query embeddings shape: torch.Size([21607, 768])

Gerando embeddings para 21607 items...
Item embeddings shape: torch.Size([21607, 768])

Adicionando colunas de embeddings √† tabela...
Colunas finais: ['qid', 'query', 'item_id', 'user_id', 'ori_rating', 'ori_review', 'item_metadata', 'category', 'query_embedding', 'item_embedding']

Avalia√ß√£o ORIGINAL

Avaliando categoria (BLAIR setup): Automotive
In-domain queries: 1217
Out-domain queries: 20390

Calculando NDCG@k...

Resultados (Automotive)
NDCG@10: 0.7855
Queries avaliadas: 500

Avalia√ß√£o com HARD NEGATIVES

Avaliando categoria (BLAIR setup): Automotive
In-domain queries: 1217
Out-domain queries: 20390

Calculando NDCG@k...

Resultados (Automotive)
NDCG@10: 0.7822
Queries avaliadas: 500
Carregando modelo hyp1231/blair-roberta-base.

## 3.2.6 - Precision@1 para hard e not hard queries

In [14]:
def precision_at_k(relevances: List[float], k: int) -> float:
    """
    Calcula Precision@k.

    Args:
        relevances: Lista bin√°ria de relev√¢ncias ordenada pelo ranking do modelo
        k: cutoff

    Returns:
        Precision@k
    """
    if not relevances:
        return 0.0

    k = min(k, len(relevances))
    return float(np.sum(relevances[:k]) / k)


In [15]:
def calculate_precision_at_k_for_category_sample(
    table_with_embeddings: pa.Table,
    category: str,
    n_samples: int = 200,
    k: int = 1,
    random_seed: int = 42
) -> Dict:
    """
    Calcula Precision@k seguindo o setup do paper BLAIR.

    - In-domain items: relev√¢ncia = 1
    - Out-domain items: relev√¢ncia = 0
    - Sampling: 50 in-domain + 50 out-domain
    """

    rng = random.Random(random_seed)

    print(f"\n{'='*60}")
    print(f"Avaliando Precision@{k} - Categoria: {category}")
    print(f"{'='*60}")

    # In-domain
    category_mask = pc.equal(table_with_embeddings["category"], category)
    in_domain_indices = pc.indices_nonzero(category_mask).to_pylist()

    # Out-domain
    other_mask = pc.not_equal(table_with_embeddings["category"], category)
    out_domain_indices = pc.indices_nonzero(other_mask).to_pylist()

    print(f"In-domain: {len(in_domain_indices)}")
    print(f"Out-domain: {len(out_domain_indices)}")

    # Amostrar queries
    n_to_sample = min(n_samples, len(in_domain_indices))
    sampled_query_indices = rng.sample(in_domain_indices, n_to_sample)

    # Embeddings
    item_embeddings = np.asarray(
        table_with_embeddings["item_embedding"].to_pylist(),
        dtype=np.float32
    )
    query_embeddings = np.asarray(
        table_with_embeddings["query_embedding"].to_pylist(),
        dtype=np.float32
    )

    precision_scores = []

    print("\nCalculando Precision@k...")
    for query_idx in sampled_query_indices:

        candidate_indices = sample_candidates_for_query(
            query_idx=query_idx,
            target_category=category,
            table=table_with_embeddings,
            in_domain_indices=in_domain_indices,
            out_domain_indices=out_domain_indices,
            n_in_domain=50,
            n_out_domain=50,
            random_seed=random_seed + query_idx
        )

        if not candidate_indices:
            continue

        query_emb = query_embeddings[query_idx]


        paired = []
        for idx in candidate_indices:


            if isinstance(idx, (list, tuple)):
                if not idx:
                    continue
                idx = idx[0]

            if not isinstance(idx, int):
                continue

            if 0 <= idx < len(item_embeddings):
                paired.append((idx, item_embeddings[idx]))

        if not paired:
            continue

        candidate_indices_clean, candidate_embs = zip(*paired)
        candidate_embs = np.vstack(candidate_embs)

        similarities = candidate_embs @ query_emb

        ranked_positions = np.argsort(similarities)[::-1]

        # Relev√¢ncia BLAIR (id√™ntica √† do NDCG)
        relevances = [
            1.0 if candidate_indices_clean[pos] in in_domain_indices else 0.0
            for pos in ranked_positions
        ]

        precision_k = precision_at_k(relevances, k)
        precision_scores.append(precision_k)

    avg_precision = float(np.mean(precision_scores)) if precision_scores else 0.0

    print(f"\n{'='*60}")
    print(f"Resultados Precision@{k} ({category})")
    print(f"Precision@{k}: {avg_precision:.4f}")
    print(f"Queries avaliadas: {len(precision_scores)}")
    print(f"{'='*60}")

    return {
        "category": category,
        f"precision@{k}": avg_precision,
        "n_queries": len(precision_scores),
        "precision_scores": precision_scores,
        "min_precision": float(np.min(precision_scores)) if precision_scores else 0.0,
        "max_precision": float(np.max(precision_scores)) if precision_scores else 0.0,
        "std_precision": float(np.std(precision_scores)) if precision_scores else 0.0,
    }


In [47]:
cat_list_treated = ['Automotive','Electronics','Care','Books','Clothing']
result_table_with_embd = add_embeddings_to_table(
    result_table
)
for cat in cat_list_treated:
  for k in [1, 5,10]:
      res = calculate_precision_at_k_for_category_sample(
          table_with_embeddings=result_table_with_embd,
          category=f"{cat}",
          n_samples=500,
          k=k
      )
      print(f"-- Precision {k}: {res[f'precision@{k}']:.4f}")
      print(f"-- Min: {res['min_precision']:.4f}")
      print(f"-- Max: {res['max_precision']:.4f}")
      print(f"-- Std: {res['std_precision']:.4f}")
      print(f"-- Queries: {res['n_queries']}")
      print(f"-- {'='*60}")


Carregando modelo hyp1231/blair-roberta-base...
Modelo carregado em: cuda

Extraindo textos das colunas 'query' e 'item_metadata'...
Tratando valores nulos...

Gerando embeddings para 21223 queries...
Query embeddings shape: torch.Size([21223, 768])

Gerando embeddings para 21223 items...
Item embeddings shape: torch.Size([21223, 768])

Adicionando colunas de embeddings √† tabela...
Colunas finais: ['qid', 'query', 'item_id', 'user_id', 'ori_rating', 'ori_review', 'item_metadata', 'category', 'query_embedding', 'item_embedding']

Avaliando Precision@1 - Categoria: Automotive
In-domain: 833
Out-domain: 20390

Calculando Precision@k...

Resultados Precision@1 (Automotive)
Precision@1: 0.9380
Queries avaliadas: 500
-- Precision 1: 0.9380
-- Min: 0.0000
-- Max: 1.0000
-- Std: 0.2412
-- Queries: 500

Avaliando Precision@5 - Categoria: Automotive
In-domain: 833
Out-domain: 20390

Calculando Precision@k...

Resultados Precision@5 (Automotive)
Precision@5: 0.8544
Queries avaliadas: 500
-- Prec

## 3.2.7 - Mean Reciprocal Ranking

In [19]:
def reciprocal_rank(relevances):
    """
    Retorna o Reciprocal Rank de uma query.
    relevances: lista bin√°ria ordenada por ranking (1 = relevante)
    """
    for i, rel in enumerate(relevances, start=1):
        if rel > 0:
            return 1.0 / i
    return 0.0

In [25]:
def calculate_mrr_for_category_sample(
    table_with_embeddings: pa.Table,
    category: str,
    n_samples: int = 200,
    random_seed: int = 42
) -> Dict:
    """
    Calcula Mean Reciprocal Rank (MRR) considerando
    o item ground-truth da query (item_id).

    - 1 item relevante por query
    - Candidate pool: 50 in-domain + 50 out-domain
    """

    rng = random.Random(random_seed)

    print(f"\n{'='*60}")
    print(f"Avaliando MRR (GT-aware) - Categoria: {category}")
    print(f"{'='*60}")

    # Queries da categoria
    category_mask = pc.equal(table_with_embeddings["category"], category)
    in_domain_indices = pc.indices_nonzero(category_mask).to_pylist()

    # Itens fora da categoria
    other_mask = pc.not_equal(table_with_embeddings["category"], category)
    out_domain_indices = pc.indices_nonzero(other_mask).to_pylist()

    n_to_sample = min(n_samples, len(in_domain_indices))
    sampled_query_indices = rng.sample(in_domain_indices, n_to_sample)

    # Embeddings
    item_embeddings = np.asarray(
        table_with_embeddings["item_embedding"].to_pylist(),
        dtype=np.float32
    )
    query_embeddings = np.asarray(
        table_with_embeddings["query_embedding"].to_pylist(),
        dtype=np.float32
    )

    # item_ids
    item_ids = table_with_embeddings["item_id"].to_pylist()

    rr_scores = []

    print("\nCalculando MRR (ground truth item)...")
    for query_idx in sampled_query_indices:

        # üîë Ground truth do item
        gt_item_id = item_ids[query_idx]

        candidate_indices = sample_candidates_for_query(
            query_idx=query_idx,
            target_category=category,
            table=table_with_embeddings,
            in_domain_indices=in_domain_indices,
            out_domain_indices=out_domain_indices,
            n_in_domain=50,
            n_out_domain=50,
            random_seed=random_seed + query_idx
        )

        if not candidate_indices:
            continue

        query_emb = query_embeddings[query_idx]

        # Limpeza defensiva
        paired = []
        for idx in candidate_indices:
            if isinstance(idx, (list, tuple)):
                if not idx:
                    continue
                idx = idx[0]

            if isinstance(idx, int) and 0 <= idx < len(item_embeddings):
                paired.append((idx, item_embeddings[idx]))

        if not paired:
            continue

        candidate_indices_clean, candidate_embs = zip(*paired)
        candidate_embs = np.vstack(candidate_embs)

        similarities = candidate_embs @ query_emb
        ranked_positions = np.argsort(similarities)[::-1]

        # üîë Relev√¢ncia baseada no item_id
        relevances = [
            1.0 if item_ids[candidate_indices_clean[pos]] == gt_item_id else 0.0
            for pos in ranked_positions
        ]

        rr = reciprocal_rank(relevances)
        rr_scores.append(rr)

    mrr = float(np.mean(rr_scores)) if rr_scores else 0.0

    print(f"\n{'='*60}")
    print(f"Resultados MRR GT ({category})")
    print(f"MRR: {mrr:.4f}")
    print(f"Queries avaliadas: {len(rr_scores)}")
    print(f"{'='*60}")

    return {
        "category": category,
        "mrr": mrr,
        "n_queries": len(rr_scores),
        "rr_scores": rr_scores,
        "min_rr": float(np.min(rr_scores)) if rr_scores else 0.0,
        "max_rr": float(np.max(rr_scores)) if rr_scores else 0.0,
        "std_rr": float(np.std(rr_scores)) if rr_scores else 0.0,
    }


In [49]:
cat_list_treated = ['Automotive','Electronics','Care','Books','Clothing']
for cat in cat_list_treated:
    res = calculate_mrr_for_category_sample(
        table_with_embeddings=result_table_with_embd,
        category=f"{cat}",
        n_samples=500,
        random_seed=589
    )
    print(f"-- MRR: {res[f'mrr']:.4f}")
    print(f"-- Min: {res['min_rr']:.4f}")
    print(f"-- Max: {res['max_rr']:.4f}")
    print(f"-- Std: {res['std_rr']:.4f}")
    print(f"-- Queries: {res['n_queries']}")
    print(f"-- {'='*60}")



Avaliando MRR (GT-aware) - Categoria: Automotive

Calculando MRR (ground truth item)...

Resultados MRR GT (Automotive)
MRR: 0.8054
Queries avaliadas: 500
-- MRR: 0.8054
-- Min: 0.0123
-- Max: 1.0000
-- Std: 0.3263
-- Queries: 500

Avaliando MRR (GT-aware) - Categoria: Electronics

Calculando MRR (ground truth item)...

Resultados MRR GT (Electronics)
MRR: 0.8339
Queries avaliadas: 500
-- MRR: 0.8339
-- Min: 0.0119
-- Max: 1.0000
-- Std: 0.3069
-- Queries: 500

Avaliando MRR (GT-aware) - Categoria: Care

Calculando MRR (ground truth item)...

Resultados MRR GT (Care)
MRR: 0.7836
Queries avaliadas: 500
-- MRR: 0.7836
-- Min: 0.0130
-- Max: 1.0000
-- Std: 0.3255
-- Queries: 500

Avaliando MRR (GT-aware) - Categoria: Books

Calculando MRR (ground truth item)...

Resultados MRR GT (Books)
MRR: 0.7032
Queries avaliadas: 500
-- MRR: 0.7032
-- Min: 0.0217
-- Max: 1.0000
-- Std: 0.3844
-- Queries: 500

Avaliando MRR (GT-aware) - Categoria: Clothing

Calculando MRR (ground truth item)...

Resu

## 3.4 - Save results

In [None]:
save_results = True
cat_list = ['Automotive','Electronics','Beauty_and_Personal_Care','Books','Clothing_Shoes_and_Jewelry']
model_name = "Qwen/Qwen2.5-3B-Instruct"
if save_results:
  base = pathlib.Path(f"/content/")
  for idx_category, model_name in enumerate(cat_list):
    files = list(base.glob(f"*valid_hard_negatives_{idx_category}*.parquet"))
    for i,f in enumerate(files):
      upload_file(
          path_or_fileobj=f"{f}",
          path_in_repo=f"/datasets/procesed/valid_hard_queries/{model_name}/{idx_category}_part_{i}.parquet",
          repo_id="Talissa/AmazonC4Augmented",
          repo_type="dataset",
      )


In [None]:
# # Force garbage collection to free CPU memory
import gc
gc.collect()

# Clear GPU cache to free up CUDA memory
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("‚úÖ GPU cache cleared")
else:
    print("CUDA not available")

print("‚úÖ Cleanup complete - model/tokenizer removed from CPU/GPU")

‚úÖ GPU cache cleared
‚úÖ Cleanup complete - model/tokenizer removed from CPU/GPU


In [None]:
# model_gen_name = "Qwen/Qwen2.5-3B-Instruct"
# task_type = "test"
# hf_hub_download(
#     filename=f"datasets/metrics/quality/{model_gen_name}/{task_type}/quality_metrics_per_category.json",
#     repo_id="Talissa/AmazonC4Augmented",
#     local_dir="/content/data",
#     repo_type="dataset",
#     token=hf_token
# )

'/content/data/datasets/metrics/quality/Qwen/Qwen2.5-3B-Instruct/test/quality_metrics_per_category.json'