In [1]:
import pandas as pd
import numpy as np
import re
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from rank_bm25 import BM25Okapi
from typing import List, Dict, Tuple
from collections import defaultdict
import os
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Initialize embeddings
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2"
)
embeddings

HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [4]:
df=pd.read_csv("G:/Dylog_Internship_Assessments/data/dylog_unspsc_data.csv")
df.head(5)

Unnamed: 0,Segment Code,Segment Name,Family Code,Family Name,Class Code,Class Name,Commodity Code,Commodity Name
0,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101501,Cats
1,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101502,Dogs
2,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101504,Mink
3,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101505,Rats
4,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101506,Horses


In [5]:
df['corpus']=(df["Segment Name"]+" "+df["Family Name"]+" "+df["Class Name"]+" "+df["Commodity Name"]).str.lower()
df["corpus"].head(4)
df["corpus"][0]

'live plant and animal material and accessories and supplies live animals livestock cats'

In [6]:
chunks=[]
for i,row in df.iterrows():
    doc=Document(
        page_content=row["corpus"],
        metadata={
            "Segment Code": row["Segment Code"],
            "Segment Name": row["Segment Name"],
            "Family Code": row["Family Code"],
            "Family Name": row["Family Name"],
            "Class Code": row["Class Code"],
            "Class Name": row["Class Name"],
            "Commodity Code": row["Commodity Code"],
            "Commodity Name": row["Commodity Name"],
        }
    )
    chunks.append(doc)

print(chunks[0])

page_content='live plant and animal material and accessories and supplies live animals livestock cats' metadata={'Segment Code': 10000000, 'Segment Name': 'Live Plant and Animal Material and Accessories and Supplies', 'Family Code': 10100000, 'Family Name': 'Live animals', 'Class Code': 10101500, 'Class Name': 'Livestock', 'Commodity Code': 10101501, 'Commodity Name': 'Cats'}


In [10]:
# vectorstore=FAISS.from_documents(
#     documents=chunks,
#     embedding=embeddings,
# )


In [7]:


# Load vectorstore
loaded_vectorstore = FAISS.load_local(
    "faiss.index",
    embeddings,
    allow_dangerous_deserialization=True
)

print(f" Vectorstore loaded with {len(loaded_vectorstore.docstore._dict)} documents")

✅ Vectorstore loaded with 71502 documents


In [20]:
#  Feature Extraction Module

class ProductFeatureExtractor:
    """Extracts structured attributes from product descriptions"""
    
    def __init__(self):
        self.material_patterns = {
            'stainless_steel': r'\b(ss|stainless steel|304|316)\b',
            'carbon_steel': r'\b(cs|carbon steel|mild steel)\b',
            'pvc': r'\b(pvc|polyvinyl chloride)\b',
            'copper': r'\b(copper|cu)\b',
            'aluminum': r'\b(aluminum|al)\b',
            'galvanized': r'\b(galvanized|hdg|hot dip)\b',
            'brass': r'\b(brass)\b',
        }
        
        self.dimension_pattern = r'(\d+(?:\.\d+)?)\s*(?:x|by|×)\s*(\d+(?:\.\d+)?)'
        self.volume_pattern = r'(\d+(?:\.\d+)?)\s*(?:gal|gallon|liter|l|ml|cc)'
        self.pressure_pattern = r'(\d+(?:\.\d+)?)\s*(?:psi|bar|pa|kpa)'
        self.power_pattern = r'(\d+(?:\.\d+)?)\s*(?:kw|hp|kbtu|btu|w|watt)'
        self.voltage_pattern = r'(\d+(?:\.\d+)?)\s*(?:v|volt|ac|dc)'
    
    def extract_materials(self, text: str) -> dict:
        text_lower = text.lower()
        materials = {}
        for material, pattern in self.material_patterns.items():
            materials[f'has_{material}'] = bool(re.search(pattern, text_lower))
        return materials
    
    def extract_specs(self, text: str) -> dict:
        specs = {}
        
        vol_match = re.search(self.volume_pattern, text, re.IGNORECASE)
        specs['has_volume'] = bool(vol_match)
        specs['volume_value'] = float(vol_match.group(1)) if vol_match else 0.0
        
        pres_match = re.search(self.pressure_pattern, text, re.IGNORECASE)
        specs['has_pressure'] = bool(pres_match)
        specs['pressure_value'] = float(pres_match.group(1)) if pres_match else 0.0
        
        pow_match = re.search(self.power_pattern, text, re.IGNORECASE)
        specs['has_power'] = bool(pow_match)
        specs['power_value'] = float(pow_match.group(1)) if pow_match else 0.0
        
        volt_match = re.search(self.voltage_pattern, text, re.IGNORECASE)
        specs['has_voltage'] = bool(volt_match)
        specs['voltage_value'] = float(volt_match.group(1)) if volt_match else 0.0
        
        return specs
    
    def extract_keywords(self, text: str) -> dict:
        text_lower = text.lower()
        keywords = {
            'is_heating': any(w in text_lower for w in ['heater', 'heating', 'furnace', 'boiler']),
            'is_cooling': any(w in text_lower for w in ['cooler', 'cooling', 'chiller', 'fan']),
            'is_pipe': any(w in text_lower for w in ['pipe', 'piping', 'tubing', 'coupling', 'fitting']),
            'is_valve': any(w in text_lower for w in ['valve', 'vlv', 'gate', 'check']),
            'is_electrical': any(w in text_lower for w in ['electrical', 'wire', 'cable', 'panel']),
            'is_plumbing': any(w in text_lower for w in ['plumbing', 'toilet', 'sink', 'faucet']),
        }
        return keywords
    
    def extract_all(self, text: str) -> np.ndarray:
        materials = self.extract_materials(text)
        specs = self.extract_specs(text)
        keywords = self.extract_keywords(text)
        
        features = {}
        features.update(materials)
        features.update(specs)
        features.update(keywords)
        
        feature_vector = np.array([
            features['has_stainless_steel'],
            features['has_carbon_steel'],
            features['has_pvc'],
            features['has_copper'],
            features['has_aluminum'],
            features['has_galvanized'],
            features['has_brass'],
            features['has_volume'],
            features['volume_value'],
            features['has_pressure'],
            features['pressure_value'],
            features['has_power'],
            features['power_value'],
            features['has_voltage'],
            features['voltage_value'],
            features['is_heating'],
            features['is_cooling'],
            features['is_pipe'],
            features['is_valve'],
            features['is_electrical'],
            features['is_plumbing'],
        ], dtype=np.float32)
        
        return feature_vector

print(" ProductFeatureExtractor loaded")

 ProductFeatureExtractor loaded


In [12]:
# Load training queries
queries_df = pd.read_csv("G:/Dylog_Internship_Assessments/data/dylog_search_queries_unspsc.csv")
print(f" Loaded queries: {len(queries_df)} items")
print(f"   Columns: {queries_df.columns.tolist()}")

# Load test products
products_df = pd.read_csv("G:/Dylog_Internship_Assessments/data/dylog_sample_product_unspsc.csv")
print(f" Loaded products: {len(products_df)} items")
print(f"   Columns: {products_df.columns.tolist()}")

# Preview first row
print("\n📋 Sample Product Description:")
print(products_df.iloc[0]['Original Description'])


print("Checking and fixing df columns...")
print(f"Current columns in df: {df.columns.tolist()}\n")

# Create corpus column if missing
# if 'corpus' not in df.columns:
#     print("Creating 'corpus' column...")
#     df['corpus'] = (
#         df["Segment Name"] + " " + 
#         df["Family Name"] + " " + 
#         df["Class Name"] + " " + 
#         df["Commodity Name"]
#     ).str.lower()
#     print(" Corpus column created successfully!")
#     print(f"\nSample corpus entries:")
#     for i in range(3):
#         print(f"  {i+1}. {df['corpus'].iloc[i][:80]}...")
# else:
#     print(" Corpus column already exists")

# Verify corpus matches vectorstore documents
print(f"\n Statistics:")
print(f"  - Total corpus entries: {len(df['corpus'])}")
print(f"  - Average corpus length: {df['corpus'].str.len().mean():.0f} chars")
print(f"  - Corpus with content: {df['corpus'].notna().sum()}")
df.head()

 Loaded queries: 50 items
   Columns: ['Search Query', 'UNSPSC Commodity Name']
 Loaded products: 100 items
   Columns: ['Original Description', 'UNSPSC Commodity Name']

📋 Sample Product Description:
BW RG2PV75H6X 75 GAL LP GAS POWER VENTED WATER HEATER STANDARD W/SIDE CONNECTIONS FOR SPACE HTG 76KBTU 6YR 70""H X 26""DIA
Checking and fixing df columns...
Current columns in df: ['Segment Code', 'Segment Name', 'Family Code', 'Family Name', 'Class Code', 'Class Name', 'Commodity Code', 'Commodity Name', 'corpus']


 Statistics:
  - Total corpus entries: 71502
  - Average corpus length: 130 chars
  - Corpus with content: 71502


Unnamed: 0,Segment Code,Segment Name,Family Code,Family Name,Class Code,Class Name,Commodity Code,Commodity Name,corpus
0,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101501,Cats,live plant and animal material and accessories...
1,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101502,Dogs,live plant and animal material and accessories...
2,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101504,Mink,live plant and animal material and accessories...
3,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101505,Rats,live plant and animal material and accessories...
4,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101506,Horses,live plant and animal material and accessories...


In [None]:
class DocumentInformedHierarchicalRouter:
    """Uses retrieved documents to constrain hierarchy search"""
    
    def __init__(self, catalog_df):
        self.catalog = catalog_df
        self.build_hierarchy()
    
    def build_hierarchy(self):
        """Build routing structures"""
        # Segment → Families
        self.segment_to_families = {}
        for _, row in self.catalog.iterrows():
            seg = row['Segment Name']
            fam = row['Family Name']
            if seg not in self.segment_to_families:
                self.segment_to_families[seg] = set()
            self.segment_to_families[seg].add(fam)
        
        # Family → Classes
        self.family_to_classes = {}
        for _, row in self.catalog.iterrows():
            fam = row['Family Name']
            cls = row['Class Name']
            if fam not in self.family_to_classes:
                self.family_to_classes[fam] = set()
            self.family_to_classes[fam].add(cls)
        
        # Class → Commodities
        self.class_to_commodities = {}
        for _, row in self.catalog.iterrows():
            cls = row['Class Name']
            comm = row['Commodity Name']
            if cls not in self.class_to_commodities:
                self.class_to_commodities[cls] = set()
            self.class_to_commodities[cls].add(comm)
        
        # Commodity → Full metadata (for reverse lookup)
        self.commodity_to_hierarchy = {}
        for _, row in self.catalog.iterrows():
            comm = row['Commodity Name']
            self.commodity_to_hierarchy[comm] = {
                'segment': row['Segment Name'],
                'family': row['Family Name'],
                'class': row['Class Name']
            }
        
        print(f" Hierarchy built:")
        print(f"   - {len(self.segment_to_families)} segments")
        print(f"   - {len(self.family_to_classes)} families")
        print(f"   - {len(self.class_to_commodities)} classes")
    
    def route_from_documents(self, retrieved_candidates, top_k=5):
        """
        Use top-k retrieved documents to inform hierarchy routing.
        
        Args:
            retrieved_candidates: List of dicts from HybridRetriever with metadata
            top_k: Number of top documents to analyze
        
        Returns:
            Dict with constrained hierarchy paths
        """
        # Extract hierarchy info from top-k documents
        top_docs = retrieved_candidates[:top_k]
        
        segment_counts = {}
        family_counts = {}
        class_counts = {}
        
        for doc in top_docs:
            metadata = doc.get('metadata', {})
            
            segment = metadata.get('Segment Name')
            family = metadata.get('Family Name')
            cls = metadata.get('Class Name')
            
            if segment:
                segment_counts[segment] = segment_counts.get(segment, 0) + 1
            if family:
                family_counts[family] = family_counts.get(family, 0) + 1
            if cls:
                class_counts[cls] = class_counts.get(cls, 0) + 1
        
        # Get top segments (by frequency in retrieved docs)
        top_segments = sorted(segment_counts.items(), key=lambda x: x[1], reverse=True)
        
        # Get families constrained by top segments
        valid_families = set()
        for segment, _ in top_segments:
            valid_families.update(self.segment_to_families.get(segment, set()))
        
        top_families = [(f, family_counts.get(f, 0)) for f in valid_families]
        top_families = sorted(top_families, key=lambda x: x[1], reverse=True)
        
        # Get classes constrained by top families
        valid_classes = set()
        for family, _ in top_families:
            valid_classes.update(self.family_to_classes.get(family, set()))
        
        top_classes = [(c, class_counts.get(c, 0)) for c in valid_classes]
        top_classes = sorted(top_classes, key=lambda x: x[1], reverse=True)
        
        # Get commodities constrained by top classes
        valid_commodities = set()
        for cls, _ in top_classes:
            valid_commodities.update(self.class_to_commodities.get(cls, set()))
        
        return {
            'top_segments': dict(top_segments[:3]),
            'top_families': dict(top_families[:5]),
            'top_classes': dict(top_classes[:5]),
            'valid_commodities': valid_commodities,
            'num_constrained': len(valid_commodities)
        }
    
    def filter_candidates_by_hierarchy(self, candidates, hierarchy_info):
        """
        Boost candidates that fall within hierarchy constraints.
        
        Args:
            candidates: Full candidate list from evaluation
            hierarchy_info: Output from route_from_documents()
        
        Returns:
            Candidates with hierarchy_score added
        """
        valid_commodities = hierarchy_info['valid_commodities']
        
        for candidate in candidates:
            commodity_name = candidate.get('commodity_name', '')
            in_hierarchy = commodity_name in valid_commodities
            
            # Boost score if in hierarchy
            if in_hierarchy:
                candidate['hierarchy_boost'] = 1.3  # 30% boost
            else:
                candidate['hierarchy_boost'] = 1.0
            
            candidate['in_hierarchy'] = in_hierarchy
        
        return candidates

In [19]:
class TextCleaner:
    """Cleans and normalizes product descriptions and queries"""
    
    def __init__(self, mappings=None, remove_arbitrary_alphanum=False):
        self.mappings = mappings or {
            "pvc": "polyvinyl chloride",
            "ss": "stainless steel", 
            "cs": "carbon steel",
            "vlv": "valve",
            "adpt": "adapter",
            "bush": "bushing",
            "cplg": "coupling",
            "ftg": "fitting",
            "tee": "t-junction"
        }
        self.remove_arbitrary_alphanum = remove_arbitrary_alphanum
        self.arbitrary_pattern = re.compile(r'\b[a-z0-9]{8,}\b')
    
    def clean(self, text: str) -> str:
        if not isinstance(text, str):
            return ""
        
        text = text.lower().strip()
        
        # Apply mappings
        for k, v in self.mappings.items():
            text = text.replace(k, v)
        
        # Remove arbitrary patterns if enabled
        if self.remove_arbitrary_alphanum:
            text = self.arbitrary_pattern.sub('', text)
        
        # Remove special characters except spaces and numbers
        text = re.sub(r'[^a-z0-9\s]', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text


def convert_docs_to_dicts(results: List[Tuple], include_scores=True) -> List[Dict]:
    """Converts FAISS search results to standardized dicts"""
    converted = []
    for doc, score in results:
        entry = {
            "doc_text": doc.page_content,
            "metadata": doc.metadata,
        }
        if include_scores:
            entry["sem_score"] = float(score)
        converted.append(entry)
    return converted


class HybridRetriever:
    """Combines semantic (FAISS) and lexical (BM25) retrieval"""
    
    def __init__(self, df: pd.DataFrame, vectorstore, embeddings, cleaner=None):
        self.df = df
        self.vectorstore = vectorstore
        self.embeddings = embeddings
        self.cleaner = cleaner or TextCleaner()
        
        # Build BM25 index on corpus
        self.tokenized_docs = [
            self.cleaner.clean(doc).split() 
            for doc in df['corpus'].tolist()
        ]
        
        self.bm25 = BM25Okapi(self.tokenized_docs)
        print(" BM25 index ready")
    
    def retrieve(self, query: str, top_k=50) -> List[Dict]:
        """Retrieve top-k candidates using hybrid search"""
        query_clean = self.cleaner.clean(query)
        
        # 1. Semantic retrieval (FAISS)
        sem_results = self.vectorstore.similarity_search_with_score(query_clean, k=top_k)
        sem_candidates = convert_docs_to_dicts(sem_results, include_scores=True)
        
        # 2. Lexical retrieval (BM25)
        tokenized_query = query_clean.split()
        bm25_scores = self.bm25.get_scores(tokenized_query)
        top_indices = np.argsort(-bm25_scores)[:top_k]
        
        # Merge results
        corpus_to_candidate = {c['doc_text']: c for c in sem_candidates}
        
        for idx in top_indices:
            corpus_text = self.df.iloc[idx]['corpus']
            commodity_name = self.df.iloc[idx]['Commodity Name']
            commodity_code = self.df.iloc[idx]['Commodity Code']
            lex_score = float(bm25_scores[idx])
            
            if corpus_text in corpus_to_candidate:
                corpus_to_candidate[corpus_text]['lex_score'] = lex_score
            else:
                corpus_to_candidate[corpus_text] = {
                    'doc_text': corpus_text,
                    'commodity_name': commodity_name,
                    'commodity_code': commodity_code,
                    'metadata': self.df.iloc[idx].to_dict(),
                    'sem_score': 0.0,
                    'lex_score': lex_score
                }
        
        # Ensure all candidates have both scores
        candidates = list(corpus_to_candidate.values())
        for c in candidates:
            c.setdefault('sem_score', 0.0)
            c.setdefault('lex_score', 0.0)
            if 'commodity_name' not in c:
                c['commodity_name'] = c['metadata'].get('Commodity Name', '')
            if 'commodity_code' not in c:
                c['commodity_code'] = c['metadata'].get('Commodity Code', '')
        
        return candidates


class CrossEncoderReranker:
    """Reranks candidates using a cross-encoder model"""
    
    def __init__(self, model_name="cross-encoder/ms-marco-MiniLM-L-6-v2", device=None):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()
        print(f"✅ Cross-encoder loaded on {self.device}")
    
    def rerank(self, query: str, candidates: List[Dict], top_n=10) -> List[Dict]:
        if not candidates:
            return []
        
        candidate_texts = [c['doc_text'] for c in candidates]
        
        encodings = self.tokenizer(
            [query] * len(candidate_texts),
            candidate_texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(self.device)
        
        with torch.no_grad():
            scores = self.model(**encodings).logits.squeeze(-1).cpu().numpy()
        
        for i, cand in enumerate(candidates):
            cand['cross_score'] = float(scores[i])
        
        reranked = sorted(candidates, key=lambda x: x['cross_score'], reverse=True)
        return reranked[:top_n]


class ScoreMerger:
    """Combines hybrid retrieval scores with cross-encoder scores"""
    
    def __init__(self, alpha=0.4, beta=0.6):
        self.alpha = alpha
        self.beta = beta
    
    def normalize_scores(self, scores: np.ndarray) -> np.ndarray:
        scores = np.array(scores, dtype=np.float32)
        if scores.size == 0:
            return scores
        
        min_s, max_s = scores.min(), scores.max()
        if max_s - min_s == 0:
            return np.ones_like(scores)
        
        return (scores - min_s) / (max_s - min_s)
    
    def merge(self, candidates: List[Dict]) -> List[Dict]:
        if not candidates:
            return []
        
        sem_scores = np.array([c.get('sem_score', 0) for c in candidates])
        lex_scores = np.array([c.get('lex_score', 0) for c in candidates])
        cross_scores = np.array([c.get('cross_score', 0) for c in candidates])
        
        hybrid_scores = self.normalize_scores(sem_scores + lex_scores)
        cross_scores_norm = self.normalize_scores(cross_scores)
        
        final_scores = self.alpha * hybrid_scores + self.beta * cross_scores_norm
        
        for i, c in enumerate(candidates):
            c['final_score'] = float(final_scores[i])
        
        return sorted(candidates, key=lambda x: x['final_score'], reverse=True)




# ============================================================================
# FIXED PRODUCT EVALUATOR - Works with Commodity NAME instead of CODE
# ============================================================================

class ProductEvaluator:
    """Evaluates retrieval using commodity NAMES (not codes) with proper hierarchy metrics"""
    
    def __init__(self, products_df: pd.DataFrame, catalog_df: pd.DataFrame, 
                 retriever, reranker, merger, cleaner=None):
        self.products_df = products_df
        self.catalog_df = catalog_df
        self.retriever = retriever
        self.reranker = reranker
        self.merger = merger
        self.cleaner = cleaner or TextCleaner()
        self.hierarchical_router = DocumentInformedHierarchicalRouter(catalog_df)
        
        
        # Build TWO lookups:
        # 1. Commodity Code → Full hierarchy
        self.code_to_hierarchy = {}
        for _, row in catalog_df.iterrows():
            code = row['Commodity Code']
            self.code_to_hierarchy[code] = {
                'Segment Name': row['Segment Name'],
                'Family Name': row['Family Name'],
                'Class Name': row['Class Name'],
                'Commodity Name': row['Commodity Name']
            }
        
        # 2. Commodity Name → Commodity Code (for reverse lookup)
        self.name_to_code = {}
        for _, row in catalog_df.iterrows():
            name = row['Commodity Name'].strip().lower()
            code = row['Commodity Code']
            self.name_to_code[name] = code
        
        print(f"✅ Built lookups:")
        print(f"   - Code → Hierarchy: {len(self.code_to_hierarchy)} entries")
        print(f"   - Name → Code: {len(self.name_to_code)} entries")
        print(f"\n📋 Products DF columns: {products_df.columns.tolist()}")
    
    def evaluate(self, retrieve_k=50, rerank_k=20, top_k_list=[1, 5, 10]) -> Dict:
        """Evaluate retrieval with proper name-based matching"""
        
        metrics = defaultdict(int)
        precision_at_k = {k: [] for k in top_k_list}
        recall_at_k = {k: [] for k in top_k_list}
        
        segment_hits = {k: 0 for k in top_k_list}
        family_hits = {k: 0 for k in top_k_list}
        class_hits = {k: 0 for k in top_k_list}
        
        total = 0
        skipped = 0
        
        print("\n" + "="*70)
        print("STARTING EVALUATION")
        print("="*70)
        
        for idx, row in self.products_df.iterrows():
            # Get description
            description = row.get('Original Description', '')
            
            # Get true commodity NAME (this is what we have in products_df)
            true_commodity_name = row.get('UNSPSC Commodity Name', '')
            
            # Skip if missing
            if not description or not true_commodity_name:
                skipped += 1
                if idx < 3:
                    print(f"⚠️ Skipping row {idx}: no description or commodity name")
                continue
            
            # Convert name → code using catalog
            true_commodity_name_clean = true_commodity_name.strip().lower()
            true_commodity_code = self.name_to_code.get(true_commodity_name_clean)
            
            if not true_commodity_code:
                skipped += 1
                if idx < 3:
                    print(f"⚠️ Skipping row {idx}: commodity '{true_commodity_name}' not found in catalog")
                continue
            
            total += 1
            
            # Get true hierarchy from catalog
            true_hierarchy = self.code_to_hierarchy.get(true_commodity_code, {})
            true_segment = true_hierarchy.get('Segment Name', '')
            true_family = true_hierarchy.get('Family Name', '')
            true_class = true_hierarchy.get('Class Name', '')
            
            # Debug first 3 products
            if idx < 3:
                print(f"\n📋 Product #{idx}:")
                print(f"  Description: {description[:60]}...")
                print(f"  True Commodity: {true_commodity_name}")
                print(f"  True Code: {true_commodity_code}")
                print(f"  True Segment: {true_segment}")
                print(f"  True Family: {true_family}")
                print(f"  True Class: {true_class}")
            
            try:
                # Retrieve candidates
                candidates = self.retriever.retrieve(description, top_k=retrieve_k)
                
                if not candidates:
                    continue
                
                # Rerank
                candidates = self.reranker.rerank(description, candidates, top_n=rerank_k)
                final_candidates = self.merger.merge(candidates)
                
                # Extract predicted codes and names
                pred_codes = []
                pred_names = []
                
                for c in final_candidates:
                    code = c.get('commodity_code')
                    name = c.get('commodity_name', '')
                    
                    try:
                        pred_codes.append(int(code))
                        pred_names.append(name.strip().lower())
                    except (ValueError, TypeError):
                        pred_codes.append(None)
                        pred_names.append('')
                
                # Debug first 3 products
                if idx < 3:
                    print(f"  Top 3 Predictions:")
                    for i in range(min(3, len(pred_names))):
                        print(f"    {i+1}. {pred_names[i][:50]} (code: {pred_codes[i]})")
                
                # Evaluate each K
                for k in top_k_list:
                    top_k_codes = pred_codes[:k]
                    top_k_names = pred_names[:k]
                    
                    # Commodity match (by CODE or NAME)
                    commodity_match = (
                        true_commodity_code in top_k_codes or
                        true_commodity_name_clean in top_k_names
                    )
                    
                    metrics[f'top{k}'] += int(commodity_match)
                    precision_at_k[k].append(int(commodity_match) / k)
                    recall_at_k[k].append(int(commodity_match) / 1)
                    
                    # Hierarchy matching
                    segment_match = False
                    family_match = False
                    class_match = False
                    
                    for pred_code in top_k_codes:
                        if pred_code is None:
                            continue
                        
                        pred_hierarchy = self.code_to_hierarchy.get(pred_code, {})
                        
                        if pred_hierarchy.get('Segment Name') == true_segment:
                            segment_match = True
                        if pred_hierarchy.get('Family Name') == true_family:
                            family_match = True
                        if pred_hierarchy.get('Class Name') == true_class:
                            class_match = True
                    
                    segment_hits[k] += int(segment_match)
                    family_hits[k] += int(family_match)
                    class_hits[k] += int(class_match)
                
            except Exception as e:
                print(f"\n Error processing row {idx}: {e}")
                import traceback
                traceback.print_exc()
                continue
        
        print(f"\n Evaluation complete:")
        print(f"   Total evaluated: {total}")
        print(f"   Skipped: {skipped}")
        
        if total == 0:
            return {
                "error": "No valid products to evaluate", 
                "skipped": skipped,
                "hint": "Check if commodity names in products_df match catalog exactly"
            }
        
        # Calculate final results
        results = {}
        for k in top_k_list:
            results[f"Top-{k} Accuracy"] = (metrics[f'top{k}'] / total) * 100
            results[f"Precision@{k}"] = np.mean(precision_at_k[k]) * 100 if precision_at_k[k] else 0
            results[f"Recall@{k}"] = np.mean(recall_at_k[k]) * 100 if recall_at_k[k] else 0
            results[f"Segment Acc @{k}"] = (segment_hits[k] / total) * 100
            results[f"Family Acc @{k}"] = (family_hits[k] / total) * 100
            results[f"Class Acc @{k}"] = (class_hits[k] / total) * 100
        
        results["Total Evaluated"] = total
        results["Skipped"] = skipped
        
        return results


# ============================================================================
# VERIFICATION HELPER
# ============================================================================
def verify_data_compatibility(products_df, catalog_df):
    """Check if product commodity names exist in catalog"""
    
    print("\n" + "="*70)
    print("DATA COMPATIBILITY CHECK")
    print("="*70)
    
    # Get all commodity names from catalog
    catalog_names = set(catalog_df['Commodity Name'].str.strip().str.lower())
    
    print(f"\n1️ Catalog Info:")
    print(f"   Total commodities: {len(catalog_names)}")
    print(f"   Sample names: {list(catalog_names)[:3]}")
    
    # Get all commodity names from products
    product_names = products_df['UNSPSC Commodity Name'].str.strip().str.lower()
    unique_product_names = set(product_names)
    
    print(f"\n2️ Products Info:")
    print(f"   Total products: {len(products_df)}")
    print(f"   Unique commodity names: {len(unique_product_names)}")
    print(f"   Sample names: {list(unique_product_names)[:3]}")
    
    # Check matches
    matches = unique_product_names.intersection(catalog_names)
    missing = unique_product_names - catalog_names
    
    print(f"\n3️ Matching Analysis:")
    print(f"    Matching names: {len(matches)}/{len(unique_product_names)}")
    print(f"    Missing from catalog: {len(missing)}")
    
    if missing:
        print(f"\n   Missing names (first 5):")
        for name in list(missing)[:5]:
            print(f"      - '{name}'")
    
    match_rate = len(matches) / len(unique_product_names) * 100 if unique_product_names else 0
    print(f"\n   Match rate: {match_rate:.1f}%")
    
    if match_rate < 100:
        print(f"\n    WARNING: Not all product names found in catalog!")
        print(f"   This will cause evaluation to skip {len(missing)} unique commodities")
    
    print("="*70 + "\n")
    
    return {
        'total_products': len(products_df),
        'unique_names': len(unique_product_names),
        'matches': len(matches),
        'missing': len(missing),
        'match_rate': match_rate
    }


# ============================================================================
# RUN COMPLETE EVALUATION
# ============================================================================

print(" Step 1: Verifying data compatibility...")
compatibility = verify_data_compatibility(products_df, df)

print("\n🔧 Step 2: Initializing evaluator...")
evaluator = ProductEvaluator(products_df, df, retriever, reranker, merger, cleaner)

print("\n Step 3: Running evaluation...")
results = evaluator.evaluate(retrieve_k=50, rerank_k=20, top_k_list=[1, 5, 10])

print("\n" + "="*70)
print("FINAL EVALUATION RESULTS")
print("="*70)

for metric, value in results.items():
    if isinstance(value, (int, float)):
        if metric in ["Total Evaluated", "Skipped"]:
            print(f"{metric:25s}: {value}")
        else:
            print(f"{metric:25s}: {value:6.2f}%")

print("="*70)

# Sanity check
if 'Top-1 Accuracy' in results and results.get('Segment Acc @1', 0) > 0:
    top1 = results['Top-1 Accuracy']
    seg1 = results['Segment Acc @1']
    
    print(f"\n SANITY CHECK:")
    print(f"   Top-1 Accuracy: {top1:.1f}%")
    print(f"   Segment Acc @1: {seg1:.1f}%")
    
    if seg1 >= top1:
        print(f"    PASS: Segment ≥ Commodity (as expected)")
    else:
        print(f"    FAIL: Segment < Commodity (shouldn't happen!)")

print("\n")


print(" All classes are defined")


 Step 1: Verifying data compatibility...

DATA COMPATIBILITY CHECK

1️ Catalog Info:
   Total commodities: 71502
   Sample names: ['canned or jarred red beard bunching onions', 'canned or jarred organic maraschino cherries', 'educational or teacher associations']

2️ Products Info:
   Total products: 100
   Unique commodity names: 72
   Sample names: ['refrigerant compressors', 'towel bar or ring or stand or hook', 'furnaces']

3️ Matching Analysis:
    Matching names: 72/72
    Missing from catalog: 0

   Match rate: 100.0%


🔧 Step 2: Initializing evaluator...
 Built lookups:
   - Code → Hierarchy: 71502 entries
   - Name → Code: 71502 entries

 Products DF columns: ['Original Description', 'UNSPSC Commodity Name']

 Step 3: Running evaluation...

STARTING EVALUATION

 Product #0:
  Description: BW RG2PV75H6X 75 GAL LP GAS POWER VENTED WATER HEATER STANDA...
  True Commodity: Domestic water heaters
  True Code: 40101825
  True Segment: Distribution and Conditioning Systems and Equipm

In [14]:
print("\n🔧 Initializing components...")

cleaner = TextCleaner(remove_arbitrary_alphanum=False)
retriever = HybridRetriever(df, loaded_vectorstore, embeddings, cleaner=cleaner)
reranker = CrossEncoderReranker()
merger = ScoreMerger(alpha=0.4, beta=0.6)

print(" All components are initialized")


🔧 Initializing components...
 BM25 index ready
 Cross-encoder loaded on cpu
 All components are initialized


In [15]:
print("\n Testing on a single product...")

test_description = products_df.iloc[0]['Original Description']
print(f"Product: {test_description}")

candidates = retriever.retrieve(test_description, top_k=50)
print(f"Retrieved {len(candidates)} candidates")

candidates = reranker.rerank(test_description, candidates, top_n=10)
print(f"Reranked to top {len(candidates)}")

final_candidates = merger.merge(candidates)

print("\n Top 5 Predictions:")
for i, c in enumerate(final_candidates[:5], 1):
    print(f"{i}. {c['commodity_name']} (Score: {c['final_score']:.4f})")

print(f"\nThe Ground Truth: {products_df.iloc[0].get('UNSPSC Commodity Name', 'N/A')}")




🧪 Testing on a single product...
Product: BW RG2PV75H6X 75 GAL LP GAS POWER VENTED WATER HEATER STANDARD W/SIDE CONNECTIONS FOR SPACE HTG 76KBTU 6YR 70""H X 26""DIA
Retrieved 100 candidates
Reranked to top 10

 Top 5 Predictions:
1. Gas fueled fireplace B vent (Score: 0.6084)
2. Gas turbine generator (Score: 0.4500)
3. Gas turbine control panels (Score: 0.4472)
4. Space heaters (Score: 0.3888)
5. Gas generators (Score: 0.3794)

 The Ground Truth: Domestic water heaters


In [18]:
print("\n" + "="*70)
print("RUNNING FULL EVALUATION:")
print("="*70)

evaluator = ProductEvaluator(products_df,df,retriever, reranker, merger, cleaner)
results = evaluator.evaluate(retrieve_k=50, rerank_k=20, top_k_list=[1, 5, 10])
print(evaluator)
print(results)
print("\n" + "="*70)
print("EVALUATION RESULTS:")
print("="*70)

for metric, value in results.items():
    if isinstance(value, (int, float)):
        if metric == "Total Evaluated":
            print(f"{metric:25s}: {value}")
        else:
            print(f"{metric:25s}: {value:6.2f}%")

print("="*70 + "\n")


RUNNING FULL EVALUATION:
<__main__.ProductEvaluator object at 0x000001754CAFD810>
{'error': 'No valid products to evaluate'}

EVALUATION RESULTS:



In [None]:
# Save results to CSV
results_df = pd.DataFrame([results])
results_df.to_csv("evaluation_results.csv", index=False)
print(" Results saved to evaluation_results.csv")