## EDA


In [2]:
import pandas as pd
import sklearn as sk
import numpy as np


In [3]:
df=pd.read_csv("G:/Dylog_Internship_Assessments/data/dylog_unspsc_data.csv")
df.head(5)

Unnamed: 0,Segment Code,Segment Name,Family Code,Family Name,Class Code,Class Name,Commodity Code,Commodity Name
0,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101501,Cats
1,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101502,Dogs
2,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101504,Mink
3,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101505,Rats
4,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101506,Horses


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71502 entries, 0 to 71501
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Segment Code    71502 non-null  int64 
 1   Segment Name    71502 non-null  object
 2   Family Code     71502 non-null  int64 
 3   Family Name     71502 non-null  object
 4   Class Code      71502 non-null  int64 
 5   Class Name      71502 non-null  object
 6   Commodity Code  71502 non-null  int64 
 7   Commodity Name  71502 non-null  object
dtypes: int64(4), object(4)
memory usage: 4.4+ MB


In [5]:
##checking for null and missing values
df.isna().sum()


Segment Code      0
Segment Name      0
Family Code       0
Family Name       0
Class Code        0
Class Name        0
Commodity Code    0
Commodity Name    0
dtype: int64

In [6]:
##checking for duplicates
df.duplicated().sum()

np.int64(0)

In [7]:
df.nunique()

Segment Code         57
Segment Name         57
Family Code         465
Family Name         465
Class Code         5313
Class Name         5313
Commodity Code    71502
Commodity Name    71502
dtype: int64

Therefore we can consider that, this dataset is clean. We have a unique Commodity Code for each commodity in dataset.

In [8]:
#now we are going to create a corpus for retriver
df['corpus']=(df["Segment Name"]+" "+df["Family Name"]+" "+df["Class Name"]+" "+df["Commodity Name"]).str.lower()
df["corpus"].head(4)
df["corpus"][0]

'live plant and animal material and accessories and supplies live animals livestock cats'

In [9]:
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

In [10]:
chunks=[]
for i,row in df.iterrows():
    doc=Document(
        page_content=row["corpus"],
        metadata={
            "Segment Code": row["Segment Code"],
            "Segment Name": row["Segment Name"],
            "Family Code": row["Family Code"],
            "Family Name": row["Family Name"],
            "Class Code": row["Class Code"],
            "Class Name": row["Class Name"],
            "Commodity Code": row["Commodity Code"],
            "Commodity Name": row["Commodity Name"],
        }
    )
    chunks.append(doc)

print(chunks[0])


page_content='live plant and animal material and accessories and supplies live animals livestock cats' metadata={'Segment Code': 10000000, 'Segment Name': 'Live Plant and Animal Material and Accessories and Supplies', 'Family Code': 10100000, 'Family Name': 'Live animals', 'Class Code': 10101500, 'Class Name': 'Livestock', 'Commodity Code': 10101501, 'Commodity Name': 'Cats'}


In [11]:
#successfully converted the dataset into documents 
print(chunks[38024])

page_content='food beverage and tobacco products fresh vegetables chicories salad king chicory' metadata={'Segment Code': 50000000, 'Segment Name': 'Food Beverage and Tobacco Products', 'Family Code': 50400000, 'Family Name': 'Fresh vegetables', 'Class Code': 50402900, 'Class Name': 'Chicories', 'Commodity Code': 50402907, 'Commodity Name': 'Salad king chicory'}


In [12]:
embeddings=HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2"
)

In [13]:
#small portion of dataset just to check
#small_shunk=chunks[:500]

In [14]:
# vectorstore=FAISS.from_documents(
#     documents=chunks,
#     embedding=embeddings,
# )

In [15]:
# vectorstore.save_local("faiss.index")


In [16]:
loaded_vectorstore=FAISS.load_local(
    "faiss.index",
    embeddings,
    allow_dangerous_deserialization=True,
)
num_documents = len(loaded_vectorstore.docstore._dict)
num_documents

71502

In [17]:
#addressing the search queries sample
loaded_vectorstore.similarity_search(query="cat snacks",k=5)

[Document(id='3269fd11-28f7-49d4-b38f-a6860b73803e', metadata={'Segment Code': 10000000, 'Segment Name': 'Live Plant and Animal Material and Accessories and Supplies', 'Family Code': 10120000, 'Family Name': 'Animal feed', 'Class Code': 10121800, 'Class Name': 'Dog and cat food', 'Commodity Code': 10121805, 'Commodity Name': 'Moist food for cats'}, page_content='live plant and animal material and accessories and supplies animal feed dog and cat food moist food for cats'),
 Document(id='ad157682-7e16-4b54-919a-ab665e8402d8', metadata={'Segment Code': 10000000, 'Segment Name': 'Live Plant and Animal Material and Accessories and Supplies', 'Family Code': 10120000, 'Family Name': 'Animal feed', 'Class Code': 10121800, 'Class Name': 'Dog and cat food', 'Commodity Code': 10121804, 'Commodity Name': 'Dry food for cats'}, page_content='live plant and animal material and accessories and supplies animal feed dog and cat food dry food for cats'),
 Document(id='e487aa78-d96d-40fb-ada1-d34cd276fb0f

In [18]:
loaded_vectorstore.similarity_search_with_relevance_scores(query="bird seeds parrot food",k=100)

[(Document(id='694a7c08-3f6c-40e4-a403-bab23cd43f4a', metadata={'Segment Code': 10000000, 'Segment Name': 'Live Plant and Animal Material and Accessories and Supplies', 'Family Code': 10120000, 'Family Name': 'Animal feed', 'Class Code': 10121600, 'Class Name': 'Bird and fowl food', 'Commodity Code': 10121602, 'Commodity Name': 'Bird seed'}, page_content='live plant and animal material and accessories and supplies animal feed bird and fowl food bird seed'),
  np.float32(0.47228205)),
 (Document(id='990b20bd-b536-45df-b48a-3b2b3ecc0f96', metadata={'Segment Code': 50000000, 'Segment Name': 'Food Beverage and Tobacco Products', 'Family Code': 50130000, 'Family Name': 'Dairy products and eggs', 'Class Code': 50131600, 'Class Name': 'Eggs and egg substitutes', 'Commodity Code': 50131624, 'Commodity Name': 'In shell nest run egg from birds other than chickens'}, page_content='food beverage and tobacco products dairy products and eggs eggs and egg substitutes in shell nest run egg from birds 

In [19]:
# retirver=loaded_vectorstore.as_retriever(
#     search_type="similarity",
#     search_kwargs={"k":3}
# )
# retirver.stream(input="bird seeds parrot food")

In [20]:
search_queries=pd.read_csv("G:/Dylog_Internship_Assessments/data/dylog_search_queries_unspsc.csv")
search_queries.head()

Unnamed: 0,Search Query,UNSPSC Commodity Name
0,PVC INSERT,PVC plastic pipe adapter
1,Snap-In Bend Support,Plumbing hangers
2,BACKSTOP WELL TANK,Water storage tanks
3,RADFIT 3/4 BEND SUPPORT,Plumbing hangers
4,SMOKE TEE,Carbon steel pipe tee


In [21]:
class HybridRetriever:
    def __init__(self, df, vectorstore, embeddings):
        from rank_bm25 import BM25Okapi
        self.df = df
        self.vectorstore = vectorstore
        self.embeddings = embeddings
        # BM25 initialization
        self.tokenized_docs = [d.lower().split() for d in df['Commodity Name'].tolist()]
        self.bm25 = BM25Okapi(self.tokenized_docs)
        print("BM25 index ready.")

    def retrieve(self, query, top_k=50):
        # Semantic
        sem_results = self.vectorstore.similarity_search_with_score(query, k=top_k)
        for c in sem_results:
            c['sem_score'] = float(c.metadata.get('score', 1.0))  # fallback
        # Lexical BM25
        tokenized_query = query.lower().split()
        bm25_scores = self.bm25.get_scores(tokenized_query)
        top_indices = np.argsort(-bm25_scores)[:top_k]
        lex_results = []
        for idx in top_indices:
            lex_results.append({
                'doc_text': df.iloc[idx]['Commodity Name'],
                'metadata': df.iloc[idx].to_dict(),
                'lex_score': float(bm25_scores[idx])
            })
        # Merge sem + lex, deduplicate by doc_text
        merged = {c['doc_text']: c for c in sem_results}
        for c in lex_results:
            if c['doc_text'] in merged:
                merged[c['doc_text']]['lex_score'] = c['lex_score']
            else:
                merged[c['doc_text']] = c
        return list(merged.values())

In [22]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np

class CrossEncoderReranker:
    def __init__(self, model_name="cross-encoder/ms-marco-MiniLM-L-6-v2", device=None):
        """
        Initialize the cross-encoder reranker.
        model_name: HuggingFace model suitable for cross-encoder reranking
        device: 'cuda' or 'cpu'
        """
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()

    def rerank(self, query, candidates, top_n=5):
        """
        Rerank candidates given a query.
        query: str
        candidates: list of dicts, each containing 'doc_text' and optionally 'metadata'
        top_n: number of top candidates to return
        """
        candidate_texts = [c['doc_text'] for c in candidates]

        # Encode query-candidate pairs
        encodings = self.tokenizer([query]*len(candidate_texts), candidate_texts,
                                   padding=True, truncation=True, return_tensors="pt").to(self.device)

        with torch.no_grad():
            scores = self.model(**encodings).logits.squeeze(-1)
            scores = scores.cpu().numpy()

        # Attach scores to candidates
        for i, cand in enumerate(candidates):
            cand['cross_score'] = float(scores[i])

        # Sort by score descending
        reranked = sorted(candidates, key=lambda x: x['cross_score'], reverse=True)

        return reranked[:top_n]

# ---------------------------
# Example Usage
# ---------------------------
# Initialize reranker
reranker = CrossEncoderReranker()

# Query
query_text = "PVC INSERT"
#addressing the search queries sample
candidates=loaded_vectorstore.similarity_search_with_score(query=query_text,k=50)
candidate_dicts = [{'doc_text': d.page_content, 'metadata': d.metadata} for d in candidates]
# Suppose candidates is your top-k list from FAISS/BM25 hybrid retriever
# candidates = [{'doc_text': "...", 'metadata': {...}}, ...] 

top_n = 5
reranked_results = reranker.rerank(query_text, candidate_dicts, top_n=top_n)

# Display
for i, r in enumerate(reranked_results, 1):
    print(f"{i}. Score: {r['cross_score']:.4f} | Candidate: {r['doc_text']}")


AttributeError: 'tuple' object has no attribute 'page_content'

In [None]:
import numpy as np

class ScoreMerger:
    """
    Combines hybrid retriever scores (semantic + lexical) with cross-encoder scores
    and produces a final ranked list of candidates.
    """
    def __init__(self, alpha=0.5, beta=0.5):
        """
        alpha: weight for hybrid iever score
        beta: weight for cross-encoder score
        alpha + beta should ideally sum to 1
        """
        self.alpha = alpha
        self.beta = beta

    def normalize_scores(self, scores):
        """Normalize a list/array of scores to 0-1 range"""
        scores = np.array(scores, dtype=np.float32)
        min_s = scores.min()
        max_s = scores.max()
        if max_s - min_s == 0:
            return np.ones_like(scores)  # all same score
        return (scores - min_s) / (max_s - min_s)

    def merge(self, candidates):
        """
        candidates: list of dicts with keys:
            - 'sem_score' : hybrid iever score
            - 'lex_score' : lexical score (optional)
            - 'cross_score' : cross-encoder score
            - 'doc_text' : document text
            - 'metadata' : optional metadata
        Returns:
            list of candidates sorted by final combined score
        """
        sem_scores = [c.get("sem_score", 0) for c in candidates]
        lex_scores = [c.get("lex_score", 0) for c in candidates]
        cross_scores = [c.get("cross_score", 0) for c in candidates]

        # Combine semantic + lexical hybrid score first
        hybrid_scores = np.array(sem_scores) + np.array(lex_scores)
        hybrid_scores = self.normalize_scores(hybrid_scores)
        cross_scores = self.normalize_scores(cross_scores)

        # Weighted combination
        final_scores = self.alpha * hybrid_scores + self.beta * cross_scores

        # Attach final score to candidates
        for i, c in enumerate(candidates):
            c["final_score"] = float(final_scores[i])

        # Sort descending
        reranked = sorted(candidates, key=lambda x: x["final_score"], reverse=True)
        return reranked


In [None]:
# Suppose you have candidates from your hybrid iever
# candidates = loaded_vectorstore.similarity_search(query="PVC INSERT", k=50)
# and cross-encoder scores already attached

merger = ScoreMerger(alpha=0.4, beta=0.6)  # tweak weights if needed
final_ranked = merger.merge(candidate_dicts)

# Display top 5
for i, c in enumerate(final_ranked[:5], 1):
    print(f"{i}. Score: {c['final_score']:.4f} | Candidate: {c['doc_text']}")


1. Score: 1.0000 | Candidate: distribution and conditioning systems and equipment and components pipe piping and pipe fittings pipe connectors pvc plastic pipe connector
2. Score: 0.9934 | Candidate: manufacturing components and supplies adhesives and sealants tape polyvinyl chloride pvc tape
3. Score: 0.9795 | Candidate: manufacturing components and supplies moldings thermoplastic molding inserts thermoplastic gas assisted injection molding insert
4. Score: 0.9687 | Candidate: distribution and conditioning systems and equipment and components pipe piping and pipe fittings pipe plugs pvc plastic pipe plug
5. Score: 0.9496 | Candidate: manufacturing components and supplies moldings thermoplastic molding inserts thermoplastic high precision injection molding insert


In [None]:
queries_list = search_queries.to_dict('records')

# Initialize
retriever = HybridRetriever(df, loaded_vectorstore, embeddings)
reranker = CrossEncoderReranker()
merger = ScoreMerger(alpha=0.5, beta=0.5)

# Evaluation accumulators
top1_correct = 0
top5_correct = 0
top10_correct = 0
precision_at_5 = []
recall_at_5 = []

# Loop through queries
for q in queries_list:
    query_text = q['Search Query']
    true_label = q['UNSPSC Commodity Name']

    # Step 1: Hybrid retrieval
    candi = retriever.retrieve(query_text, top_k=50)

    # Step 2: Cross-encoder rerank top 10
    candi = reranker.rerank(query_text, candidates, top_n=10)

    # Step 3: Merge scores (weighted)
    final_candidates = merger.merge(candi)

    # Step 4: Evaluate Top-K
    top1 = [c['doc_text'] for c in final_candidates[:1]]
    top5 = [c['doc_text'] for c in final_candidates[:5]]
    top10 = [c['doc_text'] for c in final_candidates[:10]]

    top1_correct += int(true_label in top1)
    top5_correct += int(true_label in top5)
    top10_correct += int(true_label in top10)

    # Precision@5, Recall@5
    precision_at_5.append(int(true_label in top5)/5)
    recall_at_5.append(int(true_label in top5)/1)  # only one relevant per query

# -------------------------------
# Metrics
num_queries = len(queries_list)
print(f"Top-1 Accuracy: {top1_correct/num_queries*100:.2f}%")
print(f"Top-5 Accuracy: {top5_correct/num_queries*100:.2f}%")
print(f"Top-10 Accuracy: {top10_correct/num_queries*100:.2f}%")
print(f"Precision@5: {np.mean(precision_at_5)*100:.2f}%")
print(f"Recall@5: {np.mean(recall_at_5)*100:.2f}%")

BM25 index ready.


TypeError: 'Document' object does not support item assignment

In [None]:
def convert_docs_to_dicts(results, include_scores=True):
    """
    Converts a list of Document objects (from LangChain/FAISS/BM25 ievers)
    into a standardized list of dictionaries.
    
    Args:
        results: list of (Document, score) tuples or just Document objects
        include_scores: whether to include 'sem_score' key (if scores are provided)

    Returns:
        List[dict] with keys: 'doc_text', 'metadata', 'sem_score' (optional)
    """
    converted = []
    for item in results:
        if isinstance(item, tuple):  # (Document, score)
            doc, score = item
            entry = {
                "doc_text": doc.page_content,
                "metadata": getattr(doc, "metadata", {}),
            }
            if include_scores:
                entry["sem_score"] = float(score)
        else:  # Just Document (no score)
            entry = {
                "doc_text": item.page_content,
                "metadata": getattr(item, "metadata", {}),
            }
            if include_scores:
                entry["sem_score"] = 0.0
        converted.append(entry)
    return converted


In [None]:
# -------------------------------
# Full Evaluation Pipeline Cell
# -------------------------------
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score

# -------------------------------
# Assume you already have:
# df = catalog dataframe
# loaded_vectorstore = FAISS vectorstore
# embeddings = embedding model
# -------------------------------

# Hybrid Retriever Class (already implemented)
class HybridRetriever:
    def __init__(self, df, vectorstore, embeddings):
        from rank_bm25 import BM25Okapi
        self.df = df
        self.vectorstore = vectorstore
        self.embeddings = embeddings
        # BM25 initialization
        self.tokenized_docs = [d.lower().split() for d in df['Commodity Name'].tolist()]
        self.bm25 = BM25Okapi(self.tokenized_docs)
        print("BM25 index ready.")

    def retrieve(self, query, top_k=50):
        # Semantic
        sem_results = self.vectorstore.similarity_search_with_score(query, k=top_k)
        for c in sem_results:
            c['sem_score'] = float(c.metadata.get('score', 1.0))  # fallback
        # Lexical BM25
        tokenized_query = query.lower().split()
        bm25_scores = self.bm25.get_scores(tokenized_query)
        top_indices = np.argsort(-bm25_scores)[:top_k]
        lex_results = []
        for idx in top_indices:
            lex_results.append({
                'doc_text': df.iloc[idx]['Commodity Name'],
                'metadata': df.iloc[idx].to_dict(),
                'lex_score': float(bm25_scores[idx])
            })
        # Merge sem + lex, deduplicate by doc_text
        merged = {c['doc_text']: c for c in sem_results}
        for c in lex_results:
            if c['doc_text'] in merged:
                merged[c['doc_text']]['lex_score'] = c['lex_score']
            else:
                merged[c['doc_text']] = c
        return list(merged.values())

# Cross-Encoder Reranker Class (already implemented)
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

class CrossEncoderReranker:
    def __init__(self, model_name="cross-encoder/ms-marco-MiniLM-L-6-v2", device=None):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()

    def rerank(self, query, candidates, top_n=5):
        candidate_texts = [c['doc_text'] for c in candidates]
        encodings = self.tokenizer([query]*len(candidate_texts), candidate_texts,
                                   padding=True, truncation=True, return_tensors="pt").to(self.device)
        with torch.no_grad():
            scores = self.model(**encodings).logits.squeeze(-1)
            scores = scores.cpu().numpy()
        for i, cand in enumerate(candidates):
            cand['cross_score'] = float(scores[i])
        reranked = sorted(candidates, key=lambda x: x['cross_score'], reverse=True)
        return reranked[:top_n]

# Score Merger Class (already implemented)
class ScoreMerger:
    def __init__(self, alpha=0.5, beta=0.5):
        self.alpha = alpha
        self.beta = beta
    def normalize_scores(self, scores):
        scores = np.array(scores, dtype=np.float32)
        min_s, max_s = scores.min(), scores.max()
        if max_s - min_s == 0:
            return np.ones_like(scores)
        return (scores - min_s) / (max_s - min_s)
    def merge(self, candidates):
        sem_scores = [c.get('sem_score',0) for c in candidates]
        lex_scores = [c.get('lex_score',0) for c in candidates]
        cross_scores = [c.get('cross_score',0) for c in candidates]
        hybrid_scores = self.normalize_scores(np.array(sem_scores)+np.array(lex_scores))
        cross_scores = self.normalize_scores(cross_scores)
        final_scores = self.alpha*hybrid_scores + self.beta*cross_scores
        for i, c in enumerate(candidates):
            c['final_score'] = float(final_scores[i])
        return sorted(candidates, key=lambda x: x['final_score'], reverse=True)

# -------------------------------
# Load labeled queries
queries_df = pd.read_csv("G:/Dylog_Internship_Assessments/data/dylog_search_queries_unspsc.csv")
queries_list = queries_df.to_dict('records')

# Initialize
retriever = HybridRetriever(df, loaded_vectorstore, embeddings)
reranker = CrossEncoderReranker()
merger = ScoreMerger(alpha=0.5, beta=0.5)

# Evaluation accumulators
top1_correct = 0
top5_correct = 0
top10_correct = 0
precision_at_5 = []
recall_at_5 = []

# Loop through queries
for q in queries_list:
    query_text = q['Search Query']
    true_label = q['UNSPSC Commodity Name']

    # Step 1: Hybrid retrieval
    candidates = retriever.retrieve(query_text, top_k=50)

    # Step 2: Cross-encoder rerank top 10
    candidates = reranker.rerank(query_text, candidates, top_n=10)

    # Step 3: Merge scores (weighted)
    final_candidates = merger.merge(candidates)

    # Step 4: Evaluate Top-K
    top1 = [c['doc_text'] for c in final_candidates[:1]]
    top5 = [c['doc_text'] for c in final_candidates[:5]]
    top10 = [c['doc_text'] for c in final_candidates[:10]]

    top1_correct += int(true_label in top1)
    top5_correct += int(true_label in top5)
    top10_correct += int(true_label in top10)

    # Precision@5, Recall@5
    precision_at_5.append(int(true_label in top5)/5)
    recall_at_5.append(int(true_label in top5)/1)  # only one relevant per query

# -------------------------------
# Metrics
num_queries = len(queries_list)
print(f"Top-1 Accuracy: {top1_correct/num_queries*100:.2f}%")
print(f"Top-5 Accuracy: {top5_correct/num_queries*100:.2f}%")
print(f"Top-10 Accuracy: {top10_correct/num_queries*100:.2f}%")
print(f"Precision@5: {np.mean(precision_at_5)*100:.2f}%")
print(f"Recall@5: {np.mean(recall_at_5)*100:.2f}%")


BM25 index ready.


TypeError: 'Document' object does not support item assignment

In [25]:
# -------------------------------
# 📦 Imports
# -------------------------------
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from rank_bm25 import BM25Okapi

# -------------------------------
# 🧩 Utility — Convert Documents
# -------------------------------
def convert_docs_to_dicts(results, include_scores=True):
    """
    Converts Document objects (or tuples) from retrievers to a list of dicts.
    Ensures consistent format across FAISS and BM25 results.
    """
    converted = []
    for item in results:
        if isinstance(item, tuple):  # (Document, score)
            doc, score = item
            entry = {
                "doc_text": doc.page_content,
                "metadata": getattr(doc, "metadata", {}),
            }
            if include_scores:
                entry["sem_score"] = float(score)
        else:  # Just Document
            entry = {
                "doc_text": item.page_content,
                "metadata": getattr(item, "metadata", {}),
            }
            if include_scores:
                entry["sem_score"] = 0.0
        converted.append(entry)
    return converted


# -------------------------------
# ⚙️ Hybrid Retriever
# -------------------------------
class HybridRetriever:
    def __init__(self, df, vectorstore, embeddings):
        self.df = df
        self.vectorstore = vectorstore
        self.embeddings = embeddings
        self.tokenized_docs = [d.lower().split() for d in df['Commodity Name'].tolist()]
        self.bm25 = BM25Okapi(self.tokenized_docs)
        print("✅ BM25 index ready.")

    def retrieve(self, query, top_k=50):
        # Semantic retrieval (FAISS)
        sem_results = self.vectorstore.similarity_search_with_score(query, k=top_k)
        sem_candidates = convert_docs_to_dicts(sem_results)

        # Lexical retrieval (BM25)
        tokenized_query = query.lower().split()
        bm25_scores = self.bm25.get_scores(tokenized_query)
        top_indices = np.argsort(-bm25_scores)[:top_k]
        lex_results = [
            {
                'doc_text': self.df.iloc[idx]['Commodity Name'],
                'metadata': self.df.iloc[idx].to_dict(),
                'lex_score': float(bm25_scores[idx])
            }
            for idx in top_indices
        ]

        # Merge & deduplicate
        merged = {c['doc_text']: c for c in sem_candidates}
        for c in lex_results:
            if c['doc_text'] in merged:
                merged[c['doc_text']]['lex_score'] = c['lex_score']
            else:
                merged[c['doc_text']] = c
        return list(merged.values())


# -------------------------------
# 🤖 Cross-Encoder Reranker
# -------------------------------
class CrossEncoderReranker:
    def __init__(self, model_name="cross-encoder/ms-marco-MiniLM-L-6-v2", device=None):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()

    def rerank(self, query, candidates, top_n=10):
        candidate_texts = [c['doc_text'] for c in candidates]
        encodings = self.tokenizer([query]*len(candidate_texts), candidate_texts,
                                   padding=True, truncation=True, return_tensors="pt").to(self.device)
        with torch.no_grad():
            scores = self.model(**encodings).logits.squeeze(-1)
            scores = scores.cpu().numpy()
        for i, cand in enumerate(candidates):
            cand['cross_score'] = float(scores[i])
        return sorted(candidates, key=lambda x: x['cross_score'], reverse=True)[:top_n]


# -------------------------------
# ⚖️ Score Merger
# -------------------------------
class ScoreMerger:
    def __init__(self, alpha=0.35, beta=0.65):
        self.alpha = alpha
        self.beta = beta

    def normalize_scores(self, scores):
        scores = np.array(scores, dtype=np.float32)
        if scores.size == 0:
            return scores
        min_s, max_s = scores.min(), scores.max()
        if max_s - min_s == 0:
            return np.ones_like(scores)
        return (scores - min_s) / (max_s - min_s)

    def merge(self, candidates):
        sem_scores = [c.get('sem_score', 0) for c in candidates]
        lex_scores = [c.get('lex_score', 0) for c in candidates]
        cross_scores = [c.get('cross_score', 0) for c in candidates]

        hybrid_scores = self.normalize_scores(np.array(sem_scores) + np.array(lex_scores))
        cross_scores = self.normalize_scores(cross_scores)

        final_scores = self.alpha * hybrid_scores + self.beta * cross_scores

        for i, c in enumerate(candidates):
            c['final_score'] = float(final_scores[i])

        return sorted(candidates, key=lambda x: x['final_score'], reverse=True)


# -------------------------------
# 🧮 Evaluation Pipeline
# -------------------------------
queries_df = pd.read_csv("G:/Dylog_Internship_Assessments/data/dylog_search_queries_unspsc.csv")
queries_list = queries_df.to_dict('records')

retriever = HybridRetriever(df, loaded_vectorstore, embeddings)
reranker = CrossEncoderReranker()
merger = ScoreMerger(alpha=0.4, beta=0.6)

top1_correct = top5_correct = top10_correct = 0
precision_at_5 = []
recall_at_5 = []

for q in queries_list:
    query_text = q['Search Query']
    true_label = q['UNSPSC Commodity Name']

    # Step 1: Retrieve hybrid results
    candidates = retriever.retrieve(query_text, top_k=50)

    # Step 2: Cross-encoder rerank
    candidates = reranker.rerank(query_text, candidates, top_n=10)

    # Step 3: Merge scores
    final_candidates = merger.merge(candidates)

    # Step 4: Evaluate
    top1 = [c['doc_text'] for c in final_candidates[:1]]
    top5 = [c['doc_text'] for c in final_candidates[:5]]
    top10 = [c['doc_text'] for c in final_candidates[:10]]

    top1_correct += int(true_label in top1)
    top5_correct += int(true_label in top5)
    top10_correct += int(true_label in top10)

    precision_at_5.append(int(true_label in top5) / 5)
    recall_at_5.append(int(true_label in top5) / 1)

# -------------------------------
# 📊 Final Metrics
# -------------------------------
num_queries = len(queries_list)
print(f"✅ Top-1 Accuracy:  {top1_correct / num_queries * 100:.2f}%")
print(f"✅ Top-5 Accuracy:  {top5_correct / num_queries * 100:.2f}%")
print(f"✅ Top-10 Accuracy: {top10_correct / num_queries * 100:.2f}%")
print(f"✅ Precision@5:     {np.mean(precision_at_5) * 100:.2f}%")
print(f"✅ Recall@5:        {np.mean(recall_at_5) * 100:.2f}%")


✅ BM25 index ready.
✅ Top-1 Accuracy:  6.00%
✅ Top-5 Accuracy:  16.00%
✅ Top-10 Accuracy: 20.00%
✅ Precision@5:     3.20%
✅ Recall@5:        16.00%


In [27]:
import pandas as pd
import json

class TripletGenerator:
    """
    Generates (query, positive, negative) triplets for fine-tuning.
    Ensures no duplicate negatives per query.
    Exports triplets in JSONL format.
    """
    def __init__(self, queries_df, vectorstore, top_k=50, num_negatives=5):
        self.queries_df = queries_df
        self.vectorstore = vectorstore
        self.top_k = top_k
        self.num_negatives = num_negatives

    def generate_triplets(self):
        triplets = []

        # Group by unique query + positive combination
        grouped = self.queries_df.groupby(['Search Query', 'UNSPSC Commodity Name']).size().reset_index()

        for _, row in grouped.iterrows():
            query = row['Search Query']
            positive_label = row['UNSPSC Commodity Name']

            # Retrieve top-k candidates from hybrid retriever or FAISS
            candidates = self.vectorstore.similarity_search_with_score(query, k=self.top_k)

            # Extract candidate texts
            #candidate_texts = [c.page_content if hasattr(c, 'page_content') else c['doc_text'] for c in candidates]

            # Unpack documents from (Document, score) tuples
            candidate_texts = [doc.page_content for doc, score in candidates]

            # Select hard negatives: top N that are NOT the positive
            negatives = []
            for c in candidate_texts:
                if c != positive_label and c not in negatives:
                    negatives.append(c)
                if len(negatives) >= self.num_negatives:
                    break

            if negatives:  # only keep if at least one negative
                triplets.append({
                    "query": query,
                    "positive": positive_label,
                    "negatives": negatives
                })

        return triplets

    def save_jsonl(self, triplets, file_path):
        """
        Saves the triplets as JSONL (one JSON object per line)
        """
        with open(file_path, 'w', encoding='utf-8') as f:
            for t in triplets:
                f.write(json.dumps(t) + "\n")

# ---------------------------
# Example usage
# ---------------------------
queries_df = pd.read_csv("G:/Dylog_Internship_Assessments/data/dylog_search_queries_unspsc.csv")

triplet_gen = TripletGenerator(queries_df, loaded_vectorstore, top_k=50, num_negatives=5)
triplets = triplet_gen.generate_triplets()

# Save as JSONL
triplet_gen.save_jsonl(triplets, "triplets_dataset.jsonl")

# Quick preview
print(triplets[:3])


[{'query': '1 inch Viega PexPress coupling', 'positive': 'Brass pipe coupling', 'negatives': ['manufacturing components and supplies hardware couplings dura flex coupling', 'manufacturing components and supplies hardware couplings clamp coupling', 'manufacturing components and supplies hardware couplings miniature couplings', 'manufacturing components and supplies hardware couplings flange type flexible coupling', 'manufacturing components and supplies hardware couplings elastomeric couplings']}, {'query': '1-1/2 PVC DWV Trap Adapter', 'positive': 'PVC plastic pipe adapter', 'negatives': ['distribution and conditioning systems and equipment and components pipe piping and pipe fittings pipe adapters pvc plastic pipe adapter', 'distribution and conditioning systems and equipment and components fluid and gas distribution traps and strainers p trap', 'distribution and conditioning systems and equipment and components fluid and gas distribution traps and strainers j trap', 'distribution and

In [None]:
triplets[3:5]

[{'query': '1-1/4 x 1 ssfit bush 304',
  'positive': 'Stainless steel pipe bushing',
  'negatives': ['distribution and conditioning systems and equipment and components pipe piping and pipe fittings pipe bushings forged steel pipe bushing',
   'distribution and conditioning systems and equipment and components pipe piping and pipe fittings pipe bushings stainless steel pipe bushing',
   'manufacturing components and supplies bearings and bushings and wheels and gears bushings flange bushings',
   'commercial and military and private vehicles and their accessories and components transportation components and systems suspension system components automotive bushings',
   'manufacturing components and supplies bearings and bushings and wheels and gears bushings bushing sleeve']},
 {'query': '3 x 3 x 2 PVC DWV Reducing Sanitary Tee',
  'positive': 'PVC plastic pipe tee',
  'negatives': ['distribution and conditioning systems and equipment and components pipe piping and pipe fittings pipe te

In [28]:
import re
import pandas as pd

class ProductDescriptionCleaner:
    """
    Cleans the 'original_description' column in a product dataset.
    Can be extended with domain-specific replacements.
    """
    def __init__(self, domain_map=None):
        """
        domain_map: dict of word replacements for domain-specific terms.
        Future improvement :use a model or transformer to learn domain-specific replacements automatically instead of relying on a static map

        """
        self.domain_map = domain_map or {}

    def clean_text(self, text):
        if not isinstance(text, str):
            return ""
        text = text.lower()
        # Replace domain-specific abbreviations
        words = text.split()
        words = [self.domain_map.get(w, w) for w in words]
        text = " ".join(words)
        # Remove punctuation/special chars
        text = re.sub(r'[^\w\s]', ' ', text)
        # Remove arbitrary alphanumeric patterns of length >=6
        #text = re.sub(r'\b[a-zA-Z0-9]{6,}\b', '', text)
        # Remove extra spaces
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def clean_dataframe(self, df, column='original_description', new_column='cleaned_description'):
        """
        Cleans the specified column in a DataFrame and adds a new column with cleaned text.
        """
        df[new_column] = df[column].apply(self.clean_text)
        return df

# ---------------------------
# Example usage
# ---------------------------
domain_map = {
    "ss": "stainless steel",
    "pvc": "polyvinyl chloride",
    "vlv": "valve",
    "adpt": "adapter",
    "bush": "bushing"
}

df_products = pd.read_csv("G:/Dylog_Internship_Assessments/data/dylog_sample_product_unspsc.csv")
cleaner = ProductDescriptionCleaner(domain_map=domain_map)
df_products = cleaner.clean_dataframe(df_products, column='Original Description', new_column='cleaned_description')

df_products[['Original Description','cleaned_description']].head()



Unnamed: 0,Original Description,cleaned_description
0,BW RG2PV75H6X 75 GAL LP GAS POWER VENTED WATER...,bw rg2pv75h6x 75 gal lp gas power vented water...
1,VERSIPRO BISC ELG CFWC WOOD SEAT SI SW-985 ORA...,versipro bisc elg cfwc wood seat si sw 985 ora...
2,2 3000# Forged Steel Threaded Half Coupling FP...,2 3000 forged steel threaded half coupling fpt...
3,FUJ ASUH15KPAS 15000 BTU Standard-T,fuj asuh15kpas 15000 btu standard t
4,ALLIED 27W16 COMP ZP24K5E-PFV-830 2T R410A 230V-1,allied 27w16 comp zp24k5e pfv 830 2t r410a 230v 1


In [29]:
# -------------------------------
# 📦 Imports
# -------------------------------
import pandas as pd
import numpy as np
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from rank_bm25 import BM25Okapi

# -------------------------------
# 🧩 Text Cleaner
# -------------------------------
class TextCleaner:
    """
    Cleans original product descriptions or queries.
    """
    def __init__(self, mappings=None, remove_arbitrary_alphanum=True):
        """
        mappings: dict of replacements, e.g. {"PVC": "polyvinyl chloride"}
        remove_arbitrary_alphanum: removes patterns like 'rg2pv75h6x'
        """
        self.mappings = mappings or {}
        self.remove_arbitrary_alphanum = remove_arbitrary_alphanum
        self.arbitrary_pattern = re.compile(r'\b[a-zA-Z0-9]{6,}\b')  # 6+ chars alphanumeric

    def clean(self, text):
        text = str(text).lower()
        # Apply mapping replacements
        for k, v in self.mappings.items():
            text = text.replace(k.lower(), v.lower())
        # Remove arbitrary alphanumeric patterns
        if self.remove_arbitrary_alphanum:
            text = self.arbitrary_pattern.sub('', text)
        # Remove unwanted symbols except spaces
        text = re.sub(r'[^a-z0-9\s]', ' ', text)
        # Collapse multiple spaces
        text = re.sub(r'\s+', ' ', text).strip()
        return text

# -------------------------------
# ⚙️ Hybrid Retriever
# -------------------------------
class HybridRetriever:
    def __init__(self, df, vectorstore, embeddings, cleaner=None):
        self.df = df
        self.vectorstore = vectorstore
        self.embeddings = embeddings
        self.cleaner = cleaner
        self.tokenized_docs = [self.cleaner.clean(d) if cleaner else d.lower().split()
                               for d in df['Commodity Name'].tolist()]
        self.bm25 = BM25Okapi(self.tokenized_docs)
        print("✅ BM25 index ready.")

    def retrieve(self, query, top_k=50):
        query_clean = self.cleaner.clean(query) if self.cleaner else query
        # Semantic retrieval
        sem_results = self.vectorstore.similarity_search_with_score(query_clean, k=top_k)
        sem_candidates = convert_docs_to_dicts(sem_results)
        # Lexical BM25
        tokenized_query = query_clean.split()
        bm25_scores = self.bm25.get_scores(tokenized_query)
        top_indices = np.argsort(-bm25_scores)[:top_k]
        lex_results = [{"doc_text": self.df.iloc[idx]['Commodity Name'],
                        "metadata": self.df.iloc[idx].to_dict(),
                        "lex_score": float(bm25_scores[idx])} for idx in top_indices]
        # Merge
        merged = {c['doc_text']: c for c in sem_candidates}
        for c in lex_results:
            if c['doc_text'] in merged:
                merged[c['doc_text']]['lex_score'] = c['lex_score']
            else:
                merged[c['doc_text']] = c
        return list(merged.values())

# -------------------------------
# 🤖 Cross-Encoder Reranker
# -------------------------------
class CrossEncoderReranker:
    def __init__(self, model_name="cross-encoder/ms-marco-MiniLM-L-6-v2", device=None):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()

    def rerank(self, query, candidates, top_n=10):
        candidate_texts = [c['doc_text'] for c in candidates]
        encodings = self.tokenizer([query]*len(candidate_texts), candidate_texts,
                                   padding=True, truncation=True, return_tensors="pt").to(self.device)
        with torch.no_grad():
            scores = self.model(**encodings).logits.squeeze(-1).cpu().numpy()
        for i, cand in enumerate(candidates):
            cand['cross_score'] = float(scores[i])
        return sorted(candidates, key=lambda x: x['cross_score'], reverse=True)[:top_n]

# -------------------------------
# ⚖️ Score Merger
# -------------------------------
class ScoreMerger:
    def __init__(self, alpha=0.35, beta=0.65):
        self.alpha = alpha
        self.beta = beta

    def normalize_scores(self, scores):
        scores = np.array(scores, dtype=np.float32)
        if scores.size == 0: return scores
        min_s, max_s = scores.min(), scores.max()
        return np.ones_like(scores) if max_s - min_s == 0 else (scores - min_s)/(max_s - min_s)

    def merge(self, candidates):
        sem_scores = [c.get('sem_score',0) for c in candidates]
        lex_scores = [c.get('lex_score',0) for c in candidates]
        cross_scores = [c.get('cross_score',0) for c in candidates]
        hybrid_scores = self.normalize_scores(np.array(sem_scores) + np.array(lex_scores))
        cross_scores = self.normalize_scores(cross_scores)
        final_scores = self.alpha*hybrid_scores + self.beta*cross_scores
        for i, c in enumerate(candidates):
            c['final_score'] = float(final_scores[i])
        return sorted(candidates, key=lambda x: x['final_score'], reverse=True)

# -------------------------------
# 🧮 Evaluator (with cleaning)
# -------------------------------
class Evaluator:
    def __init__(self, queries_list, retriever, reranker, merger, cleaner=None):
        self.queries_list = queries_list
        self.retriever = retriever
        self.reranker = reranker
        self.merger = merger
        self.cleaner = cleaner

    def evaluate(self, top_k=(1,5,10)):
        metrics = {'top1':0, 'top5':0, 'top10':0, 'precision@5':[], 'recall@5':[],
                   'segment_acc':0, 'family_acc':0, 'class_acc':0}
        for q in self.queries_list:
            query_text = self.cleaner.clean(q['Search Query']) if self.cleaner else q['Search Query']
            true_label = q['UNSPSC Commodity Name']
            true_segment = q.get('Segment Name', '')
            true_family = q.get('Family Name', '')
            true_class = q.get('Class Name', '')

            candidates = self.retriever.retrieve(query_text, top_k=50)
            candidates = self.reranker.rerank(query_text, candidates, top_n=top_k[2])
            final_candidates = self.merger.merge(candidates)

            pred_docs = [c['doc_text'] for c in final_candidates]
            pred_metadata = [c['metadata'] for c in final_candidates]

            metrics['top1'] += int(true_label in pred_docs[:1])
            metrics['top5'] += int(true_label in pred_docs[:5])
            metrics['top10'] += int(true_label in pred_docs[:10])
            metrics['precision@5'].append(int(true_label in pred_docs[:5])/5)
            metrics['recall@5'].append(int(true_label in pred_docs[:5])/1)
            metrics['segment_acc'] += int(any(true_segment==m.get('Segment Name','') for m in pred_metadata[:5]))
            metrics['family_acc'] += int(any(true_family==m.get('Family Name','') for m in pred_metadata[:5]))
            metrics['class_acc'] += int(any(true_class==m.get('Class Name','') for m in pred_metadata[:5]))

        N = len(self.queries_list)
        return { "Top-1 Accuracy": metrics['top1']/N*100,
                 "Top-5 Accuracy": metrics['top5']/N*100,
                 "Top-10 Accuracy": metrics['top10']/N*100,
                 "Precision@5": np.mean(metrics['precision@5'])*100,
                 "Recall@5": np.mean(metrics['recall@5'])*100,
                 "Segment Accuracy": metrics['segment_acc']/N*100,
                 "Family Accuracy": metrics['family_acc']/N*100,
                 "Class Accuracy": metrics['class_acc']/N*100 }

# -------------------------------
# 🔹 Example Usage
# -------------------------------
queries_df = pd.read_csv("G:/Dylog_Internship_Assessments/data/dylog_search_queries_unspsc.csv")
queries_list = queries_df.to_dict('records')

cleaner = TextCleaner(remove_arbitrary_alphanum=True)
retriever = HybridRetriever(df, loaded_vectorstore, embeddings, cleaner=cleaner)
reranker = CrossEncoderReranker()
merger = ScoreMerger(alpha=0.4, beta=0.6)

evaluator = Evaluator(queries_list, retriever, reranker, merger, cleaner=cleaner)
results = evaluator.evaluate()
for k,v in results.items():
    print(f"✅ {k}: {v:.2f}%")


✅ BM25 index ready.
✅ Top-1 Accuracy: 0.00%
✅ Top-5 Accuracy: 0.00%
✅ Top-10 Accuracy: 0.00%
✅ Precision@5: 0.00%
✅ Recall@5: 0.00%
✅ Segment Accuracy: 0.00%
✅ Family Accuracy: 0.00%
✅ Class Accuracy: 0.00%
