In [6]:
import json
import re
from typing import List, Dict, Any
import numpy as np
from rank_bm25 import BM25Okapi

# -----------------------------
# Preprocessing helpers
# -----------------------------
# Matches maximal runs of ASCII letters and digits. Every other character
# (whitespace, punctuation, apostrophes, non-ASCII letters) acts as a separator.
TOKEN_RE = re.compile(r"[a-zA-Z0-9]+")

# English stopwords removed by tokenize(). The list is alphabetical; "is" was
# missing between "into" and "isn't" and has been restored.
# NOTE(review): TOKEN_RE splits on apostrophes, so the contraction entries
# ("don't", "it's", ...) can never equal a token produced by tokenize(); their
# fragments ("don", "t", "s", ...) are currently not filtered. Confirm whether
# the fragments should be added -- single letters may be meaningful in the data.
STOPWORDS = set(
    "a about above after again against all am an and any are aren't as at be because been before "
    "being below between both but by can't cannot could couldn't did didn't do does doesn't doing "
    "don't down during each few for from further had hadn't has hasn't have haven't having he he'd "
    "he'll he's her here here's hers herself him himself his how how's i i'd i'll i'm i've if in into "
    "is isn't it it's its itself let's me more most mustn't my myself no nor not of off on once only or "
    "other ought our ours ourselves out over own same shan't she she'd she'll she's should shouldn't "
    "so some such than that that's the their theirs them themselves then there there's these they they'd "
    "they'll they're they've this those through to too under until up very was wasn't we we'd we'll we're "
    "we've were weren't what what's when when's where where's which while who who's whom why why's with "
    "won't would wouldn't you you'd you'll you're you've your yours yourself yourselves".split()
)

def normalize_text(text: str) -> str:
    """Return ``text`` lower-cased and reduced to space-separated alphanumeric runs.

    Underscores, hyphens, slashes and backslashes are first turned into
    spaces; the runs matched by ``TOKEN_RE`` are then joined with single
    spaces. Falsy input (``None``, ``""``) yields an empty string.
    """
    if text:
        lowered = text.lower()
        # Treat common in-word separators as word boundaries.
        spaced = re.sub(r"[_\-/\\]", " ", lowered)
        return " ".join(TOKEN_RE.findall(spaced))
    return ""

def tokenize(text: str, remove_stopwords: bool = True) -> List[str]:
    """Split ``text`` into normalized tokens.

    The text is passed through ``normalize_text`` and split on whitespace.
    When ``remove_stopwords`` is true (the default), tokens found in
    ``STOPWORDS`` are dropped; otherwise every token is returned.
    """
    words = normalize_text(text).split()
    if not remove_stopwords:
        return words
    return [word for word in words if word not in STOPWORDS]

# -----------------------------
# Parser
# -----------------------------
class Parser:
    """Load a JSON file of records and convert each record into a search document.

    ``filepath`` is the path of the JSON file; ``documents`` holds the decoded
    JSON once ``load()`` has run (an empty list before that).
    """

    def __init__(self, filepath: str):
        self.filepath = filepath
        self.documents: List[Dict[str, Any]] = []

    def load(self) -> List[Dict[str, Any]]:
        """Decode the UTF-8 JSON file at ``self.filepath``, store it and return it.

        Raises whatever ``open`` / ``json.load`` raise for a missing or
        malformed file.
        """
        with open(self.filepath, "r", encoding="utf-8") as f:
            self.documents = json.load(f)
        return self.documents

    @staticmethod
    def _pseudo_title(item: Dict[str, Any]) -> str:
        """Build a display title: "Brand Type" if available, else the first value."""
        # A JSON null must not be rendered as the literal text "None".
        brand = item.get("Brand")
        ptype = item.get("Type")
        brand = "" if brand is None else brand
        ptype = "" if ptype is None else ptype
        title = f"{brand} {ptype}".strip()
        if title:
            return title
        # Fall back to the first field value. An empty record has none, and the
        # first value may be a non-string (e.g. a numeric id), so coerce it.
        first_value = next(iter(item.values()), "")
        return "" if first_value is None else str(first_value)

    def parse(self) -> List[Dict[str, Any]]:
        """Convert every loaded record into a document dict.

        Calls ``load()`` first when ``self.documents`` is empty. Each returned
        dict has the keys:

        - ``id``: the record's "id" as a string, or its position if absent
        - ``text``: normalized "key value" text of every field except "id"
        - ``tokens``: tokens of that same text (stopwords removed)
        - ``metadata``: the original record object itself (not a copy)
        - ``title``: "Brand Type" if present, otherwise the first field value
        """
        if not self.documents:
            self.load()
        parsed_docs = []
        for i, item in enumerate(self.documents):
            # Field names are indexed alongside values so they are searchable too.
            combined_text = " ".join(
                f"{k} {v}" for k, v in item.items() if k != "id"
            ).strip()
            parsed_docs.append({
                "id": str(item.get("id", i)),
                "text": normalize_text(combined_text),
                "tokens": tokenize(combined_text),
                "metadata": item,
                "title": self._pseudo_title(item)
            })
        return parsed_docs

# -----------------------------
# PSO Reranker
# -----------------------------
class PSOReranker:
    """Find a 3-element feature-weight vector with particle swarm optimisation.

    Each candidate document is described by three features (see
    ``compute_features``). ``optimize`` searches for the weight vector that
    maximises ``fitness``.

    ``docs`` and ``top_k`` are stored on the instance but are not read by any
    method of this class.

    NOTE(review): ``fitness`` is linear in the weights and particle positions
    are never bounded, so larger weights along the mean feature vector always
    score higher; the objective does not measure ranking quality. Confirm that
    this is the intended objective.
    """

    def __init__(self, docs: List[Dict[str, Any]], top_k: int = 15):
        self.docs = docs
        self.top_k = top_k

    def compute_features(self, doc, query_tokens):
        """Return ``[bm25_score, attr_match, length_factor]`` for one document.

        - ``bm25_score``: ``doc["bm25_score"]`` (0 when absent)
        - ``attr_match``: number of distinct query tokens present in the document
        - ``length_factor``: ``1 / log(1 + token_count)``
        """
        bm25_score = doc.get("bm25_score", 0)
        doc_tokens = doc["tokens"]
        attr_match = len(set(query_tokens) & set(doc_tokens))
        # log(1 + 0) == 0 would make the factor infinite for a document with
        # no tokens, so an empty document is treated as having length 1.
        length_factor = 1 / np.log(1 + max(len(doc_tokens), 1))
        return np.array([bm25_score, attr_match, length_factor])

    def _mean_features(self, candidates, query_tokens):
        """Return the element-wise mean feature vector over ``candidates``."""
        return np.mean(
            [self.compute_features(doc, query_tokens) for doc in candidates],
            axis=0,
        )

    def fitness(self, weights, candidates, query_tokens):
        """Return the mean weighted feature score of ``candidates``.

        Returns 0.0 for an empty candidate list instead of dividing by zero.
        """
        if not candidates:
            return 0.0
        # mean_i(w . f_i) == w . mean_i(f_i)
        return np.dot(weights, self._mean_features(candidates, query_tokens))

    def optimize(self, candidates, query, num_particles=20, max_iter=50, seed=None):
        """Run PSO and return ``(best_weights, history)``.

        Args:
            candidates: document dicts with "tokens" and optionally "bm25_score".
            query: raw query string; it is tokenized here.
            num_particles: swarm size.
            max_iter: number of iterations.
            seed: optional integer seed for a private random generator. When
                omitted, NumPy's global ``np.random`` state is used, so
                ``np.random.seed(...)`` set by the caller is still honoured.

        Returns:
            ``best_weights`` is the best position found (array of length 3) and
            ``history`` is the best fitness after each iteration (``max_iter``
            floats, non-decreasing).
        """
        dim = 3
        query_tokens = tokenize(query)
        rng = np.random if seed is None else np.random.RandomState(seed)

        # The candidates' features do not depend on the particle, so compute
        # their mean once instead of once per particle per iteration.
        if candidates:
            mean_features = self._mean_features(candidates, query_tokens)
        else:
            mean_features = np.zeros(dim)

        pos = rng.uniform(0.1, 1.0, (num_particles, dim))
        vel = rng.uniform(-0.1, 0.1, (num_particles, dim))
        pbest = pos.copy()
        pbest_scores = pos @ mean_features
        gbest_idx = int(np.argmax(pbest_scores))
        gbest = pbest[gbest_idx].copy()
        gbest_score = float(pbest_scores[gbest_idx])

        w = 0.5            # inertia weight
        c1, c2 = 1.0, 2.0  # pull towards personal best / global best
        history = []

        for _ in range(max_iter):
            # Update each particle's personal best.
            scores = pos @ mean_features
            improved = scores > pbest_scores
            pbest[improved] = pos[improved]
            pbest_scores[improved] = scores[improved]

            # Update the swarm's global best.
            gbest_idx = int(np.argmax(pbest_scores))
            if pbest_scores[gbest_idx] > gbest_score:
                gbest = pbest[gbest_idx].copy()
                gbest_score = float(pbest_scores[gbest_idx])

            # Independent random factors per particle and per dimension keep
            # the swarm from moving in lock-step.
            r1 = rng.rand(num_particles, dim)
            r2 = rng.rand(num_particles, dim)
            vel = w * vel + c1 * r1 * (pbest - pos) + c2 * r2 * (gbest - pos)
            pos = pos + vel
            history.append(gbest_score)

        return gbest, history

# -----------------------------
# Unified Search Engine
# -----------------------------
class SearchEngine:
    """BM25 retrieval over parsed documents followed by PSO-weighted reranking.

    ``docs`` are document dicts carrying at least "tokens" and "metadata".
    A ``BM25Okapi`` index is built once over the documents' tokens.
    """

    def __init__(self, docs: List[Dict[str, Any]]):
        self.docs = docs
        self.tokenized_corpus = [doc["tokens"] for doc in docs]
        self.bm25_engine = BM25Okapi(self.tokenized_corpus)

    def search(self, query: str, top_k: int = 15):
        """Return ``(results, best_weights, history)`` for ``query``.

        The ``2 * top_k`` documents with the highest BM25 scores are
        shortlisted, rescored with the weights found by ``PSOReranker`` and the
        best ``top_k`` are returned.

        Each result is a copy of the document's original metadata with "rank"
        (1-based) and "score" (float) added.
        NOTE(review): a metadata field already named "rank" or "score" is
        overwritten in the result.
        NOTE(review): the shortlist is cut by position only, so documents that
        do not match the query can still be returned when few documents match.

        Raises:
            ValueError: if ``top_k`` is smaller than 1.
        """
        if top_k < 1:
            raise ValueError("top_k must be a positive integer")

        query_tokens = tokenize(query)
        scores = self.bm25_engine.get_scores(query_tokens)

        # Rank (doc, score) pairs first and copy only the shortlisted documents;
        # the stable sort keeps corpus order among equal scores.
        scored = list(zip(self.docs, scores))
        scored.sort(key=lambda pair: pair[1], reverse=True)
        candidates = []
        for doc, score in scored[:2 * top_k]:
            doc_copy = doc.copy()  # shallow copy so the corpus entry is not mutated
            doc_copy["bm25_score"] = score
            candidates.append(doc_copy)

        # PSO reranking
        pso = PSOReranker(candidates, top_k=top_k)
        best_weights, history = pso.optimize(candidates, query)
        for doc in candidates:
            features = pso.compute_features(doc, query_tokens)
            doc["final_score"] = np.dot(best_weights, features)

        # Reranked top_k results with original JSON + rank + score
        reranked = sorted(candidates, key=lambda d: d["final_score"], reverse=True)[:top_k]
        results = []
        for rank, doc in enumerate(reranked, 1):
            result = doc["metadata"].copy()  # original JSON fields
            result["rank"] = rank
            result["score"] = float(doc["final_score"])
            results.append(result)

        return results, best_weights, history




In [5]:
# -----------------------------
# Example Usage
# -----------------------------
if __name__ == "__main__":
    TOP_K = 15  # number of results requested and reported
    SEED = 42

    # The PSO step draws from NumPy's global random state; seeding it makes
    # the printed scores and weights reproducible across runs.
    np.random.seed(SEED)

    parser = Parser("data-set.json")
    docs = parser.parse()

    engine = SearchEngine(docs)
    query = "prakash"
    results, weights, history = engine.search(query, top_k=TOP_K)

    print(f"Top {TOP_K} search results for query: \"{query}\"\n")
    for r in results:
        print(f"Rank {r['rank']} | ID={r.get('id','N/A')} | Score={r['score']:.4f}")
        # Print key details neatly
        for k, v in r.items():
            if k not in ["rank", "score"]:  # exclude added fields
                print(f"   {k}: {v}")
        print("-" * 80)

    print("Optimized PSO weights:", weights)


Top 15 search results for query: "prakash"

Rank 1 | ID=2019 | Score=49.8134
   Imprint: Prakash Books
   id: 2019
--------------------------------------------------------------------------------
Rank 2 | ID=2039 | Score=49.8134
   Imprint: Prakash Books
   id: 2039
--------------------------------------------------------------------------------
Rank 3 | ID=2085 | Score=49.8134
   Imprint: Prakash Books
   id: 2085
--------------------------------------------------------------------------------
Rank 4 | ID=2128 | Score=49.8134
   Imprint: Prakash Books
   id: 2128
--------------------------------------------------------------------------------
Rank 5 | ID=2161 | Score=49.8134
   Imprint: Prakash Books
   id: 2161
--------------------------------------------------------------------------------
Rank 6 | ID=2201 | Score=49.8134
   Imprint: Prakash Books
   id: 2201
--------------------------------------------------------------------------------
Rank 7 | ID=2240 | Score=49.8134
   Imprint: