In [1]:
import requests
import pandas as pd 
from urllib.parse import urlparse, urljoin, parse_qs
import json
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import math

In [2]:
# Make sure the NLTK resources requested by this notebook are available locally.
# Each call prints its status (see the [nltk_data] lines in the cell output).
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")
# English stop words as a set; read by tokenize_normalize_text and passed to
# search() in the test loop at the end of the notebook.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt_tab to /home/ensai/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ensai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ensai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ensai/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Data loading

In [3]:
def load_json(path):
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        path (str): Location of the JSON file on disk.

    Returns:
        dict or list: The top-level value stored in the file.
    """
    with open(path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

In [4]:
def load_products(path):
    """Load products from a JSONL file (one JSON document per line).

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of being handed to ``json.loads``,
    which would raise ``json.JSONDecodeError`` on them.

    Args:
        path (str): The file path to the JSONL file containing product data.

    Returns:
        list: A list of product dictionaries, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    products = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate empty lines between or after records.
                continue
            products.append(json.loads(line))
    return products

In [5]:
# Load one inverted index from a file located in the index directory.
def load_index(index_name, index_dir='input'):
    """Load an inverted index from ``<index_dir>/<index_name>_index.json``.

    The file is read as UTF-8, like the other loaders in this notebook.
    Entries whose value is a plain list of document IDs are converted to a
    mapping ``{doc_id: [1]}`` so that every entry has the same shape; the
    single-element list makes ``len(...)`` equal to 1 wherever the length of
    this value is used as a term frequency. Entries that are not lists are
    kept unchanged.

    Args:
        index_name (str): The name of the index (e.g., 'title', 'description').
        index_dir (str): Directory containing the index files. Defaults to
            'input', the location previously hard-coded.

    Returns:
        dict: The inverted index, keyed by the top-level keys of the file.
    """
    index_path = os.path.join(index_dir, f'{index_name}_index.json')
    with open(index_path, 'r', encoding='utf-8') as f:
        raw_index = json.load(f)

    index = {}
    for token, doc_list in raw_index.items():
        if isinstance(doc_list, list):
            index[token] = {doc_id: [1] for doc_id in doc_list}
        else:
            index[token] = doc_list
    return index

In [6]:
def load_all_indexes(index_dir):
    """Load every ``*_index.json`` inverted index found in a directory.

    Files are read from ``index_dir`` itself, so the directory used for
    discovery and the directory used for loading are always the same. File
    names are processed in sorted order so the resulting dictionary has a
    reproducible field order. The field name is the file name with the
    trailing ``_index.json`` removed.

    Entries whose value is a plain list of document IDs are converted to
    ``{doc_id: [1]}``; other entries are kept unchanged.

    Args:
        index_dir (str): The directory path containing the index JSON files.

    Returns:
        dict: A dictionary where keys are field names and values are inverted indexes.
    """
    suffix = "_index.json"
    indexes = {}
    for file_name in sorted(os.listdir(index_dir)):
        if not file_name.endswith(suffix):
            continue
        # Strip only the trailing suffix (str.replace would strip it anywhere).
        field = file_name[:-len(suffix)]
        with open(os.path.join(index_dir, file_name), 'r', encoding='utf-8') as f:
            raw_index = json.load(f)
        indexes[field] = {
            token: ({doc_id: [1] for doc_id in postings}
                    if isinstance(postings, list) else postings)
            for token, postings in raw_index.items()
        }
    return indexes

In [7]:
def load_synonyms(path):
    """Read the synonyms table stored in a JSON file.

    Args:
        path (str): Location of the synonyms JSON file.

    Returns:
        dict: The parsed file content; used elsewhere in this notebook as a
            mapping from a word to a collection of its synonyms.
    """
    synonym_table = load_json(path)
    return synonym_table

In [8]:
def build_documents_dict(documents_list):
    """Convert raw product JSON documents into a dictionary indexed by URL.

    Missing fields and fields explicitly set to ``null`` in the source data
    are both treated as empty (empty string, no features, no reviews).
    If several products share the same URL, the last one wins.

    Args:
        documents_list (list): A list of product dictionaries. Each must
            contain a "url" key.

    Returns:
        dict: A dictionary where keys are URLs and values are dicts with:
            "title", "description", "text" (title, description and
            "key value" pairs of the product features joined by spaces)
            and "reviews" (number of entries in "product_reviews").

    Raises:
        KeyError: If a product has no "url" key.
    """
    documents = {}

    for doc in documents_list:
        doc_id = doc["url"]

        # `or` also covers keys that are present but null in the JSON.
        title = doc.get("title") or ""
        description = doc.get("description") or ""
        product_features = doc.get("product_features") or {}
        product_reviews = doc.get("product_reviews") or []

        features_text = " ".join(
            f"{k} {v}" for k, v in product_features.items()
        )

        full_text = " ".join([title, description, features_text]).strip()

        documents[doc_id] = {
            "title": title,
            "description": description,
            "text": full_text,
            "reviews": len(product_reviews)
        }

    return documents

# Document filtering

In [9]:
# Document filter (OR semantics): keep documents that contain at least one query token.
def filter_token(query_tokens, indexes):
    """Collect every document that contains any of the query tokens.

    Args:
        query_tokens (list): Tokens of the processed query.
        indexes (dict): Field name -> inverted index.

    Returns:
        set: IDs of documents matched by at least one token in at least one index.
    """
    postings = (
        inverted[term]
        for term in query_tokens
        for inverted in indexes.values()
        if term in inverted
    )
    # Iterating a posting dict yields its document IDs.
    return set().union(*postings)

In [10]:
# Document filter (AND semantics): keep documents that contain every query token.
# This function does no stop-word handling itself; it uses the tokens exactly as given.
def filter_all_tokens(query_tokens, indexes):
    """Collect the documents in which every query token occurs.

    A token counts as present in a document if any of the indexes lists the
    document for that token.

    Args:
        query_tokens (list): Tokens of the processed query.
        indexes (dict): Field name -> inverted index.

    Returns:
        set: IDs of documents containing all tokens; empty if there are no
            tokens or if some token matches no document at all.
    """
    per_token_docs = []

    for term in query_tokens:
        hits = set()
        for inverted in indexes.values():
            if term in inverted:
                hits.update(inverted[term])

        # One unmatched token is enough to make the intersection empty.
        if not hits:
            return set()
        per_token_docs.append(hits)

    if not per_token_docs:
        return set()

    return set.intersection(*per_token_docs)

# Query processing

In [None]:
# NLP preprocessing applied to queries.
def tokenize_normalize_text(text):
    """Turn raw text into a list of normalized tokens.

    The text is lowercased and tokenized with NLTK; tokens that are not
    purely alphanumeric or that belong to the module-level ``stop_words``
    set are dropped, and the remaining ones are lemmatized with WordNet.

    Args:
        text (str): The input text to process.

    Returns:
        list: Normalized tokens, in their original order.
    """
    raw_tokens = nltk.word_tokenize(text.lower())
    lemmatizer = WordNetLemmatizer()
    normalized = []
    for word in raw_tokens:
        if not word.isalnum() or word in stop_words:
            continue
        normalized.append(lemmatizer.lemmatize(word))
    return normalized

In [None]:
# Synonym expansion for query tokens.
def expand_query_with_synonyms(tokens, synonyms):
    """Expand query tokens using a synonym dictionary, deterministically.

    The result keeps the original tokens first, in their original order,
    followed by the synonyms in the order they are found. Duplicates are
    removed. Returning a stable order matters because callers join the
    tokens into a string for exact-match scoring; an order taken from a
    ``set`` could change from one run to the next.

    Args:
        tokens (list): List of query tokens.
        synonyms (dict): Mapping from a token to an iterable of its synonyms.

    Returns:
        list: De-duplicated tokens followed by their synonyms.
    """
    # A dict is used as an insertion-ordered set.
    expanded = dict.fromkeys(tokens)
    for token in tokens:
        if token in synonyms:
            for synonym in synonyms[token]:
                expanded.setdefault(synonym)
    return list(expanded)

In [None]:
# Query processing pipeline.
def process_query(query, stopwords, synonyms):
    """Turn a raw query string into the final list of search tokens.

    Steps: normalize/tokenize the query, drop any token found in
    ``stopwords``, then add synonyms of the remaining tokens.

    Args:
        query (str): The search query string.
        stopwords (set): Stop words to remove after tokenization.
        synonyms (dict): Dictionary of synonyms.

    Returns:
        list: Processed query tokens.
    """
    kept = []
    for term in tokenize_normalize_text(query):
        if term not in stopwords:
            kept.append(term)
    return expand_query_with_synonyms(kept, synonyms)

# Ranking function

In [14]:
def compute_idf(token, inverted_index, total_docs):
    """Return the smoothed inverse document frequency of a token.

    Computed as ``log((total_docs + 1) / (df + 1))`` where ``df`` is the
    number of documents listed for the token (0 if the token is unknown).

    Args:
        token (str): The token to compute IDF for.
        inverted_index (dict): The inverted index.
        total_docs (int): Total number of documents.

    Returns:
        float: The IDF score.
    """
    postings = inverted_index.get(token, {})
    doc_frequency = len(postings)
    smoothed_ratio = (total_docs + 1) / (doc_frequency + 1)
    return math.log(smoothed_ratio)

In [None]:
# BM25 scoring of one document against the query tokens.
def bm25_score(doc_id, 
               tokens, 
               inverted_index, 
               doc_lengths, 
               avg_doc_length, 
               k1, 
               b):
    """Compute the BM25 score of a document for a list of query tokens.

    Tokens absent from the index are ignored. For the others, the term
    frequency is the number of entries stored for the document under that
    token (0 if the document is not listed), and the corpus size used for
    the IDF is the number of entries in ``doc_lengths``.

    Args:
        doc_id (str): The document ID.
        tokens (list): List of query tokens.
        inverted_index (dict): The inverted index.
        doc_lengths (dict): Dictionary of document lengths.
        avg_doc_length (float): Average document length.
        k1 (float): BM25 parameter k1.
        b (float): BM25 parameter b.

    Returns:
        float: The BM25 score.
    """
    total = 0.0
    for term in tokens:
        if term in inverted_index:
            term_freq = len(inverted_index[term].get(doc_id, []))
            term_idf = compute_idf(term, inverted_index, len(doc_lengths))
            length_norm = 1 - b + b * doc_lengths[doc_id] / avg_doc_length
            # The small epsilon keeps the division defined when the denominator is 0.
            saturation = (term_freq * (k1 + 1)) / (term_freq + k1 * length_norm + 1e-9)
            total += term_idf * saturation
    return total

In [16]:
def exact_match_score(doc_text, query):
    """Tell whether the query occurs verbatim in the document text.

    The comparison is a case-insensitive substring test.

    Args:
        doc_text (str): The document text.
        query (str): The query string.

    Returns:
        float: 1.0 if the query is a substring of the text, 0.0 otherwise.
    """
    needle = query.lower()
    haystack = doc_text.lower()
    return 1.0 if needle in haystack else 0.0

In [17]:
def compute_linear_score(features, weights):
    """Compute the weighted sum of the feature values.

    Features that have no entry in ``weights`` contribute nothing
    (their weight is taken as 0.0).

    Args:
        features (dict): Feature name -> value.
        weights (dict): Feature name -> weight.

    Returns:
        float: The combined score.
    """
    total = 0
    for name, value in features.items():
        total += value * weights.get(name, 0.0)
    return total

# Search pipeline

In [18]:
def rank_documents(doc_ids,
                   query_tokens,
                   indexes,
                   metadata,
                   weights,
                   k1=1.5,
                   b=0.75):
    """Rank filtered documents using BM25 and other features.

    For every document, one BM25 feature is computed per index field, plus:
      * "exact_match": whether the space-joined query tokens occur in the
        document text;
      * "reviews": the review count stored with the document. Because it is
        assigned after the per-field features, it replaces the BM25 value of
        any index field that is itself named "reviews".
    The features are combined with ``compute_linear_score``.

    Documents with equal scores are ordered by document ID so that the
    ranking does not depend on the iteration order of ``doc_ids``.

    Args:
        doc_ids (set): Set of document IDs to rank.
        query_tokens (list): List of query tokens.
        indexes (dict): Dictionary of inverted indexes.
        metadata (dict): Metadata including doc_lengths, avg_doc_length, documents.
        weights (dict): Weights for different features.
        k1 (float): BM25 term-frequency saturation parameter. Defaults to 1.5.
        b (float): BM25 length-normalization parameter. Defaults to 0.75.

    Returns:
        list: List of (doc_id, score) tuples, sorted by score descending,
            ties broken by ascending doc_id.
    """
    if not doc_ids:
        return []

    doc_lengths = metadata["doc_lengths"]
    avg_doc_length = metadata["avg_doc_length"]
    documents = metadata["documents"]
    # The joined query is the same for every document; build it once.
    joined_query = " ".join(query_tokens)

    ranked = []

    for doc_id in doc_ids:
        features = {}

        for field, index in indexes.items():
            features[field] = bm25_score(
                doc_id,
                query_tokens,
                index,
                doc_lengths,
                avg_doc_length,
                k1,
                b
            )

        doc = documents[doc_id]
        features["exact_match"] = exact_match_score(doc["text"], joined_query)
        features["reviews"] = doc.get("reviews", 0)

        score = compute_linear_score(features, weights)
        ranked.append((doc_id, score))

    # Highest score first; doc_id as a deterministic tie-breaker.
    return sorted(ranked, key=lambda pair: (-pair[1], pair[0]))

In [19]:
def format_results(ranked_docs,
                   documents,
                   total_docs,
                   filtered_docs,
                   limit,
                   query) -> dict:
    """Build the JSON-serializable payload describing the search results.

    Only the first ``limit`` ranked documents are included.

    Args:
        ranked_docs (list): List of (doc_id, score) tuples, best first.
        documents (dict): Dictionary of documents, keyed by doc_id.
        total_docs (int): Total number of documents.
        filtered_docs (int): Number of documents that passed the filter.
        limit (int): Maximum number of results to return.
        query (str): The original search query.

    Returns:
        dict: The query, a "metadata" section with both counts, and the
            "results" list (title, url, description, score per document).
    """
    top_hits = ranked_docs[:limit]
    results = [
        {
            "title": documents[doc_id]["title"],
            "url": doc_id,
            "description": documents[doc_id]["description"],
            "score": score,
        }
        for doc_id, score in top_hits
    ]
    counts = {
        "total_documents": total_docs,
        "filtered_documents": filtered_docs,
    }
    return {"query": query, "metadata": counts, "results": results}

In [20]:
def search(query,
           indexes,
           documents,
           stopwords,
           synonyms,
           metadata,
           weights,
           limit=156) -> dict:
    """Execute a full search pipeline.

    Steps:
      1. Process the query (tokenize, drop stop words, add synonyms).
      2. Keep the documents that contain every query token; if there are
         none, fall back to documents containing at least one token.
      3. Rank the remaining documents and format the best ``limit`` of them.

    Args:
        query (str): The search query.
        indexes (dict): Dictionary of inverted indexes.
        documents (dict): Dictionary of documents.
        stopwords (set): Set of stopwords.
        synonyms (dict): Dictionary of synonyms.
        metadata (dict): Metadata for ranking.
        weights (dict): Weights for scoring.
        limit (int): Maximum number of results to return. Defaults to 156,
            the value previously hard-coded here.

    Returns:
        dict: Search results.
    """
    query_tokens = process_query(query, stopwords, synonyms)

    filtered_docs = filter_all_tokens(query_tokens, indexes)

    if not filtered_docs:
        # Strict (AND) filter matched nothing: relax to OR semantics.
        filtered_docs = filter_token(query_tokens, indexes)

    ranked_docs = rank_documents(
        filtered_docs,
        query_tokens,
        indexes,
        metadata,
        weights
    )

    return format_results(
        ranked_docs,
        documents,
        total_docs=len(documents),
        filtered_docs=len(filtered_docs),
        limit=limit,
        query=query
    )

# Metadata

In [21]:
def compute_doc_lengths(documents):
    """Count the whitespace-separated words of every document's text.

    Args:
        documents (dict): doc_id -> document dict with a "text" entry.

    Returns:
        dict: doc_id -> number of words in the document's "text".
    """
    lengths = {}
    for doc_id, doc in documents.items():
        words = doc["text"].split()
        lengths[doc_id] = len(words)
    return lengths

def compute_avg_doc_length(doc_lengths):
    """Compute the average document length.

    Args:
        doc_lengths (dict): Dictionary of document lengths.

    Returns:
        float: The average document length, or 0.0 when ``doc_lengths`` is
            empty (instead of raising ``ZeroDivisionError``).
    """
    if not doc_lengths:
        # No documents: define the average as 0.0 rather than dividing by zero.
        return 0.0
    return sum(doc_lengths.values()) / len(doc_lengths)

# Test

In [None]:
# Set of test queries; each one is run through search() by the loop at the end of the notebook.
TEST_QUERIES = [
    "chocolate candy",
    "leather sneakers",
    "italy",
    "brazil",
    "timelessfootwear",
    "premium chocolate",
    "comfortable shoes",
    "light up sneaker",
]

In [None]:
# Load the inverted indexes, the product catalogue and the synonym table.
indexes = load_all_indexes('input')
# Keep the raw product list under its own name instead of reusing
# `documents` for two different kinds of object (list, then dict).
products = load_products('rearranged_products.jsonl')
documents = build_documents_dict(products)
synonyms = load_synonyms(os.path.join('input', 'origin_synonyms.json'))

# Per-document word counts, needed for BM25 length normalization.
doc_lengths = compute_doc_lengths(documents)

# Everything rank_documents() reads besides the indexes and the weights.
metadata = {
    "doc_lengths": doc_lengths,
    "avg_doc_length": compute_avg_doc_length(doc_lengths),
    "documents": documents
}

# Weights of the linear scoring model, keyed by feature name.
# rank_documents() produces one BM25 feature per index field found in
# 'input', plus "exact_match" and "reviews"; a weight whose key matches no
# feature is ignored.
weights = {
    "title": 2.0,           # BM25 on the title index: weighted highest among index fields
    "description": 1.0,     # BM25 on the description index: baseline weight
    "brand": 1.5,           # BM25 on the brand index: slightly above baseline
    "origin": 0.5,          # BM25 on the origin index: below baseline
    "reviews": 0.1,         # Applied to the number of reviews of the product (a count, not a text match)
    "bm25": 1.0,            # No feature named "bm25" is produced by rank_documents(); has no effect unless an index field has that name
    "exact_match": 1.5      # Applied to the 0/1 exact-match feature
}


In [24]:
# Run every test query, show its top 3 hits and save the full result as JSON.
# Create the output directory up front so the first run on a fresh checkout
# does not fail with FileNotFoundError when opening the result file.
os.makedirs("output", exist_ok=True)

for i, q in enumerate(TEST_QUERIES, 1):
    print("=" * 50)
    print("QUERY:", q)
    results = search(
        query=q,
        indexes=indexes,
        documents=documents,
        stopwords=stop_words,
        synonyms=synonyms,
        metadata=metadata,
        weights=weights
    )
    for r in results["results"][:3]:
        print("-", r["title"], "| score:", r["score"])

    # One file per query, numbered by the query's position in TEST_QUERIES.
    file_name = f"results_query_{i}.json"
    file_path = os.path.join("output", file_name)

    # Remove a result left by a previous run and report it.
    if os.path.exists(file_path):
        os.remove(file_path)
        print(f"Fichier existant '{file_name}' supprimé.")

    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"Résultats sauvegardés dans {file_path}")

QUERY: chocolate candy
- Box of Chocolate Candy | score: 15.754654065901747
- Box of Chocolate Candy | score: 15.632826115582338
- Box of Chocolate Candy - Cherry large | score: 15.572738682030465
Fichier existant 'results_query_1.json' supprimé.
Résultats sauvegardés dans output/results_query_1.json
QUERY: leather sneakers
- Classic Leather Sneakers - White40 | score: 9.575675352223332
- Classic Leather Sneakers - White40 | score: 9.575675352223332
- Classic Leather Sneakers - White41 | score: 9.575675352223332
Fichier existant 'results_query_2.json' supprimé.
Résultats sauvegardés dans output/results_query_2.json
QUERY: light up sneakers
Fichier existant 'results_query_3.json' supprimé.
Résultats sauvegardés dans output/results_query_3.json
QUERY: italy
- web-scraping.dev product page 3 | score: 3.1803649180125158
- web-scraping.dev product page 2 | score: 3.1803649180125158
- Box of Chocolate Candy - Cherry medium | score: 2.9711958428247756
Fichier existant 'results_query_4.json' s