In [88]:
import requests
import pandas as pd 
from urllib.parse import urlparse, urljoin, parse_qs
import json
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import math

In [89]:
# Download the NLTK resources requested by this notebook (no-op when they
# are already up to date), then build the English stop-word set that
# tokenize_normalize_text() uses to drop uninformative tokens.
for _resource in ("punkt_tab", "stopwords", "wordnet", "omw-1.4"):
    nltk.download(_resource)
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt_tab to /home/ensai/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ensai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ensai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ensai/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Data loading

In [90]:
def load_json(path):
    """
    Parse the UTF-8 encoded JSON file located at `path`.

    Returns whatever Python object the JSON document decodes to.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

In [91]:
def load_products(path):
    """
    Load products from a JSON Lines file (one JSON document per line).

    Blank or whitespace-only lines (for instance a trailing empty line at the
    end of the file) are skipped instead of making the whole load fail.

    Parameters:
        path: location of the UTF-8 encoded .jsonl file.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

In [92]:
# Load an index from a "<name>_index.json" file (by default in the input folder)
def load_index(index_name, index_dir='input'):
    """
    Load one inverted index and normalize it to a dict-of-dicts shape.

    Parameters:
        index_name: field name; the file read is "<index_name>_index.json".
        index_dir: directory containing the index files. Defaults to 'input',
            the location that was previously hard-coded.

    Returns:
        A dict mapping each token to its postings. When the file stores a
        plain list of document ids for a token, it is converted to
        {doc_id: [1]} so every entry can be handled like a mapping whose
        value has a length; any other value is kept unchanged.
    """
    index_path = os.path.join(index_dir, f'{index_name}_index.json')
    # Explicit UTF-8, consistent with the other loaders in this notebook.
    with open(index_path, 'r', encoding='utf-8') as f:
        raw_index = json.load(f)

    index = {}
    for token, postings in raw_index.items():
        if isinstance(postings, list):
            # List of doc ids -> one placeholder entry per document.
            index[token] = {doc_id: [1] for doc_id in postings}
        else:
            index[token] = postings
    return index

In [93]:
def _load_index_file(index_path):
    """Read one index file; turn list postings into {doc_id: [1]} mappings."""
    with open(index_path, 'r', encoding='utf-8') as f:
        raw_index = json.load(f)
    return {
        token: ({doc_id: [1] for doc_id in postings}
                if isinstance(postings, list) else postings)
        for token, postings in raw_index.items()
    }


def load_all_indexes(index_dir):
    """
    Load every "<field>_index.json" file found in `index_dir`.

    The files are read from `index_dir` itself, so the function works for
    any directory and not only for the default input folder. File names are
    processed in sorted order so the resulting dict has the same field
    order on every machine (os.listdir gives no ordering guarantee).

    Parameters:
        index_dir: directory to scan for index files.

    Returns:
        A dict mapping each field name (the file name without the
        "_index.json" suffix) to its normalized index. Files without that
        suffix are ignored.
    """
    suffix = "_index.json"
    indexes = {}
    for file_name in sorted(os.listdir(index_dir)):
        if file_name.endswith(suffix):
            # Strip only the trailing suffix, not every occurrence of it.
            field = file_name[:-len(suffix)]
            indexes[field] = _load_index_file(os.path.join(index_dir, file_name))
    return indexes

In [94]:
def load_synonyms(path):
    """
    Read the synonym table stored as JSON at `path`.

    The file is parsed with load_json() and its content is returned as is.
    """
    synonym_table = load_json(path)
    return synonym_table

In [95]:
def build_documents_dict(documents_list):
    """
    Index raw product records by their URL.

    For each product the result stores its title, its description, a
    searchable "text" (title, description and "name value" feature pairs
    joined with spaces, then stripped) and the number of reviews.
    Missing optional fields fall back to empty values; "url" is required.
    """
    by_url = {}

    for product in documents_list:
        url = product["url"]
        title = product.get("title", "")
        description = product.get("description", "")

        # Flatten the feature mapping into "name value name value ..." text.
        feature_items = product.get("product_features", {}).items()
        features_text = " ".join(f"{name} {value}" for name, value in feature_items)

        by_url[url] = {
            "title": title,
            "description": description,
            "text": " ".join((title, description, features_text)).strip(),
            "reviews": len(product.get("product_reviews", [])),
        }

    return by_url

# Document filtering

In [96]:
# Filter documents: keep those in which at least one query token appears in any index
def filter_token(query_tokens, indexes):
    """
    Return the set of document ids that contain ANY of the query tokens.

    Every index in `indexes` is consulted for every token; the ids found
    under a token are merged into a single set.
    """
    hits = set()

    for token in query_tokens:
        hits.update(
            doc_id
            for index in indexes.values()
            if token in index
            for doc_id in index[token]
        )

    return hits

In [97]:
# Filter documents: keep those that contain every query token (in any index).
# Returns the set of relevant documents.
def filter_all_tokens(query_tokens, indexes):
    """
    Return the set of document ids that contain ALL of the query tokens.

    A token counts as present in a document when any index lists the
    document under that token. The result is empty when there are no
    tokens or when some token matches no document at all. No stop-word
    handling happens here; the tokens are used exactly as given.
    """
    per_token_docs = []

    for token in query_tokens:
        found = set()
        for index in indexes.values():
            if token in index:
                found.update(index[token])

        # One unmatched token makes the intersection empty: stop early.
        if not found:
            return set()
        per_token_docs.append(found)

    if not per_token_docs:
        return set()

    return set.intersection(*per_token_docs)


# Query processing

In [98]:
def tokenize_normalize_text(text):
    """
    Turn raw text into a list of normalized tokens.

    The text is lower-cased and tokenized with NLTK; tokens that are not
    purely alphanumeric or that belong to the module-level `stop_words`
    set are dropped, and the remaining ones are lemmatized with WordNet.
    """
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word)
        for word in nltk.word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    ]

In [99]:
def expand_query_with_synonyms(tokens, synonyms):
    """
    Expand query tokens using a synonym dictionary.

    The result is de-duplicated and has a deterministic order: the original
    tokens first (in query order), then the synonyms of each token in the
    order they are listed. A stable order matters because callers join the
    tokens into a phrase, and a set-based order would change between runs.

    Parameters:
        tokens: iterable of normalized query tokens.
        synonyms: mapping token -> iterable of synonyms. A bare string value
            is treated as a single synonym rather than as its characters.

    Returns:
        A list containing every token and every synonym exactly once.
    """
    tokens = list(tokens)
    # dict keys act as an insertion-ordered set.
    expanded = dict.fromkeys(tokens)
    for token in tokens:
        if token in synonyms:
            extra = synonyms[token]
            if isinstance(extra, str):
                extra = (extra,)
            for synonym in extra:
                expanded.setdefault(synonym, None)
    return list(expanded)

In [100]:
def process_query(query, stopwords, synonyms):
    """
    Turn a raw query string into the list of tokens used for retrieval.

    Steps: tokenize and normalize the query, drop the tokens found in
    `stopwords`, then add the synonyms of the remaining tokens.
    """
    normalized = tokenize_normalize_text(query)
    kept = [tok for tok in normalized if tok not in stopwords]
    return expand_query_with_synonyms(kept, synonyms)

# Ranking function

In [101]:
def compute_idf(token, inverted_index, total_docs):
    """
    Smoothed inverse document frequency of `token`.

    The document frequency is the number of entries stored under the token
    in `inverted_index` (0 when the token is absent); the value returned is
    log((total_docs + 1) / (document_frequency + 1)).
    """
    postings = inverted_index.get(token, {})
    document_frequency = len(postings)
    smoothed_ratio = (total_docs + 1) / (document_frequency + 1)
    return math.log(smoothed_ratio)

In [102]:
def bm25_score(doc_id, tokens, inverted_index, doc_lengths, avg_doc_length, k1, b):
    """
    Compute the BM25 score of one document for a list of query tokens.

    Parameters:
        doc_id: identifier of the document being scored.
        tokens: query tokens.
        inverted_index: mapping token -> {doc_id: entries}; the term
            frequency is the number of entries stored for the document.
        doc_lengths: mapping doc_id -> document length; its size is used as
            the total number of documents for the IDF.
        avg_doc_length: average document length of the collection.
        k1: term-frequency saturation parameter.
        b: length-normalization parameter.

    Returns:
        The sum over tokens of idf * tf * (k1 + 1) / (tf + norm), as a float.

    Edge cases: a document missing from `doc_lengths` is treated as having
    the average length, and when `avg_doc_length` is 0 no length
    normalization is applied, so neither case raises.
    """
    total_docs = len(doc_lengths)

    # The length-normalization term does not depend on the token: compute once.
    if avg_doc_length:
        doc_length = doc_lengths.get(doc_id, avg_doc_length)
        length_norm = k1 * (1 - b + b * doc_length / avg_doc_length)
    else:
        length_norm = k1

    score = 0.0
    for token in tokens:
        if token not in inverted_index:
            continue
        tf = len(inverted_index[token].get(doc_id, []))
        if tf == 0:
            # A token absent from the document contributes exactly 0.
            continue
        idf = compute_idf(token, inverted_index, total_docs)
        denom = tf + length_norm
        # Small epsilon guards against a zero denominator.
        score += idf * ((tf * (k1 + 1)) / (denom + 1e-9))
    return score

In [103]:
def exact_match_score(doc_text, query):
    """
    Return 1.0 when `query` occurs verbatim inside `doc_text`, else 0.0.

    The comparison is a case-insensitive substring test.
    """
    needle = query.lower()
    haystack = doc_text.lower()
    return 1.0 if needle in haystack else 0.0

In [104]:

def compute_linear_score(features, weights):
    """
    Weighted sum of the feature values.

    Each feature value is multiplied by the weight registered under the
    same name; features without a weight count with weight 0.0.
    """
    return sum(
        value * weights.get(name, 0.0)
        for name, value in features.items()
    )

# Search pipeline

In [105]:
def rank_documents(doc_ids,
                   query_tokens,
                   indexes,
                   metadata,
                   weights,
                   k1=1.5,
                   b=0.75):
    """
    Score and rank the filtered documents.

    For each document a feature dict is built: one BM25 score per index
    field, an "exact_match" flag (the query tokens joined by spaces, looked
    up in the document text) and the "reviews" count. The features are
    combined with `weights` into a single linear score.

    Parameters:
        doc_ids: iterable of candidate document ids.
        query_tokens: processed query tokens.
        indexes: mapping field name -> inverted index.
        metadata: dict with the keys "doc_lengths", "avg_doc_length" and
            "documents".
        weights: mapping feature name -> weight.
        k1, b: BM25 parameters (defaults match the values used previously).

    Returns:
        A list of (doc_id, score) tuples sorted by decreasing score. Ties
        are broken by doc_id so the order is reproducible even though the
        candidates usually arrive as an unordered set.
    """
    documents = metadata["documents"]
    doc_lengths = metadata["doc_lengths"]
    avg_doc_length = metadata["avg_doc_length"]
    # The phrase is the same for every document: build it once.
    query_phrase = " ".join(query_tokens)

    ranked = []
    for doc_id in doc_ids:
        features = {
            field: bm25_score(
                doc_id,
                query_tokens,
                index,
                doc_lengths,
                avg_doc_length,
                k1,
                b
            )
            for field, index in indexes.items()
        }

        document = documents[doc_id]
        features["exact_match"] = exact_match_score(document["text"], query_phrase)
        # Set after the BM25 features, so it replaces a same-named index score.
        features["reviews"] = document.get("reviews", 0)

        ranked.append((doc_id, compute_linear_score(features, weights)))

    # Highest score first; doc_id as a deterministic tie-breaker.
    return sorted(ranked, key=lambda item: (-item[1], item[0]))

In [106]:
def format_results(ranked_docs,
                   documents,
                   total_docs,
                   filtered_docs,
                   limit) -> dict:
    """
    Build the JSON-serializable search response.

    The first `limit` (doc_id, score) pairs of `ranked_docs` are turned
    into result entries with the title and description looked up in
    `documents`; the two counters are reported under "metadata".
    """
    top_hits = ranked_docs[:limit]

    results = [
        {
            "title": documents[doc_id]["title"],
            "url": doc_id,
            "description": documents[doc_id]["description"],
            "score": score
        }
        for doc_id, score in top_hits
    ]

    summary = {
        "total_documents": total_docs,
        "filtered_documents": filtered_docs
    }
    return {"metadata": summary, "results": results}

In [112]:
def search(query,
           indexes,
           documents,
           stopwords,
           synonyms,
           metadata,
           weights,
           limit=10) -> dict:
    """
    Execute the full search pipeline for one query.

    Steps:
        1. process the query into tokens (normalization, stop-word removal,
           synonym expansion);
        2. keep the documents containing every token; when there are none,
           fall back to the documents containing at least one token;
        3. rank the candidates;
        4. format the best `limit` results.

    Parameters:
        query: raw query string.
        indexes: mapping field name -> inverted index.
        documents: mapping doc_id -> document record (title, description...).
        stopwords: collection of tokens to drop from the query.
        synonyms: mapping token -> synonyms.
        metadata: ranking metadata passed to rank_documents().
        weights: mapping feature name -> weight.
        limit: maximum number of results returned (default 10, the value
            that was previously fixed).

    Returns:
        The dict produced by format_results().
    """
    query_tokens = process_query(query, stopwords, synonyms)

    # Strict AND match first, relaxed OR match only when AND finds nothing.
    filtered_docs = filter_all_tokens(query_tokens, indexes)
    if not filtered_docs:
        filtered_docs = filter_token(query_tokens, indexes)

    ranked_docs = rank_documents(
        filtered_docs,
        query_tokens,
        indexes,
        metadata,
        weights
    )

    return format_results(
        ranked_docs,
        documents,
        total_docs=len(documents),
        filtered_docs=len(filtered_docs),
        limit=limit
    )

# Metadata

In [113]:
def compute_doc_lengths(documents):
    """
    Map each document id to the number of whitespace-separated words in
    its "text" field.
    """
    lengths = {}
    for doc_id, doc in documents.items():
        lengths[doc_id] = len(doc["text"].split())
    return lengths

def compute_avg_doc_length(doc_lengths):
    """
    Average of the document lengths.

    Parameters:
        doc_lengths: mapping doc_id -> length.

    Returns:
        The mean length, or 0.0 for an empty mapping (instead of raising
        ZeroDivisionError).
    """
    if not doc_lengths:
        return 0.0
    return sum(doc_lengths.values()) / len(doc_lengths)

# Test

In [114]:
# Set of sample queries used to exercise the search pipeline
TEST_QUERIES = [
    "chocolate candy",
    "leather sneakers",
    "light up sneakers",
    "italy",
    "brazil",
    "timelessfootwear",
    "premium chocolate",
    "comfortable shoes"
]

In [115]:
# Load the inverted indexes, the product documents (keyed by URL) and the synonyms.
indexes = load_all_indexes('input')
documents = build_documents_dict(
    load_products(os.path.join('rearranged_products.jsonl'))
)
synonyms = load_synonyms(os.path.join('input', 'origin_synonyms.json'))


# Per-document lengths, needed for BM25 length normalization.
doc_lengths = compute_doc_lengths(documents)

# Everything rank_documents() reads besides the indexes and the weights.
metadata = {
    "doc_lengths": doc_lengths,
    "avg_doc_length": compute_avg_doc_length(doc_lengths),
    "documents": documents
}

# Weight of each ranking feature; features without an entry count as 0.0.
# NOTE(review): rank_documents() names its BM25 features after the index
# fields, so "bm25" is only used if an index field has that name - confirm.
weights = {
    "title": 2.0,
    "description": 1.0,
    "brand": 1.5,
    "origin": 0.5,
    "reviews": 0.1,
    "bm25": 1.0,
    "exact_match": 1.5
}


In [None]:
# Run every sample query, print its three best hits and save the full
# response to results_query_<n>.json in the working directory.
for i, q in enumerate(TEST_QUERIES, start=1):
    print("=" * 50)
    print("QUERY:", q)
    results = search(q, indexes, documents, stop_words, synonyms, metadata, weights)

    # Only a short preview is printed; the saved file holds all results.
    for r in results["results"][:3]:
        print("-", r["title"], "| score:", r["score"])

    file_name = f"results_query_{i}.json"
    with open(file_name, "w", encoding="utf-8") as f:
        f.write(json.dumps(results, ensure_ascii=False, indent=2))

    print(f"Résultats sauvegardés dans {file_name}")

QUERY: chocolate candy
- Box of Chocolate Candy | score: 14.254654065901747
- Box of Chocolate Candy | score: 14.132826115582338
- Box of Chocolate Candy - Cherry large | score: 14.072738682030465
Résultats sauvegardés dans results_query_1.json
QUERY: leather sneakers
- Classic Leather Sneakers - White40 | score: 8.075675352223332
- Classic Leather Sneakers - White41 | score: 8.075675352223332
- Classic Leather Sneakers - White42 | score: 8.075675352223332
Résultats sauvegardés dans results_query_2.json
QUERY: light up sneakers
Résultats sauvegardés dans results_query_3.json
QUERY: italy
- web-scraping.dev product page 3 | score: 3.1803649180125158
- web-scraping.dev product page 2 | score: 3.1803649180125158
- Box of Chocolate Candy - Cherry small | score: 2.9711958428247756
Résultats sauvegardés dans results_query_4.json
QUERY: brazil
- web-scraping.dev product page 2 | score: 4.159520662218369
- Box of Chocolate Candy | score: 3.6592540710893218
- Red Energy Potion - One | score: 3.

In [None]:
# format_results() has five required parameters, so calling it with no
# arguments raises TypeError. search() already returns the formatted
# response, so run one sample query through it instead.
# NOTE(review): the query choice is an assumption - confirm the intended one.
search_result = search(
    query=TEST_QUERIES[0],
    indexes=indexes,
    documents=documents,
    stopwords=stop_words,
    synonyms=synonyms,
    metadata=metadata,
    weights=weights
)
# Original (misspelled) name kept as an alias in case later cells use it.
search_resault = search_result