In [None]:
import requests
import pandas as pd 
from urllib.parse import urlparse, urljoin, parse_qs
import json
import os
import spacy

In [None]:
# Download the small English spaCy model only when it is not already
# installed, so that "Restart & Run All" does not hit the network every time.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

In [58]:
# Load the product records (JSON Lines: one JSON document per line).
data = pd.read_json('input/products.jsonl', lines=True)
# Keep the page URLs as a plain Python list for the URL-parsing helper.
urls = data['url'].to_list()
# Preview the first five rows.
data.head()

Unnamed: 0,url,title,description,product_features,links,product_reviews
0,https://web-scraping.dev/products,web-scraping.dev product page 1,,{},"[https://web-scraping.dev/, https://web-scrapi...",[]
1,https://web-scraping.dev/product/1,Box of Chocolate Candy,Indulge your sweet tooth with our Box of Choco...,"{'material': 'Premium quality chocolate', 'fla...","[https://web-scraping.dev/, https://web-scrapi...","[{'date': '2022-07-22', 'id': 'chocolate-candy..."
2,https://web-scraping.dev/product/16,Red Energy Potion,"Elevate your game with our 'Red Potion', an ex...","{'flavor': 'Intense berry fusion', 'caffeine_c...","[https://web-scraping.dev/, https://web-scrapi...","[{'date': '2023-02-10', 'id': 'red-potion-1', ..."
3,https://web-scraping.dev/product/10,Kids' Light-Up Sneakers,Make your child's every step magical with thes...,{'material': 'Breathable fabric upper with syn...,"[https://web-scraping.dev/, https://web-scrapi...","[{'date': '2022-07-01', 'id': 'kids-light-up-s..."
4,https://web-scraping.dev/product/10?variant=bl...,Kids' Light-Up Sneakers,Make your child's every step magical with thes...,{'material': 'Breathable fabric upper with syn...,"[https://web-scraping.dev/, https://web-scrapi...","[{'date': '2022-07-01', 'id': 'kids-light-up-s..."


In [10]:
# Extract information from each URL: the product ID (third '/'-separated
# piece of the path) and the variant (if present in the query string).
# Example URL: https://web-scraping.dev/product/1?variant=orange-small
def extract_info_url(urls):
    """Parse product URLs into their product id and variant.

    Args:
        urls: Iterable of URL strings.

    Returns:
        A list with one dict per input URL, in input order, with the keys
        'url' (the original URL), 'product_id' (third element of the path
        split on '/', or None when the path is shorter) and 'variant'
        (first value of the 'variant' query parameter, or None when absent).
    """
    def _describe(url):
        parts = urlparse(url)
        segments = parts.path.split('/')
        variants = parse_qs(parts.query).get('variant')
        return {
            'url': url,
            # '/product/1' splits into ['', 'product', '1'], hence index 2.
            'product_id': segments[2] if len(segments) > 2 else None,
            'variant': variants[0] if variants else None,
        }

    return [_describe(url) for url in urls]

In [63]:
# Parse every loaded URL into its product id and variant and display the result.
extract_info_url(urls)

[{'url': 'https://web-scraping.dev/products',
  'product_id': None,
  'variant': None},
 {'url': 'https://web-scraping.dev/product/1',
  'product_id': '1',
  'variant': None},
 {'url': 'https://web-scraping.dev/product/16',
  'product_id': '16',
  'variant': None},
 {'url': 'https://web-scraping.dev/product/10',
  'product_id': '10',
  'variant': None},
 {'url': 'https://web-scraping.dev/product/10?variant=blue-5',
  'product_id': '10',
  'variant': 'blue-5'},
 {'url': 'https://web-scraping.dev/product/10?variant=blue-6',
  'product_id': '10',
  'variant': 'blue-6'},
 {'url': 'https://web-scraping.dev/product/10?variant=red-5',
  'product_id': '10',
  'variant': 'red-5'},
 {'url': 'https://web-scraping.dev/product/10?variant=red-6',
  'product_id': '10',
  'variant': 'red-6'},
 {'url': 'https://web-scraping.dev/product/11',
  'product_id': '11',
  'variant': None},
 {'url': 'https://web-scraping.dev/product/11?variant=black40',
  'product_id': '11',
  'variant': 'black40'},
 {'url': 'h

In [50]:
# A bit of NLP
# Loaded spaCy pipelines, keyed by model name. Loading a pipeline is expensive,
# so it is done once and reused instead of on every tokenisation call.
_SPACY_MODELS = {}


def _get_spacy_model(name="en_core_web_sm"):
    """Return the spaCy pipeline `name`, loading it only on first use."""
    if name not in _SPACY_MODELS:
        _SPACY_MODELS[name] = spacy.load(name)
    return _SPACY_MODELS[name]


def remove_stopwords_punctuation(text):
    """Tokenise `text` and drop stop words and punctuation.

    Args:
        text: The string to tokenise. Non-string values (e.g. None or NaN
            from a missing DataFrame cell) and the empty string are treated
            as having no tokens.

    Returns:
        A list of lower-cased token texts, in document order, excluding the
        tokens spaCy flags as stop words or punctuation.
    """
    if not isinstance(text, str) or not text:
        return []
    doc = _get_spacy_model()(text)
    return [token.text.lower() for token in doc if not token.is_stop and not token.is_punct]

In [43]:
# Sanity check: in the recorded output this domain-like string comes back as a single token.
remove_stopwords_punctuation("web-scraping.dev")

['web-scraping.dev']

In [98]:
# Find the positions of the words in a text
def word_positions(tokens):
    """Map each distinct token to the list of indices where it occurs.

    Args:
        tokens: Iterable of hashable tokens.

    Returns:
        A dict whose keys are the distinct tokens (in order of first
        appearance) and whose values are the ascending 0-based indices at
        which each token appears in `tokens`.
    """
    offsets_by_token = {}
    for offset, word in enumerate(tokens):
        offsets_by_token.setdefault(word, []).append(offset)
    return offsets_by_token

In [99]:
# Build the inverted indexes for the 'title' and 'description' fields
def create_inverted_index(data, field):
    """Build a positional inverted index for one text column.

    Args:
        data: Object indexable by column name (a pandas DataFrame in this
            notebook) providing a 'url' column and the column named `field`.
        field: Name of the text column to index.

    Returns:
        A dict mapping token -> {url: [positions]}. Positions are indices
        into the filtered token list (after stop-word and punctuation
        removal), not character offsets in the raw text.
    """
    index = {}

    for url, text in zip(data['url'], data[field]):
        # Missing values (None / NaN) and empty strings have nothing to index;
        # skipping them also avoids handing a non-string to the tokeniser.
        if not isinstance(text, str) or not text:
            continue
        tokens = remove_stopwords_punctuation(text)
        # word_positions already groups the indices per distinct token, so
        # each (token, url) entry is written exactly once.
        for token, token_positions in word_positions(tokens).items():
            index.setdefault(token, {})[url] = token_positions

    return index

In [100]:
# Build one positional inverted index per text column.
index_title = create_inverted_index(data, 'title')
index_description = create_inverted_index(data, 'description')

In [101]:
# Display the description index (token -> {url: [positions]}).
index_description

{'indulge': {'https://web-scraping.dev/product/1': [0],
  'https://web-scraping.dev/product/13': [0],
  'https://web-scraping.dev/product/13?variant=cherry-large': [0],
  'https://web-scraping.dev/product/13?variant=cherry-medium': [0],
  'https://web-scraping.dev/product/13?variant=cherry-small': [0],
  'https://web-scraping.dev/product/13?variant=orange-large': [0],
  'https://web-scraping.dev/product/13?variant=orange-medium': [0],
  'https://web-scraping.dev/product/13?variant=orange-small': [0],
  'https://web-scraping.dev/product/1?variant=cherry-large': [0],
  'https://web-scraping.dev/product/1?variant=cherry-medium': [0],
  'https://web-scraping.dev/product/1?variant=cherry-small': [0],
  'https://web-scraping.dev/product/1?variant=orange-large': [0],
  'https://web-scraping.dev/product/1?variant=orange-medium': [0],
  'https://web-scraping.dev/product/1?variant=orange-small': [0],
  'https://web-scraping.dev/product/25': [0],
  'https://web-scraping.dev/product/25?variant=che

In [59]:
# Build the index for the reviews
def create_reviews_index(data):
    """Summarise the reviews of every URL.

    Args:
        data: Object indexable by column name (a pandas DataFrame in this
            notebook) providing 'url' and 'product_reviews' columns, where
            each 'product_reviews' entry is a sequence of dicts carrying a
            'rating' key.

    Returns:
        A dict mapping url -> {'total_reviews', 'mean_mark', 'last_rating'}.
        With no reviews, 'mean_mark' is 0 and 'last_rating' is None;
        otherwise 'last_rating' is the rating of the final list element.
    """
    def _summarise(reviews):
        count = len(reviews)
        if count == 0:
            return {'total_reviews': count, 'mean_mark': 0, 'last_rating': None}
        ratings = [review['rating'] for review in reviews]
        return {
            'total_reviews': count,
            'mean_mark': sum(ratings) / count,
            'last_rating': ratings[-1],
        }

    return {
        url: _summarise(reviews)
        for url, reviews in zip(data['url'], data['product_reviews'])
    }

In [62]:
# Build the per-URL review summary and display it.
index_reviews = create_reviews_index(data)
index_reviews

{'https://web-scraping.dev/products': {'total_reviews': 0,
  'mean_mark': 0,
  'last_rating': None},
 'https://web-scraping.dev/product/1': {'total_reviews': 5,
  'mean_mark': 4.6,
  'last_rating': 4},
 'https://web-scraping.dev/product/16': {'total_reviews': 4,
  'mean_mark': 4.75,
  'last_rating': 5},
 'https://web-scraping.dev/product/10': {'total_reviews': 5,
  'mean_mark': 4.2,
  'last_rating': 4},
 'https://web-scraping.dev/product/10?variant=blue-5': {'total_reviews': 5,
  'mean_mark': 4.2,
  'last_rating': 4},
 'https://web-scraping.dev/product/10?variant=blue-6': {'total_reviews': 5,
  'mean_mark': 4.2,
  'last_rating': 4},
 'https://web-scraping.dev/product/10?variant=red-5': {'total_reviews': 5,
  'mean_mark': 4.2,
  'last_rating': 4},
 'https://web-scraping.dev/product/10?variant=red-6': {'total_reviews': 5,
  'mean_mark': 4.2,
  'last_rating': 4},
 'https://web-scraping.dev/product/11': {'total_reviews': 5,
  'mean_mark': 4.4,
  'last_rating': 5},
 'https://web-scraping.de

In [104]:
# Build the inverted indexes for features such as brand and origin.
def create_features_index(data, feature):
    """Build a token -> set-of-URLs index for one product feature.

    Args:
        data: Object indexable by column name (a pandas DataFrame in this
            notebook) providing 'url' and 'product_features' columns, where
            each 'product_features' entry supports `.get(feature)`.
        feature: Key of the feature to index (e.g. 'brand').

    Returns:
        A dict mapping each token of the feature value (after stop-word and
        punctuation removal) to the set of URLs whose feature contains it.
        Rows where the feature is missing or falsy contribute nothing.
    """
    urls_by_token = {}

    for url, features in zip(data['url'], data['product_features']):
        raw_value = features.get(feature)
        if not raw_value:
            continue
        for token in remove_stopwords_punctuation(raw_value):
            urls_by_token.setdefault(token, set()).add(url)

    return urls_by_token

In [103]:
# Only the brand index is built here; the 'made in' and 'material' variants
# below are left commented out.
# index_origin = create_features_index(data, 'made in')
index_brand = create_features_index(data, 'brand')
#index_material = create_features_index(data, 'material')


KeyboardInterrupt: 