In [3]:
import re

from aiohttp.pytest_plugin import aiohttp_client
from bs4 import BeautifulSoup

from constants import HEADERS

# WRITE YOUR PROMPT HERE
# Free-text shopping request. A later cell sends it to the LLM as the user
# message and asks for it to be parsed into a `Filters` object.

sentence = "I want yellow sofas, show me the cheapest ones first"

In [4]:
import pandas as pd
from pydantic import BaseModel


class Filters(BaseModel):
    """All units are in centimetres"""
    # NOTE: pydantic exports the class docstring as the JSON-schema
    # description, so it is part of what the LLM sees when this model is used
    # as `response_format`. Keep it short and change it deliberately.

    # Dimension bounds. They default to None (like `color`) so that a response
    # which omits a bound still validates, matching the system prompt's
    # "Leave fields as None if not specified".
    width_min: int | None = None
    width_max: int | None = None
    depth_min: int | None = None
    depth_max: int | None = None
    height_min: int | None = None
    height_max: int | None = None

    # True when results should be ordered by ascending price.
    first_cheapest: bool = False
    # Free-text colour name as produced by the LLM.
    color: str | None = None

    def to_query_params(self):
        """Serialise the active filters into a URL query-string fragment.

        Returns:
            str: ``key=value&`` pairs concatenated in a fixed order (widths,
            depths, heights, ordering, colour). Every pair, including the
            last one, is followed by ``&``. Returns an empty string when no
            filter is active.
        """
        from urllib.parse import quote

        bounds = (
            ("width.min", self.width_min),
            ("width.max", self.width_max),
            ("depth.min", self.depth_min),
            ("depth.max", self.depth_max),
            ("height.min", self.height_min),
            ("height.max", self.height_max),
        )
        # Truthiness check kept from the original: both None and 0 are
        # treated as "bound not set" and are left out of the query string.
        parts = [f"{key}={value}&" for key, value in bounds if value]

        if self.first_cheapest:
            parts.append("order=PRICE_ASC&")

        if self.color:
            # The colour is LLM-generated free text: percent-encode it so that
            # spaces, '&' or '=' cannot break or inject query parameters.
            parts.append(f"color={quote(self.color, safe='')}&")

        return "".join(parts)

In [5]:
import os
from openai import OpenAI

from dotenv import load_dotenv


# Load variables from a local .env file into the process environment.
load_dotenv()

# Read the key explicitly and fail fast when it is missing. Passing
# api_key=None would make the OpenAI client fall back to the OPENAI_API_KEY
# environment variable, i.e. it could send an OpenAI key to OpenRouter.
openrouter_key = os.getenv("OPENROUTER_KEY")
if not openrouter_key:
    raise RuntimeError(
        "OPENROUTER_KEY is not set; add it to the environment or to the .env file"
    )

# OpenAI-compatible client pointed at the OpenRouter endpoint.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=openrouter_key,
)

# Ask the model to turn the free-text `sentence` into a structured `Filters`
# object (structured-output parsing via `response_format`).
completion = client.beta.chat.completions.parse(
    model="google/gemini-flash-1.5",
    messages=[
        {
            "role": "system",
            "content": "Extract parameters from the users sentence. Convert units to cm. integers and put into output. Leave fields as None if not specified"
        },
        {"role": "user", "content": sentence},
    ],
    response_format=Filters,
)

filters = completion.choices[0].message.parsed
# `parsed` is None when the model refused or returned nothing parseable.
# Surface that here instead of as an AttributeError in a later cell.
if filters is None:
    raise ValueError(
        f"Model returned no parsed Filters (refusal: {completion.choices[0].message.refusal!r})"
    )


In [6]:
# Show the query-string fragment built from the extracted filters.
print(filters.to_query_params())


order=PRICE_ASC&color=yellow&


In [7]:
from playwright.async_api import async_playwright
import json


# Paging for the article query: passed below as the `first` and `offset`
# GraphQL variables.
limit = 10
offset = 0

# HTTP headers sent with the GraphQL request below; `set_dimension` also
# passes this dict to its aiohttp session when fetching product pages.
# NOTE(review): the values look like they were captured from a real browser
# session. The Cookie embeds what appears to be a session token
# (H24AuthToken) and a device id, hardcoded in the notebook. Presumably the
# token expires; consider loading these from the environment or a
# non-committed file instead — confirm.
headers = {
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Cookie': 'RU=KG,ru-KG;y=0.9;xy=0.8;ky=0.7;en-GB=0.6;en-US=0.5;ru=0.4;en=0.3;H24AB=AIC_POP|B|DY_|C|B|g|A|JH24_tip|(B|reco|2);H24AuthToken=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJyb2xlIjoiVXNlciIsImNvb2tpZSI6W10sImlzVmlzaXRvciI6IjEiLCJpc1Rva2VuUmVmcmVzaCI6ZmFsc2UsImlzTG9nZ2VkSW4iOmZhbHNlLCJpc0JvdCI6ZmFsc2UsImlhdCI6MTcxMjc0MzE3MCwiZXhwIjoxNzEzMzQ3OTcwfQ.lJ3LhWhqMNDx3OC1MTMMTLYvNvhryhylslawW4LCB6MTGYdNywywaWFEqWfkuQpMgMa3qJY9yHryNtvqQlcjVlaWqRTuoPqR_WQqrUaOBTA3ZSUpLtkb3_YdqvMxhyhAD3C1ASCN7o9TvTUhvshnFngKysrLRm7bFEq4Zi7pS2nstMdhk201y3LThcRcypcSwdPed7r1Zys6gdB4sHscO9wZc9uMOTFWn4Q;H24DeviceId=g7n9pMi4M2F4OVSz5vFnhp9hvau2u75d7r8fdhyRfemz7CBnuaV7i250410;h24experience=94648;ConstructorId_client_id=513427b4-8ee9-4575-9be6-73f6b439b08;UserCentricsConsent=marketing|functional|essential|_ga.1.1305676892.1744291054;_ga=GA1.3.15042609.1744291055;FPAuth=1.1.1305676892.1744291054._pan_unauth=0vvkPU5EZqXPR0o2TjZfXfIRhRzhtZAWtURcakuqJzRVE!.(Wnqmru16XU0eml3TYtY1obg;sc0=_;mibihvr-anon=1744278932-904399376.-10157;',
    'Newrelic': 'eyJ2ZXJzaW9uIjoyLCJhY3Rpb24iOiJvdmVydmlldyIsIm5hbWUiOiJUZXN0IFRyYW5zYWN0aW9uIiwiYXV0aCI6bnVsbCwiY2hhbm5lbCI6ImRldmVsX21vYmlsZV93ZWIiLCJjdXN0b21QYXJhbXMiOnsiY3VzdG9tZXJJZCI6IjEifX0=',
    'Priority': 'u=1, i',
    'Referer': 'https://www.home24.de/kueche-moebel/',
    'Sec-Ch-Ua': '"Chromium";v="134", "Not.A/Brand";v="24", "Google Chrome";v="134"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Linux"',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'Traceparent': '00-1370999544ed73777825c2d47e504f30-cbea793045573f-01',
    'Tracestate': '277059fb60-e7-1-2770595-536644253-cbea793045573f-1744386556688',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
    'X-NewRelic-ID': 'VgEAVVdRQhADUVVfAgYDVlE=',
    'X-Source-TraceId': '0dd30e95811ea5cbe967a0a3d150f1',
}

# Lookup from a category key to the id sent as the `id` GraphQL variable.
# NOTE(review): keys presumably mirror home24 category URL slugs — confirm.
category_to_id = {
    "sofa-couch": "156318",
    "kueche-moebel": "156182",
    "innenleuchte": "177561"
}

# GraphQL variables for the category-articles query.
# - `urlParams` carries the filter query string produced by the LLM step.
# - `id` is fixed to the "sofa-couch" category regardless of `sentence`.
# NOTE(review): `userIP`, `userAgent` and the third-party ids are hardcoded
# literals, presumably copied from a captured browser request — confirm
# whether they should be configurable.
variables = {
  "urlParams": filters.to_query_params(),
  "id": category_to_id["sofa-couch"],
  "locale": "de_DE",
  "first": limit,
  "offset": offset,
  "format": "WEBP",
  "userIP": "91.247.57.116",
  "userAgent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
  "thirdPartyClientId": "513f27b4-8ee9-4575-9be6-73fde4f39cd8",
  "thirdPartySessionId": "3",
  "backend": "ThirdParty"
}

# Persisted-query descriptor: the request carries only a hash, no query text.
# NOTE(review): presumably the hash identifies a query stored server-side and
# will stop working if the site changes that query — confirm.
extensions = {
  "persistedQuery": {
    "version": 1,
    "sha256Hash": "80021581f9cced0ece4065bd9c99d50a4e8cdfe749ab57cea87ef0a549bfb7b7"
  }
}

# Serialise both payloads without any whitespace between tokens so they can be
# embedded in the request URL.
compact_variables = json.dumps(variables, separators=(',', ':'))
compact_extensions = json.dumps(extensions, separators=(',', ':'))


import requests
from urllib.parse import quote

BASE_URL = "https://www.home24.de"
# Both JSON payloads are percent-encoded into the query string of a GET request.
url = f"{BASE_URL}/graphql?extensions={quote(compact_extensions)}&variables={quote(compact_variables)}"

# requests applies no timeout by default; bound the wait so a stalled
# connection cannot hang the notebook indefinitely.
response = requests.get(url, headers=headers, timeout=30)

print(response.status_code)



200


In [12]:
from pprint import pprint

# Inspect the first article of the first category to see the response schema.
pprint(response.json()["data"]["categories"][0]["categoryArticles"]["articles"][0])

{'bestOffer': {'isHome24': True, 'shop': {'id': '2000', 'name': 'home24 SE'}},
 'brand': {'name': 'loftscape'},
 'breadcrumbs': [{'name': 'Startseite', 'url': '/'},
                 {'name': 'Möbel', 'url': 'moebel-sortiment/'},
                 {'name': 'Sofas & Couches', 'url': 'sofa-couch/'},
                 {'name': 'Sofa-Zubehör', 'url': 'sofa-zubehoer/'}],
 'campaigns': [{'key': 'xgspx-freedelivery-0425',
                'label': '0 € VERSAND',
                'voucherBanner_v2': None},
               {'key': 'crmeaster', 'label': '-', 'voucherBanner_v2': None},
               {'key': 'hc-campaigns', 'label': '-', 'voucherBanner_v2': None},
               {'key': 'influencer', 'label': '-', 'voucherBanner_v2': None},
               {'key': 'nonspecial-mp-reduced',
                'label': '-',
                'voucherBanner_v2': None},
               {'key': 'nonspecial-nonmp',
                'label': '-',
                'voucherBanner_v2': None},
               {'key': 'sr-ca

In [25]:
class Dimensions(BaseModel):
    """Width, height and depth of a product as integers; all three are required.

    NOTE(review): presumably centimetres (the scraper parses values such as
    "173 cm") — confirm.
    """
    width: int
    height: int
    depth: int


class Product(BaseModel):
    """One catalogue article built from the GraphQL response.

    ``dimensions`` is optional and defaults to ``None``; ``set_dimension``
    fills it in later from the product page.
    """
    title: str
    # NOTE(review): filled from prices.regular.value; the sample output
    # (e.g. 7799.0 for a headrest) suggests minor units such as cents — confirm.
    price: float
    image_url: str
    dimensions: Dimensions | None = None
    url: str

# Turn every article of the first category in the GraphQL response into a
# Product. The list is reset first so re-running the cell does not duplicate rows.
products = []

json_data = response.json()
articles = json_data["data"]["categories"][0]["categoryArticles"]["articles"]

products.extend(
    Product(
        title=article["name"],
        price=article["prices"]["regular"]["value"],
        # Only the first image of each article is kept.
        image_url=article["images"][0]["path"],
        url=BASE_URL + "/" + article["url"],
    )
    for article in articles
)


In [10]:
import pandas as pd

# One row per product; each model's attribute dict supplies the columns.
df = pd.DataFrame(list(map(vars, products)))
df

Unnamed: 0,title,price,image_url,url
0,Kopfstütze Heimari,7799.0,https://cdn1.home24.net/images/media/catalog/p...,https://www.home24.de/produkt/kopfstuetze-heim...
1,Kopfstütze Tanete,8999.0,https://cdn1.home24.net/images/media/catalog/p...,https://www.home24.de/produkt/kopfstuetze-tane...
2,Fußhocker Merida,14200.0,https://cdn1.home24.net/images/media/catalog/p...,https://www.home24.de/produkt/fusshocker-merid...
3,2 Sitzer Sofa 83B-234V01YL,15190.0,https://cdn1.home24.net/images/media/catalog/p...,https://www.home24.de/produkt/2-sitzer-sofa-83...
4,2-Sitzer Sofa Stoff Gelb,15299.0,https://cdn1.home24.net/images/media/catalog/p...,https://www.home24.de/produkt/2-sitzer-sofa-st...
5,Rückenlehne Miu Magic S,15499.0,https://cdn1.home24.net/images/media/catalog/p...,https://www.home24.de/produkt/rueckenlehne-miu...
6,Sessel mit Kissen XT2265,16999.0,https://cdn1.home24.net/images/media/catalog/p...,https://www.home24.de/produkt/sessel-mit-kisse...
7,2-Sitzer-Sofa XT7290,18799.0,https://cdn1.home24.net/images/media/catalog/p...,https://www.home24.de/produkt/2-sitzer-sofa-xt...
8,Sofa 3002820-1,19099.0,https://cdn1.home24.net/images/media/catalog/p...,https://www.home24.de/produkt/sofa-3002820-1-g...
9,Rückenlehne Miu Magic L,19499.0,https://cdn1.home24.net/images/media/catalog/p...,https://www.home24.de/produkt/rueckenlehne-miu...


In [11]:
# GraphQL persisted-query `extensions` payload (compact JSON, followed by its Python dict repr)


{"persistedQuery":{"version":1,"sha256Hash":"80021581f9cced0ece4065bd9c99d50a4e8cdfe749ab57cea87ef0a549bfb7b7"}}


{'persistedQuery': {'version': 1,
  'sha256Hash': '80021581f9cced0ece4065bd9c99d50a4e8cdfe749ab57cea87ef0a549bfb7b7'}}

In [27]:
import asyncio
import aiohttp
from constants import HEADERS
from bs4 import BeautifulSoup
import re


async def set_dimension(product: Product):
    """Fetch the product page and attach the scraped dimensions to ``product``.

    Mutates ``product.dimensions`` in place. When the page does not yield all
    of width, height and depth, a warning is printed and ``product.dimensions``
    is left unchanged, so one incomplete page no longer aborts a whole
    ``asyncio.gather`` batch with a validation error.

    Args:
        product: Product whose ``url`` is downloaded and scraped.

    Returns:
        None.
    """
    # NOTE(review): this uses the notebook-level `headers` dict (built for the
    # GraphQL request), not the imported `HEADERS` constant — confirm intended.
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(product.url) as response:
            html = await response.read()

    data = extract_dimensions(html)

    # `Dimensions` requires all three fields; the scraper may return fewer.
    missing = {"width", "height", "depth"} - data.keys()
    if missing:
        print(f"Warning: Missing dimensions {sorted(missing)} for '{product.url}'. Skipping.")
        return

    product.dimensions = Dimensions(**data)


def extract_dimensions(html_content):
    """Scrape depth, height and width values from a product page.

    Args:
        html_content: Page markup, handed to BeautifulSoup with the
            'html.parser' backend.

    Returns:
        dict: Maps "depth" / "height" / "width" to an int for every recognised
        German label whose value contains digits. The dict may be partial or
        empty. Problems are reported with a printed warning and the entry is
        skipped.
    """
    # German labels on the page -> keys of the returned dict.
    german_to_key = {
        "Tiefe": "depth",
        "Höhe": "height",
        "Breite": "width"
    }

    page = BeautifulSoup(html_content, 'html.parser')

    # Try the primary container class first; fall back to the alternative
    # class only when the first lookup finds nothing.
    rows = page.find_all('div', class_='e1kn6ntn3') or page.find_all('div', class_='emotion-cache-h7y6ra')

    found = {}
    for row in rows:
        label_node = row.find('div', class_='e1kn6ntn4')
        value_node = row.find('div', class_='e1kn6ntn5')
        if not (label_node and value_node):
            continue

        label = label_node.get_text(strip=True)
        raw_value = value_node.get_text(strip=True)  # e.g., "173 cm"
        if label not in german_to_key:
            continue

        # Keep only the first run of digits in the value text.
        digits = re.search(r'\d+', raw_value)
        if digits is None:
            print(f"Warning: Could not find numeric value in '{raw_value}' for label '{label}'. Skipping.")
            continue

        try:
            found[german_to_key[label]] = int(digits.group(0))
        except ValueError:
            print(f"Warning: Could not convert extracted digits '{digits.group(0)}' from '{raw_value}' to integer for label '{label}'. Skipping.")

    return found


# Scrape all product pages concurrently; each call mutates its Product in
# place. Top-level `await` works here because the cell runs under IPython.
await asyncio.gather(*[set_dimension(product) for product in products])



import pandas as pd

# Rebuild the table now that each product carries its scraped dimensions.
df = pd.DataFrame(list(map(vars, products)))
df

Unnamed: 0,title,price,image_url,dimensions,url
0,Kopfstütze Heimari,7799.0,https://cdn1.home24.net/images/media/catalog/p...,width=45 height=20 depth=15,https://www.home24.de/produkt/kopfstuetze-heim...
1,Kopfstütze Tanete,8999.0,https://cdn1.home24.net/images/media/catalog/p...,width=50 height=25 depth=12,https://www.home24.de/produkt/kopfstuetze-tane...
2,Fußhocker Merida,14200.0,https://cdn1.home24.net/images/media/catalog/p...,width=40 height=42 depth=40,https://www.home24.de/produkt/fusshocker-merid...
3,2 Sitzer Sofa 83B-234V01YL,15190.0,https://cdn1.home24.net/images/media/catalog/p...,width=62 height=85 depth=110,https://www.home24.de/produkt/2-sitzer-sofa-83...
4,2-Sitzer Sofa Stoff Gelb,15299.0,https://cdn1.home24.net/images/media/catalog/p...,width=114 height=74 depth=114,https://www.home24.de/produkt/2-sitzer-sofa-st...
5,Rückenlehne Miu Magic S,15499.0,https://cdn1.home24.net/images/media/catalog/p...,width=56 height=33 depth=30,https://www.home24.de/produkt/rueckenlehne-miu...
6,Sessel mit Kissen XT2265,16999.0,https://cdn1.home24.net/images/media/catalog/p...,width=77 height=70 depth=71,https://www.home24.de/produkt/sessel-mit-kisse...
7,2-Sitzer-Sofa XT7290,18799.0,https://cdn1.home24.net/images/media/catalog/p...,width=77 height=80 depth=158,https://www.home24.de/produkt/2-sitzer-sofa-xt...
8,Sofa 3002820-1,19099.0,https://cdn1.home24.net/images/media/catalog/p...,width=135 height=76 depth=65,https://www.home24.de/produkt/sofa-3002820-1-g...
9,Rückenlehne Miu Magic L,19499.0,https://cdn1.home24.net/images/media/catalog/p...,width=93 height=33 depth=30,https://www.home24.de/produkt/rueckenlehne-miu...
