In [140]:
import json
import logging
import os
import time
from random import randint
from string import Template

import requests

# Set the root logger to INFO so informational records are emitted
# (basicConfig does nothing if the root logger already has handlers).
logging.basicConfig(level=logging.INFO)


In [99]:
# Location of the saved filter-page payload.
FILTER_PAGE_FILE = "./data/base/filter_page.json"

# Create every output directory up front; directories that already exist are left alone.
for output_dir in ("./data/base", "./data/counts", "./data/search-results"):
    os.makedirs(output_dir, exist_ok=True)

In [78]:
def get_and_save(url, file_name="./data/tmp.json", timeout=30):
    """Download a JSON document from ``url`` and write it to ``file_name``.

    Args:
        url: Address requested with HTTP GET.
        file_name: Destination path; an existing file is overwritten.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled connection cannot block the notebook forever.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: The request itself failed (e.g. timed out).
        ValueError: The response body is not valid JSON.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an HTTP error instead of saving an error payload as if it were data.
    response.raise_for_status()
    json_response = response.json()

    # ensure_ascii=False keeps non-ASCII characters verbatim, so the file encoding
    # is pinned to UTF-8 rather than left to the platform's locale default.
    with open(file_name, "w", encoding="utf-8") as output_file:
        output_file.write(json.dumps(json_response, ensure_ascii=False))


In [79]:
def print_dict(dict_obj):
    """Print ``dict_obj`` as JSON indented by two spaces, leaving non-ASCII characters unescaped."""
    rendered = json.dumps(dict_obj, ensure_ascii=False, indent=2)
    print(rendered)

In [80]:
def get_options_of_filter_page_item(filter_page_item_name):
    """Return the ``options`` of the named element in the saved filter page.

    The JSON file at ``FILTER_PAGE_FILE`` is parsed and the elements of its
    first section (``result.sections[0].elements``) are searched by ``name``.

    Args:
        filter_page_item_name: Value of the wanted element's ``name`` key,
            e.g. ``"manufacturer_cb"``.

    Returns:
        The ``options`` value of the first element whose name matches.

    Raises:
        IndexError: No element in the first section has that name.
    """
    # The payload can contain non-ASCII text; decode it as UTF-8 instead of
    # relying on the platform's locale default.
    with open(FILTER_PAGE_FILE, "r", encoding="utf-8") as f:
        filter_page = json.load(f)

    # Stop at the first match rather than filtering the whole list.
    for element in filter_page["result"]["sections"][0]["elements"]:
        if element["name"] == filter_page_item_name:
            return element["options"]

    # IndexError is what a failed lookup has always raised here; keep the type
    # but say what was actually missing.
    raise IndexError(
        f"no filter page element named {filter_page_item_name!r} in {FILTER_PAGE_FILE}"
    )

In [91]:
def get_manufacturer_cb_by_model_cb(manufacturer_cb_options, manufacturer_cb_id):
    """Return the first manufacturer option whose ``id`` equals ``manufacturer_cb_id``.

    Raises:
        IndexError: None of the options has that id.
    """
    matching_options = [
        option
        for option in manufacturer_cb_options
        if option["id"] == manufacturer_cb_id
    ]
    return matching_options[0]


In [None]:
# Download the filter page definition (it includes the manufacturer_cb options) and store it at FILTER_PAGE_FILE.

filter_page = "https://www.sauto.cz/api/v1/items/filter_page?category_id=838&operating_lease=false"
get_and_save(filter_page, FILTER_PAGE_FILE)

In [None]:
# Download the models of every manufacturer listed on the filter page.
# Each manufacturer gets its own file: ./data/base/model_cb-<id>-<seo_name>.json

url_models_mapping_template = Template(
    "https://www.sauto.cz/api/v1/codebooks/values/cached?parent_id=$parent_id"
)

manufacturer_cb_options = get_options_of_filter_page_item("manufacturer_cb")

for manufacturer_cb_option in manufacturer_cb_options:
    parent_id, seo_name = manufacturer_cb_option["id"], manufacturer_cb_option["seo_name"]
    models_url = url_models_mapping_template.substitute(parent_id=parent_id)
    models_file = f"./data/base/model_cb-{parent_id}-{seo_name}.json"
    get_and_save(models_url, models_file)

In [None]:
# Download the advert counts for every manufacturer listed on the filter page.
# Each manufacturer gets its own file: ./data/counts/<seo_name>.json

url = Template(
    "https://www.sauto.cz/api/v1/items/search/counts?manufacturer_model_seo=$manufacturer_model_seo&condition_seo=nove%2Cojete%2Cpredvadeci&category_id=838&filter_name=model_cb&operating_lease=false"
)

manufacturer_cb_options = get_options_of_filter_page_item("manufacturer_cb")

for manufacturer_cb_option in manufacturer_cb_options:
    manufacturer_model_seo = manufacturer_cb_option["seo_name"]
    counts_url = url.substitute(manufacturer_model_seo=manufacturer_model_seo)
    counts_file = f"./data/counts/{manufacturer_model_seo}.json"
    get_and_save(counts_url, counts_file)

In [129]:
def get_ads_count(manufacturer_cb, model) -> int:
    """Return the number of ads for a manufacturer_cb and model_cb combination.

    The count is looked up in the ``result.advert_counts`` mapping of
    ``./data/counts/<manufacturer seo_name>.json`` under the key
    ``str(model["value"])``.

    Args:
        manufacturer_cb: Manufacturer option; its ``seo_name`` selects the counts file.
        model: Model entry; its ``value`` is the key searched in the counts.

    Returns:
        The stored count, or 0 when the model has no entry in the file.
    """
    counts_path = f'./data/counts/{manufacturer_cb["seo_name"]}.json'
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # locale default encoding.
    with open(counts_path, "r", encoding="utf-8") as f:
        counts = json.load(f)

    return counts["result"]["advert_counts"].get(str(model["value"]), 0)

In [138]:
def fetch_all_ads(manufacturer_seo, model_seo, ads_count, batch_size=100):
    """Download the search results of one manufacturer/model pair page by page.

    Pages of ``batch_size`` adverts are requested for the offsets
    ``0, batch_size, 2 * batch_size, ...`` below ``ads_count``. Every page is
    saved to ``./data/search-results/<manufacturer_seo>-<model_seo>-<offset>.json``.

    Args:
        manufacturer_seo: Manufacturer part of the ``manufacturer_model_seo`` query value.
        model_seo: Model part of the ``manufacturer_model_seo`` query value.
        ads_count: Total number of adverts to page through.
        batch_size: Value sent as ``limit`` and used as the offset step.
    """
    search_template = Template(
        "https://www.sauto.cz/api/v1/items/search?limit=$limit&offset=$offset&manufacturer_model_seo=$manufacturer_cb%3A$model_cb&condition_seo=nove%2Cojete%2Cpredvadeci&category_id=838&operating_lease=false&timestamp_to=$timestamp_to"
    )

    for page_offset in range(0, ads_count, batch_size):
        query_values = {
            "limit": batch_size,
            "offset": page_offset,
            "manufacturer_cb": manufacturer_seo,
            "model_cb": model_seo,
            # NOTE(review): the timestamp is taken anew for every page; if the API
            # uses it to pin a result snapshot, it may need to be fixed once per call - confirm.
            "timestamp_to": int(time.time()),
        }
        page_file = f"./data/search-results/{manufacturer_seo}-{model_seo}-{page_offset}.json"
        get_and_save(search_template.substitute(query_values), page_file)


In [None]:
# Download the adverts of every manufacturer/model pair that has at least one advert.
# Reads the filter page, the model_cb-* files and the counts files saved on disk.

manufacturer_cb_options = get_options_of_filter_page_item("manufacturer_cb")

files = [
    filename
    for filename in os.listdir("./data/base")
    if filename.startswith("model_cb")
]

for file in files:
    # Parse the model list and close the file before any download starts,
    # so the handle is not held open during the network requests.
    with open("./data/base/" + file, "r", encoding="utf-8") as f:
        models = json.load(f)["results"]

    # The manufacturer is resolved through the first model's parent; an empty
    # model list has no first entry and nothing to download, so skip it.
    if not models:
        continue

    manufacturer_cb = get_manufacturer_cb_by_model_cb(
        manufacturer_cb_options, models[0]["parent"]["id"]
    )

    for model in models:
        ads_count = get_ads_count(manufacturer_cb, model)
        if ads_count > 0:
            fetch_all_ads(
                manufacturer_seo=manufacturer_cb["seo_name"],
                model_seo=model["seo_name"],
                ads_count=ads_count,
            )

    # Randomised pause of roughly 0.12-0.34 s after each manufacturer
    # (presumably to go easy on the server - confirm the intended granularity).
    time.sleep(0.0244 * randint(5, 14))
