In [133]:
import time

import requests

# Items requested per API page; fetch_data uses it as the `limit` and as the offset step.
PAGE_SIZE = 24


def fetch_data(page_count: int):
    """Download `page_count` consecutive pages of the genre 5272 item listing.

    Pages are requested in "popular" order, `PAGE_SIZE` items at a time, with a
    2-second pause between consecutive requests.

    Args:
        page_count: Number of pages to request, starting at offset 0.

    Returns:
        A flat list holding the entries of `payload.data` from every page, in
        request order.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    result = []
    print("Loading data...")
    for i in range(page_count):
        print(f"Loading page: {i}")
        url = f"https://api.litres.ru/foundation/api/genres/5272/arts/facets?is_for_pda=false&limit={PAGE_SIZE}&o=popular&offset={i * PAGE_SIZE}&show_unavailable=false"
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(url, timeout=30)
        # Report HTTP failures as such instead of a KeyError on the payload below.
        response.raise_for_status()
        result.extend(response.json()['payload']['data'])
        print(f"Page loaded: {i}")
        # Pause only between requests; there is nothing to wait for after the last page.
        if i < page_count - 1:
            time.sleep(2)

    return result


# Requests 40 listing pages over the network; fetch_data pauses between pages, so this cell is slow.
litres_fetched_data = fetch_data(page_count=40)

Loading data...
Loading page: 0
Page loaded: 0
Loading page: 1
Page loaded: 1
Loading page: 2
Page loaded: 2
Loading page: 3
Page loaded: 3
Loading page: 4
Page loaded: 4
Loading page: 5
Page loaded: 5
Loading page: 6
Page loaded: 6
Loading page: 7
Page loaded: 7
Loading page: 8
Page loaded: 8
Loading page: 9
Page loaded: 9
Loading page: 10
Page loaded: 10
Loading page: 11
Page loaded: 11
Loading page: 12
Page loaded: 12
Loading page: 13
Page loaded: 13
Loading page: 14
Page loaded: 14
Loading page: 15
Page loaded: 15
Loading page: 16
Page loaded: 16
Loading page: 17
Page loaded: 17
Loading page: 18
Page loaded: 18
Loading page: 19
Page loaded: 19
Loading page: 20
Page loaded: 20
Loading page: 21
Page loaded: 21
Loading page: 22
Page loaded: 22
Loading page: 23
Page loaded: 23
Loading page: 24
Page loaded: 24
Loading page: 25
Page loaded: 25
Loading page: 26
Page loaded: 26
Loading page: 27
Page loaded: 27
Loading page: 28
Page loaded: 28
Loading page: 29
Page loaded: 29
Loading page: 

In [134]:
# Site origin that fetch_page_html prefixes to its `url` argument.
LITRES_URL = 'https://www.litres.ru'


def fetch_page_html(url):
    """Fetch the HTML of a page on the litres.ru site.

    Args:
        url: Path that is appended to `LITRES_URL` to form the request URL.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: If the site answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    print(f"Loading html {url}")
    # Pause before every request (simple rate limiting).
    time.sleep(2)
    # Without a timeout a stalled connection would block the cell forever.
    response = requests.get(LITRES_URL + url, timeout=30)
    # Fail loudly so that an error page is never stored as if it were real content.
    response.raise_for_status()
    return response.text


# Cache of raw page HTML keyed by item id. Re-running this cell resets it to empty.
id_to_page_html = {}

In [193]:
# Index the fetched items by their id; when an id repeats, the last occurrence wins.
id_to_api_data = {item['id']: item for item in litres_fetched_data}

len(id_to_api_data)

959

In [194]:
# Download book pages in batches: every run of this cell fetches at most 50
# pages that are not cached yet, so re-run it until all ids are cached.
pages_loaded = 0
for element in litres_fetched_data:
    if pages_loaded >= 50:
        break
    # Already cached (or a repeated id in the listing): skip without logging,
    # so re-runs do not print two lines for every cached item.
    if element['id'] in id_to_page_html:
        continue
    # Progress is measured against unique ids, the same total used to judge completeness.
    print(f"{len(id_to_page_html)}/{len(id_to_api_data)} ")
    print(f"Pages loaded: {pages_loaded}")
    id_to_page_html[element['id']] = fetch_page_html(element['url'])
    pages_loaded += 1


In [195]:
# Progress check: number of cached pages vs. number of unique item ids.
f"{len(id_to_page_html)}/{len(id_to_api_data)}"

'959/959'

In [190]:
from bs4 import BeautifulSoup


def parse_page(html_str: str):
    """Extract page count, review count and writing year from a book page.

    Args:
        html_str: Raw HTML of a book page.

    Returns:
        A dict with the keys ``page_count``, ``reviews_count`` and ``year``.
        Each value is an int, or None when the corresponding element is not
        found on the page (for ``page_count`` also when the volume text does
        not start with "Объем").

    Raises:
        ValueError: If the text of a located element is not a valid integer.
        IndexError: If the volume text has no second space-separated token.
    """
    # Name the parser explicitly: otherwise bs4 picks whichever one is
    # installed, emits a warning, and may build a different tree elsewhere.
    html = BeautifulSoup(html_str, 'html.parser')

    page_count = None
    volume_block = html.select_one('div[data-testid="book-volume__wrapper"] > p:first-child')
    # select_one returns None when nothing matches, so guard before reading .text.
    if volume_block is not None:
        volume_text = volume_block.text
        if volume_text.startswith("Объем"):
            # The number is taken from the second space-separated token.
            page_count = int(volume_text.split(' ')[1])

    reviews_block = html.select_one('div[data-testid="book-factoids__reviews"] > div > span')
    reviews_count = int(reviews_block.text) if reviews_block is not None else None

    # `:-soup-contains` is the current name of the deprecated `:contains` pseudo-class.
    year_block = html.select_one('div:has(> div:has(> span:-soup-contains("Дата написания"))) > span')
    year = int(year_block.text) if year_block is not None else None

    return {
        'page_count': page_count,
        'reviews_count': reviews_count,
        'year': year
    }


# Parse every cached page. The comprehension keeps its loop variables local,
# so the builtin `id` is not rebound in the notebook namespace.
id_to_page_data = {
    item_id: parse_page(page_html)
    for item_id, page_html in id_to_page_html.items()
}

id_to_page_data

{70920895: {'page_count': 319, 'reviews_count': 1, 'year': 2022},
 39100996: {'page_count': 640, 'reviews_count': 11, 'year': 2017},
 67193183: {'page_count': 304, 'reviews_count': 9, 'year': 2020},
 68998912: {'page_count': 875, 'reviews_count': 6, 'year': 2021},
 69188950: {'page_count': 102, 'reviews_count': 37, 'year': 2023},
 70323379: {'page_count': 192, 'reviews_count': 11, 'year': 2024},
 6444478: {'page_count': 464, 'reviews_count': 19, 'year': None},
 68294027: {'page_count': 259, 'reviews_count': 2, 'year': 2023},
 66738078: {'page_count': 816, 'reviews_count': 3, 'year': 2015},
 70870268: {'page_count': 1056, 'reviews_count': 0, 'year': 2022},
 40932461: {'page_count': 983, 'reviews_count': 15, 'year': 2023},
 67893687: {'page_count': 416, 'reviews_count': 5, 'year': 2021},
 70927897: {'page_count': 485, 'reviews_count': 0, 'year': 2024},
 70388320: {'page_count': 280, 'reviews_count': 8, 'year': 2024},
 50445630: {'page_count': 544, 'reviews_count': 6, 'year': 2019},
 6943

In [122]:
# Prefix of the URLs built in fetch_reviews. Note that it already ends with a slash.
REVIEWS_URL = 'https://api.litres.ru/foundation/'


def fetch_reviews(id: str):
    """Download the text of every review of one item, following pagination.

    Args:
        id: Item identifier inserted into the ``/api/arts/{id}/reviews`` path.

    Returns:
        A list of review texts with HTML markup stripped, in the order the API
        returned them.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    print(f"Fetching reviews for {id}...")
    limit = 10
    result = []
    # REVIEWS_URL ends with "/", so strip it before joining to avoid "//" in the path.
    base_url = REVIEWS_URL.rstrip('/')
    url = f"{base_url}/api/arts/{id}/reviews?o=popular&limit={limit}"
    while True:
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(url, timeout=30)
        # Report HTTP failures as such instead of a KeyError on the payload below.
        response.raise_for_status()
        reviews_response = response.json()['payload']
        # Review bodies are run through BeautifulSoup to keep only their text.
        result.extend(
            BeautifulSoup(review['text'], 'html.parser').text
            for review in reviews_response['data']
        )
        print(f"Fetched {len(result)} reviews")

        next_page = reviews_response['pagination']['next_page']
        if next_page is None:
            return result
        # NOTE(review): next_page is assumed to be a path relative to REVIEWS_URL,
        # as the original concatenation implied - confirm against the API.
        url = f"{base_url}/{next_page.lstrip('/')}"
        time.sleep(1)


# Cache of review texts keyed by item id. Re-running this cell resets it to empty.
id_to_reviews = {}

In [196]:
# Fetch reviews in batches: every run of this cell handles at most 100 items
# that are not cached yet, so re-run it until all ids are cached.
pages_loaded = 0
for element in litres_fetched_data:
    if pages_loaded >= 100:
        break
    # Already cached (or a repeated id in the listing): skip without logging.
    if element['id'] in id_to_reviews:
        continue
    # Progress is measured against unique ids, the same total used to judge completeness.
    print(f"{len(id_to_reviews)}/{len(id_to_api_data)} ")
    print(f"Reviews loaded: {pages_loaded}")
    # fetch_reviews builds /api/arts/{id}/reviews, so it needs the item id, not its page URL.
    id_to_reviews[element['id']] = fetch_reviews(element['id'])
    time.sleep(0.5)
    pages_loaded += 1

In [203]:
def format_data(api_data, page_data, reviews):
    """Merge API fields, scraped page fields and reviews into one flat record.

    Args:
        api_data: Item dict from the listing API; must provide ``title``,
            ``persons``, ``url``, ``rating``, ``prices`` and ``min_age``.
        page_data: Dict with ``reviews_count``, ``page_count`` and ``year``.
        reviews: Value stored unchanged under ``text_reviews``.

    Returns:
        A dict with the keys ``name``, ``author``, ``link``, ``rating``,
        ``rating_count``, ``review_count``, ``pages_count``, ``price``,
        ``text_reviews``, ``age`` and ``year``. ``author`` is None when the
        item lists no persons; ``age`` is None when ``min_age`` is None.

    Raises:
        KeyError: If a required key is missing from `api_data` or `page_data`.
    """
    persons = api_data['persons']
    # The first listed person is used as the author; an empty list must not crash.
    main_author = persons[0] if persons else None
    min_age = api_data['min_age']
    return {
        'name': api_data['title'],
        'author': main_author['full_name'] if main_author is not None else None,
        'link': api_data['url'],
        'rating': api_data['rating']['rated_avg'],
        'rating_count': api_data['rating']['rated_total_count'],
        'review_count': page_data['reviews_count'],
        'pages_count': page_data['page_count'],
        'price': api_data['prices']['final_price'],
        'text_reviews': reviews,
        # Avoid producing the literal string "None+" when no age limit is given.
        'age': str(min_age) + '+' if min_age is not None else None,
        'year': page_data['year'],
    }

def join_with_dicts(element, id_to_page_data, id_to_reviews):
    """Build the output record for one API item when all of its parts exist.

    Args:
        element: API item dict; its ``id`` is the lookup key.
        id_to_page_data: Mapping from item id to parsed page data.
        id_to_reviews: Mapping from item id to fetched reviews.

    Returns:
        The result of `format_data` for this item, or None when the id is
        missing from either mapping.
    """
    book_id = element['id']
    if book_id in id_to_page_data and book_id in id_to_reviews:
        return format_data(element, id_to_page_data[book_id], id_to_reviews[book_id])
    return None



# join_with_dicts returns None for items that lack page data or reviews;
# drop those so the list contains only complete records.
formated_data = [
    record
    for record in (
        join_with_dicts(element, id_to_page_data, id_to_reviews)
        for element in id_to_api_data.values()
    )
    if record is not None
]
len(formated_data)

959

In [206]:
import pandas as pd

# Turn the per-item records into a table and write it as CSV to the working directory.
df = pd.DataFrame.from_records(data=formated_data)
df.to_csv(path_or_buf="books_data.csv")