In [1]:
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By

In [2]:
# Authors to scrape: display name -> numeric id that is substituted into the
# bookvoed.ru author-page URL ("author/books?id=<id>") by the scraping cell.
authors = {
    "Дарья Донцова":  29369,
    "Джеймс Роллинс": 29442,
    "Макс Фрай":      102994,
    "Эрин Хантер":    26149,
    "Дмитрий Емец":   35952
}

In [3]:
def wait_a_sec(seconds=2):
    """Block the caller for a short, fixed pause.

    Args:
        seconds: Length of the pause in seconds. Defaults to 2, the delay this
            helper has always used, so existing no-argument calls are unchanged.

    Returns:
        None.
    """
    sleep(seconds)

In [4]:
# Collect the "data-book" attribute of every book card listed on each author's page.
books_id = []

with webdriver.Firefox() as driver:
    for author_name, author_id in authors.items():
        url = f"https://www.bookvoed.ru/author/books?id={author_id}"
        print(url)
        driver.get(url)
        wait_a_sec()
        old_height = 0
        while True:
            wait_a_sec()
            new_height = driver.execute_script("return document.body.scrollHeight")
            # Scroll to the bottom repeatedly until the document height stops growing.
            while old_height < new_height:
                driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
                wait_a_sec()
                old_height = new_height
                new_height = driver.execute_script("return document.body.scrollHeight")
            # NOTE(review): "wy" is presumably the "show more" button - confirm against the
            # live markup. find_elements(By..., ...) replaces find_elements_by_class_name,
            # which was removed from Selenium in 4.3.
            buttons = driver.find_elements(By.CLASS_NAME, "wy")
            if not buttons:
                # Nothing left to click: the listing is as expanded as it will get.
                break
            buttons[0].click()
            print("--click")

        # NOTE(review): "gf" is presumably the per-book card element - confirm.
        books = driver.find_elements(By.CLASS_NAME, "gf")
        books_id.extend(book.get_attribute("data-book") for book in books)

print(f"{len(books_id)} books")

https://www.bookvoed.ru/author/books?id=29369
--click
https://www.bookvoed.ru/author/books?id=29442
https://www.bookvoed.ru/author/books?id=102994
https://www.bookvoed.ru/author/books?id=26149
https://www.bookvoed.ru/author/books?id=35952
1773 books


In [5]:
import requests
from bs4 import BeautifulSoup
import re

In [6]:
# Lookup from the first CSS class of the book page's "nM" <div> to the age
# rating that get_book_info stores under the "Возраст" key.
age_codes = {
    'ov': '0+',
    'pv': '6+',
    'qv': '12+',
    'rv': '16+',
    'sv': '18+'
}

In [7]:
def get_book_info(book_id, timeout=30):
    """Download one bookvoed.ru book page and extract its fields into a dict.

    Elements are located by the site's short CSS class names ("tf", "nM",
    "lw", ...), so the parser is tied to the markup it was written against.

    Args:
        book_id: Identifier substituted into "https://www.bookvoed.ru/book?id=<id>".
        timeout: Seconds to wait for the server before giving up (default 30).
            Without a timeout a stalled connection would block the caller forever.

    Returns:
        dict with the keys "ID", "Название", "Обложка", "Возраст", "Рейтинг",
        "Понравилось", "В закладки" and "Не понравилось"; "Описание" and "Цена"
        only when the corresponding element exists; plus one entry per
        "key:value" row of the "uw" table. The counters and the "Год",
        "Страниц", "Код", "Тираж" entries are converted to int (empty -> 0).

    Raises:
        requests.exceptions.RequestException: The request failed or timed out.
        AttributeError: A mandatory element is missing from the page.
        KeyError: The age element carries a class that is not in ``age_codes``.
        ValueError: The rating or one of the integer fields cannot be parsed.
    """
    url = f"https://www.bookvoed.ru/book?id={book_id}"
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.text, 'html.parser')

    res = {"ID": book_id}
    res["Название"] = soup.find("h1", itemprop="name").contents[0].strip()
    res["Обложка"] = soup.find("img", class_="tf").attrs["src"]
    # The first class of the "nM" div is the age code; see age_codes.
    res["Возраст"] = age_codes[soup.find("div", class_="nM").attrs["class"][0]]

    description = soup.find("div", class_="lw")
    if description:
        res["Описание"] = re.sub(r'\n', '', description.contents[0]).strip()

    # The rating is encoded as the CSS width of the "af" div, e.g. "width: 80%".
    # A regex is used because str.strip('width: ') strips a *set of characters*,
    # not a prefix, and fails as soon as the style has a trailing ';' or other rules.
    style = soup.find("div", class_="af").attrs['style']
    width = re.search(r'width\s*:\s*(\d+(?:\.\d+)?)', style)
    if width is None:
        raise ValueError(f"cannot parse rating from style {style!r}")
    res['Рейтинг'] = float(width.group(1))

    # Reaction counters share one extraction rule; tuple order fixes the key order.
    for field, css_class in (("Понравилось", "Me"),
                             ("В закладки", "Le"),
                             ("Не понравилось", "Oe")):
        res[field] = re.sub(r'\n', '', soup.find("a", class_=css_class).text).strip()

    price = soup.find("div", class_="Hu Wu")
    if price:
        # Keep digits only before converting.
        res["Цена"] = float(re.sub(r'[^0-9]', '', price.text))

    # Each "uw" table row reads "key:value"; split on the first colon only.
    for row in soup.find_all("tr", class_="uw"):
        key, colon, value = row.text.partition(":")
        if not colon:
            # A row without a colon has no key/value pair to record.
            continue
        res[key] = value

    for key in ["Понравилось", "В закладки", "Не понравилось", "Год", "Страниц", "Код", "Тираж"]:
        if key in res:
            res[key] = int(res[key]) if res[key] != "" else 0

    return res

In [8]:
from multiprocessing import Pool, Lock, Value
import pandas as pd

In [9]:
# Shared progress state: an integer counter plus the lock that guards its updates.
mutex = Lock()
n_processed = Value('i', 0)


def wrapper(book_id):
    """Fetch one book via get_book_info and bump the shared progress counter.

    A progress line is printed each time the counter reaches a multiple of 100.

    Args:
        book_id: Identifier forwarded unchanged to get_book_info.

    Returns:
        Whatever get_book_info returned for ``book_id``.
    """
    book = get_book_info(book_id)
    # Increment and report under the lock so the read-modify-write is not interleaved.
    with mutex:
        n_processed.value += 1
        at_milestone = n_processed.value % 100 == 0
        if at_milestone:
            print(f"\r{n_processed.value} objects are processed...", flush=True)
    return book

In [10]:
%%time
# Reset the shared progress counter so re-running this cell starts counting from zero.
n_processed.value = 0

# Only the scraping needs the worker pool; leaving the block releases the workers
# before the results are post-processed.
with Pool(4) as pool:
    res = pool.map(wrapper, books_id)

df = pd.DataFrame(res)
# Pass the path instead of a text-mode handle: pandas asks for file objects to be
# opened with newline='', otherwise line endings can be doubled on Windows.
df.to_csv('book_info.csv', index=False, encoding='utf-8')

100 objects are processed...
200 objects are processed...
300 objects are processed...
400 objects are processed...
500 objects are processed...
600 objects are processed...
700 objects are processed...
800 objects are processed...
900 objects are processed...
1000 objects are processed...
1100 objects are processed...
1200 objects are processed...
1300 objects are processed...
1400 objects are processed...
1500 objects are processed...
1600 objects are processed...
1700 objects are processed...
CPU times: user 210 ms, sys: 114 ms, total: 324 ms
Wall time: 4min 20s
