In [65]:
import re
import pandas as pd
import requests
import json
from multiprocessing import Pool
from bs4 import BeautifulSoup
from selenium import webdriver
import html
from time import sleep
from operator import setitem

from collections import defaultdict

In [66]:
# Author display name -> numeric author ID; the ID is substituted into
# author_url_format by get_author_cards.
authors_id = {
    "Достоевский Ф. М.": 9150,
    "Роллинс Дж.": 59396,
    "Фицджеральд Ф. С.": 28727,
    "Глуховский Д. А.": 53427,
    "Стругацкий А. Н.": 26268,
    "Лукьяненко С. В.": 16626,
    "Фрай М.": 28927,
    "Хантер Э.": 37969,
    "Роулинг Дж. К.": 104832,
}

# URL templates; "{}" is filled with an author ID / a book (product) ID.
author_url_format = "https://www.moscowbooks.ru/catalog/author/{}"
book_url_format = "https://www.moscowbooks.ru/book/{}"

In [67]:
def get_author_cards(author_name, max_retries=5, timeout=30):
    """Collect the product IDs of all books listed on an author's catalogue pages.

    Pages are requested one after another (``page=1, 2, ...``) until a page
    contains no ``a.book-preview__buy-button`` elements.

    Parameters
    ----------
    author_name : str
        Key of ``authors_id``; selects the author ID put into the URL.
    max_retries : int, optional
        How many times a single page is requested before giving up.
    timeout : float, optional
        Per-request timeout in seconds passed to ``requests.get``.

    Returns
    -------
    list of str
        The ``data-productid`` attribute of every buy button found, in page
        order.

    Raises
    ------
    KeyError
        If ``author_name`` is not in ``authors_id``.
    RuntimeError
        If a page could not be fetched with status 200 in ``max_retries``
        attempts.
    """
    author_url = author_url_format.format(authors_id[author_name])
    author_cards_total = []
    subpage_count = 1
    while True:
        page_html = None
        for _ in range(max_retries):
            try:
                response = requests.get(author_url,
                                        params={'page': subpage_count},
                                        timeout=timeout)
            except requests.RequestException:
                # Network-level failure: count it as a failed attempt and retry.
                continue
            if response.status_code == 200:
                page_html = response.text
                break

        if page_html is None:
            # Previously the Response object itself was handed to
            # BeautifulSoup here; fail with an explicit message instead.
            raise RuntimeError(
                f"Could not load page {subpage_count} of {author_url} "
                f"after {max_retries} attempts"
            )

        soup = BeautifulSoup(page_html, 'lxml')
        author_cards = [
            item.attrs['data-productid']
            for item in soup.find_all('a', class_='book-preview__buy-button')
        ]
        if not author_cards:
            # A page without buy buttons is treated as the end of the listing.
            break
        author_cards_total += author_cards
        subpage_count += 1
    return author_cards_total

In [75]:
# Flat list of product IDs gathered across every author in authors_id.
books = []
for author in authors_id.keys():
    books.extend(get_author_cards(author))

In [76]:
# Book-detail labels, grouped by the type suggested by the list names
# (presumably integer-valued vs. free text; not referenced in the visible cells).
pull_attributes_int = ["Год", "Страниц", "Код", "Тираж"]

pull_attributes_str = [
    "Серия", "Издательство", "Переплёт", "ISBN",
    "Размеры", "Формат", "В базе", "Автор",
    "Тематика", "Переводчик", "Производитель",
]

In [77]:
def get_book_info(uid):
    """Download one book page and return its attributes as a flat dict.

    Parameters
    ----------
    uid
        Product ID substituted into ``book_url_format``.

    Returns
    -------
    dict
        Always contains "ID", "Автор", "Название", "Обложка", "Рейтинг",
        "Стикеры" (";"-joined), "Наличие" (bool), "Цена" (float, or '' when
        no price element is found) and "Описание" ('' when absent). One
        extra key is added per ``dl.book__details-item`` on the page; such a
        key overwrites a fixed key of the same name.

    Raises
    ------
    AttributeError, KeyError, ValueError
        If the author link, title, cover image or rating element is missing
        or malformed; these are treated as mandatory, as before.
    """
    book_url = book_url_format.format(uid)
    book_html = requests.get(book_url, timeout=30).text
    soup = BeautifulSoup(book_html, 'html.parser')

    price = soup.find('div', class_='Hu Wu')
    desc = soup.find('div', class_='lw')

    # Stickers: prefer the icon's hover title, fall back to the visible text.
    stickers = ''
    stickers_container = soup.find('div', class_='book__stickers')
    if stickers_container is not None:
        sticker_list = []
        for sticker in stickers_container.find_all('div', class_='stickers__item'):
            onhover_sticker = sticker.find('span', class_='stickers__icon')
            if onhover_sticker is not None:
                sticker_list.append(onhover_sticker.attrs['title'].strip('\n '))
            else:
                sticker_list.append(sticker.text.strip('\n '))
        stickers = ";".join(sticker_list)

    availability = soup.find('span', class_='instock1') is not None

    # The title used to be cleaned with a global `author_name` that no cell
    # defines; use the author shown on this very page instead.
    author = soup.find('a', class_='author-name').text
    title = soup.find('h1', class_='page-header__title').text
    author_in_title = author.strip()
    if author_in_title:
        title = title.replace(author_in_title, "")
    title = title.strip("\r\n ")

    if price is not None:
        # Drop all whitespace, then cut the 4 trailing characters
        # (presumably the currency suffix - confirm against the page).
        price_value = float(re.sub(r'\s+', '', str(price.contents[-1])).strip()[:-4])
    else:
        price_value = ''

    res = {
        "ID": uid,
        "Автор": author,
        "Название": title,
        "Обложка": soup.find('img', class_='book__img').attrs['src'],
        "Рейтинг": int(soup.find('div', class_='book___rating-stars').attrs['data-rate']),
        "Стикеры": stickers,
        "Наличие": availability,
        "Цена": price_value,
        "Описание": re.sub(r'\s+', ' ', desc.text) if desc is not None else '',
    }

    # NOTE(review): the value is looked up as a <dt> as well (not <dd>);
    # left unchanged - confirm against the actual page markup.
    for item in soup.find_all('dl', class_='book__details-item'):
        key = item.find('dt', class_='book__details-name').\
            text.replace("\r", "").replace("\n", "").strip(': ')
        value = item.find('dt', class_='book__details-value').\
            text.replace("\r", "").replace("\n", "").strip(' ')
        res[key] = value
    return res

In [78]:
from multiprocessing import Pool, Lock, Value
from time import sleep

# Progress state used by func_wrapper: an integer counter ('i' typecode,
# starting at 0) and the lock that guards updating and reporting it.
n_processed = Value('i', 0)
mutex = Lock()

def func(uid):
    """Per-item job for the worker pool: scrape the book with the given ID."""
    book_info = get_book_info(uid)
    return book_info

def func_wrapper(uid):
    """Run ``func`` for one ID and bump the shared progress counter.

    Every 10th processed item a progress line is printed, using a carriage
    return so the line is rewritten in place. Returns whatever ``func``
    returns.
    """
    book_info = func(uid)
    # Objects shared between the processes can be changed safely inside
    # this locked section.
    with mutex:
        n_processed.value += 1
        is_report_step = not n_processed.value % 10
        if is_report_step:
            print(f"\r{n_processed.value} objects are processed...", end='', flush=True)
    return book_info

# Scrape all collected book pages with 10 worker processes; pool.map
# returns the per-book dicts in the same order as `books`.
with Pool(processes=10) as pool:
    res = pool.map(func_wrapper, books)

# Build the table and order it by ID without mutating in place.
# NOTE(review): IDs come from an HTML attribute, so they are presumably
# strings and this sort is lexicographic - confirm if numeric order is wanted.
df = pd.DataFrame(res).sort_values(by=['ID'])

# Hand the path to pandas instead of a text-mode file object: a handle
# opened without newline='' makes to_csv emit doubled line endings on Windows.
df.to_csv('hw_3.csv', index=False, encoding='utf-8')

240 objects are processed...