In [16]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from multiprocessing.dummy import Pool as ThreadPool
import re

In [9]:
# Author display name -> numeric author id; the ids are what gets substituted
# into the moscowbooks.ru catalog URL when the books are collected.
target = {
    "Достоевский Ф. М.": 9150,
    "Роллинс Дж.": 59396,
    "Фицджеральд Ф. С.": 28727,
    "Глуховский Д. А.": 53427,
    "Стругацкий А. Н.": 26268,
    "Лукьяненко С. В.": 16626,
    "Фрай М.": 28927,
    "Хантер Э.": 37969,
    "Роулинг Дж. К.": 104832,
}

In [10]:
def get_author_books(author_id, max_retries=5, timeout=30):
    """Collect the product ids of all books listed on an author's catalog pages.

    Pages are requested one after another (``page=1, 2, ...``) until a page
    contains no book entries.

    Args:
        author_id: Author id substituted into the catalog URL.
        max_retries: How many times each page is requested before giving up.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        list: ``data-productid`` attribute values (strings) in page order.
        If a page cannot be downloaded, a message is printed and the ids
        gathered from the preceding pages are returned (possibly an empty
        list), so callers can always concatenate the result.
    """
    url = 'https://www.moscowbooks.ru/catalog/author/{}'.format(author_id)
    books = []
    pg_num = 1
    while True:
        r = None
        for _ in range(max_retries):
            try:
                r = requests.get(url, params={'page': str(pg_num)}, timeout=timeout)
            except requests.RequestException:
                # Network-level failure: spend one retry on it instead of crashing.
                continue
            if r.status_code == 200:
                break

        if r is None or r.status_code != 200:
            status = r.status_code if r is not None else None
            print('cant download author {0}, status_code={1}'.format(author_id, status))
            # Return a list (never None) so `all_books += ...` keeps working.
            return books

        soup = BeautifulSoup(r.text, 'lxml')
        tags = soup.find_all('div', class_='book-preview__fav fav js-fav')
        res = [tag['data-productid'] for tag in tags]

        # A page without book entries marks the end of the listing.
        if not res:
            break
        books += res
        pg_num += 1

    return books



In [11]:
def extract_book_info(book_id, max_retries=5, timeout=30):
    """Download one book page and return its attributes as a dict.

    Args:
        book_id: Product id substituted into the book URL.
        max_retries: How many times the page is requested before giving up.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        dict with the fixed keys 'Код товара', 'Автор', 'Название', 'Наличие',
        'Цена', 'Рейтинг', 'Обложка', 'Описание', optionally 'Стикеры', plus
        one key per entry of the page's details list. Returns None (after
        printing a message) when the page could not be downloaded.
    """
    url = 'https://www.moscowbooks.ru/book/{}'.format(book_id)

    r = None
    for _ in range(max_retries):
        try:
            r = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # Network-level failure: spend one retry on it instead of crashing.
            continue
        if r.status_code == 200:
            break

    if r is None or r.status_code != 200:
        status = r.status_code if r is not None else None
        print('cant download book {0}, status_code={1}'.format(book_id, status))
        return

    return _parse_book_page(BeautifulSoup(r.text, 'lxml'), book_id)


def _parse_book_page(soup, book_id):
    """Build the info dict for one book from its parsed page.

    Any element that is absent from the page yields None (or an empty author
    string / False availability) instead of raising, so a single unusual page
    does not abort a whole batch of downloads.
    """
    info = {'Код товара': book_id}

    # Authors: text of every author link in the page header, comma-separated.
    author_box = soup.find('div', class_='page-header__author')
    author_links = author_box.find_all('a', class_='author-name') if author_box is not None else []
    info['Автор'] = ', '.join(link.text.strip() for link in author_links)

    title_meta = soup.find('meta', property="og:title")
    title = title_meta.get('content') if title_meta is not None else None
    info['Название'] = title.strip() if title is not None else None

    # True only when the shop-details block contains an 'instock1' span.
    shop_box = soup.find('div', class_='book__shop-details')
    info['Наличие'] = shop_box is not None and shop_box.find('span', class_='instock1') is not None

    # Price: keep only the digits of the price block's text.
    price_box = soup.find('div', class_='book__price')
    info['Цена'] = re.sub(r'\D', '', price_box.text) if price_box is not None else None

    # Rating lives in social block -> direct-child rating block -> stars element.
    rating_stars = None
    social_box = soup.find('div', class_='book__social')
    if social_box is not None:
        rating_box = social_box.find('div', class_='book__rating', recursive=False)
        if rating_box is not None:
            rating_stars = rating_box.find('div', class_='book___rating-stars rating-stars rating-stars_lg')
    info['Рейтинг'] = rating_stars.get('data-rate') if rating_stars is not None else None

    cover_img = soup.find('img', class_="book__img book__img_default gallery__img")
    cover_src = cover_img.get('src') if cover_img is not None else None
    info['Обложка'] = "https://www.moscowbooks.ru" + cover_src if cover_src is not None else None

    # Stickers are optional: the key is only set when at least one label exists.
    sticker_box = soup.find('div', class_='book__stickers stickers stickers_lg')
    if sticker_box is not None:
        labels = sticker_box.find_all('div', class_=re.compile(r'label\w*'))
        if labels:
            info['Стикеры'] = ', '.join(label.text for label in labels)

    # Description: drop the first <b> and the first <a> inside the block
    # before taking its text.
    description = soup.find('div', class_='book__description collapsed js-book-description')
    if description is not None:
        for child_name in ('b', 'a'):
            child = description.find(child_name)
            if child is not None:
                child.extract()
        info['Описание'] = description.text.strip()
    else:
        info['Описание'] = None

    # Details list: one key per entry, with colons removed from the name.
    for item in soup.find_all('dl', class_='book__details-item'):
        name_tag = item.find('dt', class_='book__details-name')
        value_tag = item.find('dt', class_='book__details-value')
        if value_tag is None:
            # NOTE(review): <dl> values are normally <dd> elements; accept any
            # tag carrying the value class when no <dt> matches.
            value_tag = item.find(class_='book__details-value')
        if name_tag is None or value_tag is None:
            continue
        info[name_tag.text.strip().replace(':', '')] = value_tag.text.strip()

    return info

In [13]:
# Gather the book ids of every target author into one flat list.
# `or []` keeps the loop going when a lookup yields nothing (None or an
# empty list) instead of letting `+=` raise TypeError on None.
all_books = []
for author_id in target.values():
    all_books += get_author_books(author_id) or []

In [14]:
# Total number of book ids gathered for all target authors.
len(all_books)

237

In [17]:
%%time

# Fetch and parse every book page with 10 worker threads (ThreadPool is
# multiprocessing.dummy.Pool, i.e. thread-based rather than process-based).
# `res` gets one entry per id in all_books; an entry is None when
# extract_book_info could not download that book.
with ThreadPool(10) as pool:
    res = pool.map(extract_book_info, all_books)
# Wait for the worker threads to exit after the `with` block has shut the pool down.
pool.join()   

Wall time: 3min 15s


In [19]:
# extract_book_info returns None for books it could not download; drop those
# entries so DataFrame construction only sees dict records.
# NOTE(review): 'Код товара' holds the ids as collected from the catalog pages
# (attribute strings), so this sort is presumably lexicographic — confirm that
# is the intended order.
df = pd.DataFrame([info for info in res if info is not None]).sort_values(by=['Код товара'])

In [21]:
# Hand the path to pandas instead of a pre-opened text handle: pandas then
# opens the file itself with the newline handling the CSV writer expects
# (a handle opened without newline='' can produce doubled line endings on
# Windows).
df.to_csv('hw_3.csv', index=False, encoding='utf-8')