In [1]:
import functools
import re
import sys
import time
from itertools import chain
from multiprocessing import Pool, Manager, Value, Lock

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError

In [2]:
# Author ids to crawl, one per line of authors.txt. The file is closed
# deterministically and blank lines are skipped so that a trailing newline
# does not turn into an empty author id.
with open('authors.txt', 'r') as authors_file:
    authors_id = [line.strip() for line in authors_file if line.strip()]

In [3]:
# Base URL of the scraped site; the relative paths and hrefs used by the
# scraping functions are appended to it to build absolute URLs.
site_name = 'https://www.respublica.ru'

In [4]:
def wrapper(func):
    """Decorate ``func`` with best-effort error handling and progress output.

    The decorated function never raises an ``Exception``: a failure is
    reported on stderr (together with the function name and its arguments, so
    the failing call can be identified) and ``None`` is returned instead.

    After every call, successful or not, the module-level ``counter`` is
    incremented under the module-level ``lock``; every 50th call prints the
    running total to stdout as a progress marker. Both globals must be
    defined before the decorated function is called.
    """
    @functools.wraps(func)
    def decorated_func(*args, **kwargs):
        try:
            result = func(*args, **kwargs)
        except Exception as e:
            # Errors go to stderr so they do not get mixed into the
            # "50 -> 100 -> ..." progress line written to stdout.
            print(f'{func.__name__}{args} failed: {e!r}', file=sys.stderr)
            result = None
        with lock:
            # Only an attribute of ``counter`` is mutated, so no ``global``
            # declaration is required.
            counter.value += 1
            if counter.value % 50 == 0:
                print(f"{counter.value} -> ", end='', flush=True)
        return result
    return decorated_func

In [5]:
@wrapper
def get_page(url, n_attempts=5, t_sleep=1, **kwargs):
    """Download ``url`` and return the response body as text.

    Parameters
    ----------
    url : str
        Absolute URL to fetch.
    n_attempts : int
        Maximum number of requests to make before giving up.
    t_sleep : float
        Seconds to wait between two consecutive attempts.
    **kwargs
        Forwarded to ``requests.get``. A ``timeout`` of 30 seconds is used
        unless the caller supplies one, so a stalled connection cannot block
        a worker forever.

    Returns
    -------
    str or None
        The page text of the first response with a status code below 400, or
        ``None`` when every attempt failed (a message is printed to stderr).
    """
    kwargs.setdefault('timeout', 30)
    for attempt in range(1, n_attempts + 1):
        try:
            page = requests.get(url, **kwargs)
        except requests.exceptions.RequestException:
            # Connection errors and timeouts are treated like a bad status
            # code: fall through and retry.
            pass
        else:
            if page.status_code < 400:
                return page.text
        # No point in sleeping once the last attempt has been used up.
        if attempt < n_attempts:
            time.sleep(t_sleep)
    print(f'{n_attempts} attempts failed for url {url}', file=sys.stderr)
    return None

In [6]:
def get_books_url(author_id):
    """Collect the product links from every listing page of one author.

    Starts at ``/authors/<author_id>`` and follows the ``pagination-next``
    link until there is none. For each page that contains product links, the
    list of their ``href`` values is appended to the module-level ``books``
    list (one inner list per page). Nothing is returned.

    If a page cannot be downloaded (``get_page`` returned ``None``) the walk
    for this author stops instead of raising, so that one failing author does
    not abort the whole ``pool.map`` call.
    """
    next_page = f'/authors/{author_id}'
    while next_page:
        page = get_page(site_name + next_page)
        if page is None:
            break
        soup = BeautifulSoup(page, 'html.parser')

        # Scan the page once and reuse the result for both the emptiness
        # check and the collected hrefs.
        anchors = soup.find_all('a', class_="rd-listing-product-item__link")
        links = [href for href in (a.get('href') for a in anchors) if href]
        if links:
            books.append(links)

        next_link = soup.find('a', class_='pagination-next')
        next_page = next_link.get('href') if next_link is not None else None

In [7]:
# Shared state used by the worker processes: `lock` and `counter` drive the
# progress output of `wrapper`, `books` receives the per-page link lists
# appended by `get_books_url`.
lock = Lock()
books = Manager().list()
counter = Value('i', 0)

# Pool() rejects a process count below 1, and one process per author does not
# scale with the length of authors.txt, so the worker count is clamped to
# 1..10 (10 matches the pool size used for the book pages).
n_workers = max(1, min(len(authors_id), 10))
with Pool(processes=n_workers) as pool:
    pool.map(get_books_url, authors_id)

50 -> 100 -> 

In [8]:
# Flatten the per-page link lists. A book listed under several of the scraped
# authors shows up once per author, so duplicates are dropped (dict.fromkeys
# keeps the first-seen order) to avoid downloading and storing it twice.
all_books = list(dict.fromkeys(chain.from_iterable(books)))

In [9]:
def _parse_price(text):
    """Return the first integer found in ``text``, ignoring all whitespace.

    Whitespace (including non-breaking spaces used as thousands separators)
    is removed first, so ``'1 500 ₽'`` yields ``1500``. Returns ``None`` when
    the text contains no digits.
    """
    match = re.search(r'\d+', ''.join(text.split()))
    return int(match.group()) if match else None


def process_page(url):
    """Scrape one book page and return its fields as a dictionary.

    Parameters
    ----------
    url : str
        Site-relative path of the book page; ``site_name`` is prepended.

    Returns
    -------
    dict
        Always contains ``'URL'``. Every other field is added only when the
        corresponding element is present on the page, so missing parts of a
        page (or a page that could not be downloaded at all) result in
        missing keys rather than an exception.
    """
    book_dict = {'URL': site_name + url}

    page = get_page(site_name + url)
    if page is None:
        # Download failed: keep the row so the URL is still visible in the
        # resulting table.
        return book_dict
    soup = BeautifulSoup(page, 'html.parser')

    breadcrumbs = soup.find('div', class_='rd-page-product__breadcrumbs')
    if breadcrumbs is not None:
        crumbs = breadcrumbs.find_all('span', class_="rd-page-breadcrumbs-item")
        if crumbs:
            book_dict['Категория'] = '; '.join(c.text.strip(' ') for c in crumbs)

    title = soup.find('h1', class_="rd-page-product__title")
    if title is not None:
        book_dict['Название'] = title.text

    authors = soup.find_all('a', itemprop='brand')
    if authors:
        book_dict['Автор'] = '; '.join(a.text for a in authors)

    sku = soup.find('span', itemprop='sku')
    if sku is not None:
        book_dict['ID'] = sku.text

    preview_link = soup.find('a', class_='download-pdf')
    if preview_link is not None:
        preview_href = preview_link.get('href')
        if preview_href:
            book_dict['Превью'] = site_name + preview_href

    photo = soup.find('div', class_='rd-page-product__photo')
    img = photo.img if photo is not None else None
    if img is not None:
        img_src = img.get('src')
        if img_src:
            book_dict['Изображение'] = site_name + img_src

    # Current and old price share the same markup (a <span> inside a <div>)
    # and the same parsing rules.
    price_fields = (
        ('Цена', 'rd-page-product__price'),
        ('Цена (старая)', 'rd-page-product__price-old'),
    )
    for key, css_class in price_fields:
        price_box = soup.find('div', class_=css_class)
        price_span = price_box.span if price_box is not None else None
        if price_span is not None:
            value = _parse_price(price_span.text)
            if value is not None:
                book_dict[key] = value

    ratings = soup.find('span', itemprop='aggregateRating')
    if ratings is not None:
        rating_fields = (
            ('Оценка', 'ratingValue'),
            ('Число оценок', 'ratingCount'),
            ('Число отзывов', 'reviewCount'),
        )
        for key, prop in rating_fields:
            meta = ratings.find('meta', itemprop=prop)
            if meta is not None and meta.get('content') is not None:
                book_dict[key] = meta['content']

    buttons = soup.find('div', class_='rd-page-product__buttons')
    button_link = buttons.a if buttons is not None else None
    if button_link is not None:
        css_classes = button_link.get('class') or []
        if len(css_classes) > 1:
            # NOTE(review): this is a substring test on the second CSS class,
            # so a class such as "...unavailable" would also count as
            # available - confirm against the site's actual class names.
            book_dict['В наличии'] = 'available' in css_classes[1]

    description = soup.find('div', class_='rd-page-product__desc-body')
    if description is not None:
        book_dict['Описание'] = description.text

    params = soup.find('div', class_='rd-page-product__desc-params')
    if params is not None:
        # Characteristic names and values are paired by position.
        param_names = params.find_all('span', itemprop='name')
        param_values = params.find_all(itemprop='value')
        book_dict.update(zip((n.text for n in param_names),
                             (v.text for v in param_values)))

    return book_dict

In [10]:
# Start the progress counter from zero again for the second crawl stage,
# then fan the book pages out over ten worker processes. The results come
# back in the same order as `all_books`.
counter = Value('i', 0)
print(f'Book pages to process: {len(all_books)}')
with Pool(10) as workers:
    all_dicts = workers.map(process_page, all_books)

Book pages to process: 2461
50 -> 100 -> 150 -> 200 -> 250 -> 300 -> 350 -> 400 -> 450 -> 500 -> 550 -> 600 -> 650 -> 700 -> 750 -> 800 -> 850 -> 900 -> 950 -> 1000 -> 1050 -> 1100 -> 1150 -> 1200 -> 1250 -> 1300 -> 1350 -> 1400 -> 1450 -> 1500 -> 1550 -> 1600 -> 1650 -> 1700 -> 1750 -> 1800 -> 1850 -> 1900 -> 1950 -> 2000 -> 2050 -> 2100 -> 2150 -> 2200 -> 2250 -> 2300 -> 2350 -> 2400 -> 2450 -> 

In [11]:
# One row per scraped book. IDs are stored as scraped text, so the sort is
# lexicographic. The sort is skipped when no page produced an 'ID' field
# (e.g. an empty scrape), which would otherwise raise KeyError.
df = pd.DataFrame(all_dicts)
if 'ID' in df.columns:
    df = df.sort_values(by=['ID'])

In [12]:
# Let pandas open the file itself: a manually opened text handle would need
# newline='' to avoid doubled line endings on platforms that translate '\n'.
df.to_csv('hw_3.csv', index=False, encoding='utf-8')