In [1]:
from bs4 import BeautifulSoup
import requests
from multiprocessing import Pool, Manager, Value, Lock
from requests.exceptions import ConnectionError
import pandas as pd

In [2]:
# Read the author identifiers to scrape: one id per line of authors.txt.
# Surrounding whitespace is stripped and blank lines are skipped, so an empty
# line in the file cannot turn into a request for an empty author id.
with open('authors.txt') as authors_file:
    authors_id = [line.strip() for line in authors_file if line.strip()]

In [3]:
# Scheme and host of the scraped site, without a trailing slash: every path
# appended to it below (author pages, product hrefs) starts with '/'.
site_name = "https://www.respublica.ru"

In [4]:
def get_books_url(author_id):
    """Collect the product links from every listing page of one author.

    Starts at ``/authors/<author_id>`` and keeps following the href of the
    ``pagination-next`` anchor until a page has none. The product hrefs found
    on a page are appended, as one list per page, to the module-level
    ``books`` list.

    When the last page has been handled the module-level ``counter`` is
    incremented and, on every 10th finished author, a progress line is
    printed.

    Args:
        author_id: Author identifier, inserted verbatim into the URL path.

    Returns:
        None. The links are delivered through ``books``.

    Raises:
        requests.HTTPError: If a listing page answers with a 4xx/5xx status.
    """
    next_page = f'/authors/{author_id}'
    while next_page:
        page = requests.get(site_name + next_page)
        # Explicit status check; unlike ``assert`` it is not removed by -O.
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')

        links = soup.find_all('a', class_="rd-listing-product-item__link")
        if links:
            books.append([link['href'] for link in links])

        next_page_region = soup.find('a', class_='pagination-next')
        next_page = next_page_region['href'] if next_page_region is not None else None

    # Use the lock that belongs to the shared counter. A freshly created
    # ``Lock()`` is private to this call and would not serialise anything.
    with counter.get_lock():
        counter.value += 1
        if counter.value % 10 == 0:
            print(f"{round(counter.value / len(authors_id) * 100,2)}% done\n", end='', flush=True)

In [5]:
# Shared state that get_books_url uses as module-level globals:
#   books   - manager-backed list; each call appends one list of hrefs per page
#   counter - shared integer counting finished authors, for the progress output
books = Manager().list()
counter = Value('i', 0)
# Bound the pool size: one process per author grows without limit as the
# author list grows, and Pool rejects a size of 0 when the list is empty.
with Pool(processes=max(1, min(10, len(authors_id)))) as pool:
    pool.map(get_books_url, authors_id)

28.57% done
57.14% done
85.71% done


In [6]:
# Flatten the per-page lists gathered in `books` into a single list of hrefs,
# keeping the order in which they were collected.
# NOTE(review): no de-duplication happens here, so an href that was collected
# more than once is kept more than once.
all_books = [href for page_links in books for href in page_links]

In [7]:
def process_page(url):
    """Scrape one product page into a flat dictionary.

    Downloads ``site_name + url``, parses it and copies the fields that are
    present on the page into a dict. A field whose markup is missing or
    incomplete is left out of the dict rather than raising, so one atypical
    page cannot abort a whole ``pool.map`` run.

    Keys that may be set: 'URL' (always), 'Категория', 'Название', 'Автор',
    'ID', 'Превью', 'Изображение', 'Цена', 'Цена (старая)', 'Число отзывов',
    'Число оценок', 'Оценка', 'В наличии', 'Описание', plus one key per
    name/value pair found in the product parameters block.

    After the page is processed the module-level ``counter`` is incremented
    and a progress marker is printed on every 200th page.

    Args:
        url: Site-relative path of the product page.

    Returns:
        dict mapping field names to the scraped strings.

    Raises:
        requests.HTTPError: If the page answers with a 4xx/5xx status.
    """
    page = requests.get(site_name + url)
    # Explicit status check; unlike ``assert`` it is not removed by -O.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    book_dict = {'URL': site_name + url}

    breadcrumbs = soup.find('div', class_='rd-page-product__breadcrumbs')
    if breadcrumbs is not None:
        categories = breadcrumbs.find_all('span', class_="rd-page-breadcrumbs-item")
        if categories:
            book_dict['Категория'] = '; '.join(item.text.strip(' ') for item in categories)

    title = soup.find('h1', class_="rd-page-product__title")
    if title is not None:
        book_dict['Название'] = title.text

    brands = soup.find_all('a', itemprop='brand')
    if brands:
        book_dict['Автор'] = '; '.join(brand.text for brand in brands)

    sku = soup.find('span', itemprop='sku')
    if sku is not None:
        book_dict['ID'] = sku.text

    preview_link = soup.find('a', class_='download-pdf')
    if preview_link is not None and preview_link.get('href'):
        book_dict['Превью'] = site_name + preview_link['href']

    photo = soup.find('div', class_='rd-page-product__photo')
    image = photo.img if photo is not None else None
    if image is not None and image.get('src'):
        book_dict['Изображение'] = site_name + image['src']

    price_box = soup.find('div', class_='rd-page-product__price')
    if price_box is not None and price_box.span is not None:
        book_dict['Цена'] = price_box.span.text

    old_price = soup.find('div', class_='rd-page-product__price-old')
    if old_price is not None and old_price.span is not None:
        # Only the first whitespace-separated token of the old price is kept.
        old_price_parts = old_price.span.text.split()
        if old_price_parts:
            book_dict['Цена (старая)'] = old_price_parts[0]

    rating = soup.find('span', itemprop='aggregateRating')
    if rating is not None:
        rating_fields = (
            ('Число отзывов', 'reviewCount'),
            ('Число оценок', 'ratingCount'),
            ('Оценка', 'ratingValue'),
        )
        for key, prop in rating_fields:
            meta = rating.find('meta', itemprop=prop)
            if meta is not None and meta.get('content') is not None:
                book_dict[key] = meta['content']

    # Availability is read from the second CSS class of the first link inside
    # the buttons block. If that link or class is absent the key is left out.
    # NOTE(review): this is a substring test, so a class such as
    # 'unavailable' would also count as available - confirm against the
    # site's markup.
    buttons = soup.find('div', class_='rd-page-product__buttons')
    button = buttons.a if buttons is not None else None
    button_classes = button.get('class', []) if button is not None else []
    if len(button_classes) > 1:
        book_dict['В наличии'] = 'True' if 'available' in button_classes[1] else 'False'

    description = soup.find('div', class_='rd-page-product__desc-body')
    if description is not None:
        book_dict['Описание'] = description.text

    params = soup.find('div', class_='rd-page-product__desc-params')
    if params is not None:
        param_names = params.find_all('span', itemprop='name')
        param_values = params.find_all(itemprop='value')
        # Names and values are paired by position in the document.
        for param_name, param_value in zip(param_names, param_values):
            book_dict[param_name.text] = param_value.text

    # Use the lock that belongs to the shared counter. A freshly created
    # ``Lock()`` is private to this call and would not serialise anything.
    with counter.get_lock():
        counter.value += 1
        if counter.value % 200 == 0:
            print(f"{round(counter.value / len(all_books) * 100)}% -> ", end='', flush=True)
    return book_dict

In [8]:
# Start the progress counter from zero again for the product-page pass, then
# scrape every collected product page with a pool of 10 processes.
# `all_dicts` holds one result dict per entry of `all_books`, in the same order.
counter = Value('i', 0)
with Pool(10) as workers:
    all_dicts = workers.map(process_page, all_books)

8% -> 16% -> 24% -> 33% -> 41% -> 49% -> 57% -> 65% -> 73% -> 81% -> 89% -> 98% -> 

In [9]:
# Turn the scraped dicts into a table (one row per page, one column per key
# seen in any dict) and order the rows by the 'ID' column.
df = pd.DataFrame(all_dicts).sort_values(by=['ID'])

In [10]:
# Write the table to hw_3.csv as UTF-8, without the index column.
# The path is passed to pandas instead of an already opened handle: a handle
# opened without newline='' lets the platform translate line endings a second
# time, which yields blank rows between records on Windows.
df.to_csv('hw_3.csv', index=False, encoding='utf-8')