# Парсинг с помощью BeautifulSoup

Beautiful Soup — это библиотека для Python, которая позволяет парсить (анализировать) HTML- и XML-документы. Она предоставляет удобный способ искать, навигировать и модифицировать дерево DOM (Document Object Model), представляющее HTML/XML-документ.

# Задание

Вам необходимо собрать датасет, спарсив данные из этого сайта:

https://books.toscrape.com/

Всего на сайте 1000 книг, то есть итоговый датасет должен содержать ровно 1000 строк — по одной на каждую книгу.
 
Итоговая таблица должна содержать следующие столбцы:

| Название столбца | Описание | 
|--|--|
|id| Идентификатор книги |
|book_name| Название книги |
|price| Цена в £ |
|stock| Наличие книги. 1 или 0|
|url| Ссылка на книгу |

**Примечание по столбцам:**
- `id` - заполняется разработчиком датасета. Первая спарсенная книга имеет `id` = `0`.
- `url` - должен содержать полную ссылку, а не только её окончание, указанное на сайте. То есть по данному url можно перейти одним кликом.

## Импорт библиотек

In [1]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Создание датасета и парсинг данных

In [2]:
# Root address of the site to scrape; the trailing slash is kept because paths are appended to it.
URL = "https://books.toscrape.com/"

In [3]:
class Parser:

    def __init__(self, URL):
        self.URL = URL

    def _create_dataframe(self) -> pd.DataFrame:
        return pd.DataFrame(
            {
                "id": [],
                "book_name": [],
                "price": [],
                "stock": [],
                "url": [],
            }
        )

    def _get_list_books_from_page(self, page_num: str):
        url = f"{self.URL}catalogue/page-{page_num}.html"
        response = requests.get(url, timeout=10)  # , verify=False)
        soup = BeautifulSoup(response.text, features="html.parser")
        all_items = soup.find_all("article", {"class": "product_pod"})
        return all_items if len(all_items) else None

    def _get_links(self, page_number) -> list:
        links = []
        items = self._get_list_books_from_page(page_number)
        if items is None:
            return None
        for item in items:
            links.append(item.find("a").get("href"))
        return links

    def _get_books_from_page(self, page_number: str) -> pd.DataFrame | None:
        links = self._get_links(page_number)
        if links is None:
            return None
        df = self._create_dataframe()

        for link in links:
            book = {}
            response = requests.get(f"{self.URL}catalogue/{link}", timeout=10)
            soup_book = BeautifulSoup(response.text, features="html.parser")
            book["id"] = (int(page_number) - 1) * 20 + len(df)
            book["url"] = f"{self.URL}{link}"
            book["book_name"] = soup_book.find("h1").text
            book["price"] = (soup_book.find("p", {"class": "price_color"}).text)[2:]
            book["stock"] = (
                soup_book.find("p", {"class": "instock availability"})
                .text.strip()
                .split("(")[1]
                .split()[0]
            )
            df.loc[len(df)] = book
        return df

    def get_all_books(self):
        buf = []
        page_number = 1
        while True:
            data = self._get_books_from_page(str(page_number))
            if data is None:
                break
            buf.append(data)
            page_number += 1
        full_data = pd.concat(buf, ignore_index=False)
        return full_data

In [4]:
# Scrape the whole catalogue: every listing page and every individual book
# page is downloaded, so this cell takes a while to run.
parser = Parser(URL)
df = parser.get_all_books()

## Итоговый датасет

In [5]:
# (rows, columns) of the scraped dataset; the task expects one row per book.
df.shape

(1000, 5)

In [6]:
# Preview the beginning and the end of the dataset.
display(
    df.head(),
    df.tail()
)

Unnamed: 0,id,book_name,price,stock,url
0,0,A Light in the Attic,51.77,22,https://books.toscrape.com/a-light-in-the-atti...
1,1,Tipping the Velvet,53.74,20,https://books.toscrape.com/tipping-the-velvet_...
2,2,Soumission,50.1,20,https://books.toscrape.com/soumission_998/inde...
3,3,Sharp Objects,47.82,20,https://books.toscrape.com/sharp-objects_997/i...
4,4,Sapiens: A Brief History of Humankind,54.23,20,https://books.toscrape.com/sapiens-a-brief-his...


Unnamed: 0,id,book_name,price,stock,url
15,995,Alice in Wonderland (Alice's Adventures in Won...,55.53,1,https://books.toscrape.com/alice-in-wonderland...
16,996,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",57.06,1,https://books.toscrape.com/ajin-demi-human-vol...
17,997,A Spy's Devotion (The Regency Spies of London #1),16.97,1,https://books.toscrape.com/a-spys-devotion-the...
18,998,1st to Die (Women's Murder Club #1),53.98,1,https://books.toscrape.com/1st-to-die-womens-m...
19,999,"1,000 Places to See Before You Die",26.08,1,https://books.toscrape.com/1000-places-to-see-...


# ЗАДАНИЕ ПРО

Итак, мы спарсили данные о книгах, но они неполные: часть названия обрезается при отображении, и в датасете нет ни полного названия книги, ни её описания, ни жанра.

Вам необходимо дополнить датасет, спарсив дополнительные данные из того же сайта:

https://books.toscrape.com/
 
Итоговая таблица должна содержать следующие столбцы:

| Название столбца | Описание | 
|--|--|
|id| Идентификатор книги |
|book_name| Название книги - только полное название|
|genre| Жанр книги |
|desc| Описание книги |
|price| Цена в £ |
|stock| Наличие книги. 1 или 0|
|url| Ссылка на книгу |
| num_of_rev | Количество отзывов |

## Парсинг данных и обогащение датасета

In [7]:
class Parser:

    def __init__(self, URL):
        self.URL = URL

    def _create_dataframe(self) -> pd.DataFrame:
        return pd.DataFrame(
            {
                "id": [],
                "book_name": [],
                "price": [],
                "stock": [],
                "url": [],
                "num_of_rev": [],
                "genre": [],
                "desc": [],
            }
        )

    def _get_list_books_from_page(self, page_num: str):
        url = f"{self.URL}catalogue/page-{page_num}.html"
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, features="html.parser")
        all_items = soup.find_all("article", {"class": "product_pod"})
        return all_items if len(all_items) else None

    def _get_links(self, page_number) -> list:
        links = []
        items = self._get_list_books_from_page(page_number)
        if items is None:
            return None
        for item in items:
            links.append(item.find("a").get("href"))
        return links

    def _get_books_from_page(self, page_number: str) -> pd.DataFrame | None:
        links = self._get_links(page_number)
        if links is None:
            return None
        df = self._create_dataframe()

        for link in links:
            book = {}
            response = requests.get(
                f"{self.URL}catalogue/{link}", timeout=10)
            soup_book = BeautifulSoup(response.text, features="html.parser")
            table = soup_book.find("table").find_all("td")
            book["id"] = table[0].text
            book["url"] = f"{self.URL}{link}"
            book["book_name"] = soup_book.find("h1").text
            book["price"] = table[3].text[2:]
            book["stock"] = 1 if int(table[5].text.split("(")[1].split()[0]) > 0 else 0
            book["num_of_rev"] = table[6].text
            book["genre"] = soup_book.find("ul").find_all("li")[2].text.strip()
            book["desc"] = (
                soup_book.find("article", {"class": "product_page"})
                .find_all("p")[3]
                .text.strip()
            )
            df.loc[len(df)] = book
        return df

    def get_all_books(self):
        buf = []
        page_number = 1
        while True:
            data = self._get_books_from_page(str(page_number))
            if data is None:
                break
            buf.append(data)
            page_number += 1
        full_data = pd.concat(buf, ignore_index=False)
        return full_data

In [8]:
# Re-run the scrape with the extended Parser defined in the previous cell;
# `df` is overwritten with the enriched dataset.
parser = Parser(URL)
df = parser.get_all_books()

## Итоговый датасет PRO

In [9]:
# (rows, columns) of the enriched dataset; eight columns are expected.
df.shape

(1000, 8)

In [10]:
# Preview the beginning and the end of the enriched dataset.
display(
    df.head(),
    df.tail()
)

Unnamed: 0,id,book_name,price,stock,url,num_of_rev,genre,desc
0,a897fe39b1053632,A Light in the Attic,51.77,1,https://books.toscrape.com/a-light-in-the-atti...,0,Poetry,It's hard to imagine a world without A Light i...
1,90fa61229261140a,Tipping the Velvet,53.74,1,https://books.toscrape.com/tipping-the-velvet_...,0,Historical Fiction,"""Erotic and absorbing...Written with starling ..."
2,6957f44c3847a760,Soumission,50.1,1,https://books.toscrape.com/soumission_998/inde...,0,Fiction,"Dans une France assez proche de la nÃ´tre, un ..."
3,e00eb4fd7b871a48,Sharp Objects,47.82,1,https://books.toscrape.com/sharp-objects_997/i...,0,Mystery,"WICKED above her hipbone, GIRL across her hear..."
4,4165285e1663650f,Sapiens: A Brief History of Humankind,54.23,1,https://books.toscrape.com/sapiens-a-brief-his...,0,History,From a renowned historian comes a groundbreaki...


Unnamed: 0,id,book_name,price,stock,url,num_of_rev,genre,desc
15,cd2a2a70dd5d176d,Alice in Wonderland (Alice's Adventures in Won...,55.53,1,https://books.toscrape.com/alice-in-wonderland...,0,Classics,
16,bfd5e1701c862ac3,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",57.06,1,https://books.toscrape.com/ajin-demi-human-vol...,0,Sequential Art,High school student Kei Nagai is struck dead i...
17,19fec36a1dfb4c16,A Spy's Devotion (The Regency Spies of London #1),16.97,1,https://books.toscrape.com/a-spys-devotion-the...,0,Historical Fiction,"In Englandâs Regency era, manners and elegan..."
18,f684a82adc49f011,1st to Die (Women's Murder Club #1),53.98,1,https://books.toscrape.com/1st-to-die-womens-m...,0,Mystery,"James Patterson, bestselling author of the Ale..."
19,228ba5e7577e1d49,"1,000 Places to See Before You Die",26.08,1,https://books.toscrape.com/1000-places-to-see-...,0,Travel,"Around the World, continent by continent, here..."
