#### Import Libs

In [57]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import re

# Constants
from decouple import config
# Each setting is looked up by name through decouple's `config`.
# NOTE(review): the two *_PATH values are not referenced in the visible part
# of this file - confirm they are used elsewhere.
PROCESSED_DATA_PATH     = config('PROCESSED_DATA_PATH')
RAW_PROCESSED_DATA_PATH = config('RAW_PROCESSED_DATA_PATH')
# Presumably the base URL handed to the scraping functions below, whose first
# parameter carries the same name. The key keeps its original spelling
# ("AQUISITION") because it must match the configured setting name.
URL_TO_AQUISITION_DATA  = config('URL_TO_AQUISITION_DATA')

# Display options: passing None as 'display.max_rows' asks pandas not to
# truncate the rows it prints.
pd.set_option('display.max_rows',None)

#### Methods

In [2]:
def get_hrefs(URL_TO_AQUISITION_DATA:str)->list :

    '''
        Build a list with the hrefs found in the page's 'nav nav-list' menu.

        Parameters : 
            URL_TO_AQUISITION_DATA : A string with the URL of the page to read

        Example : 
            get_hrefs(URL_TO_AQUISITION_DATA)
        Returns : 
            A list of href strings, without the first link found in the menu, e.g.
            ['catalogue/category/books/travel_2/index.html',
            'catalogue/category/books/mystery_3/index.html' ...]
            An empty list when the page has no 'nav nav-list' element.
        Raises : 
            requests.HTTPError : when the server answers with an error status
    '''

    response = requests.get(URL_TO_AQUISITION_DATA)
    # raise_for_status() returns None on success and raises on an error
    # status, so it is called for its effect instead of being tested in an if.
    response.raise_for_status()

    # Naming the parser keeps the parse tree the same on every machine.
    soup = bs(response.content, 'html.parser')
    nav = soup.find('ul', class_='nav nav-list')
    if nav is None :
        return []

    list_hrefs = []
    for item in nav.find_all('li') :
        anchor = item.find('a', href=True)
        # An <li> without a link has nothing to contribute
        if anchor is not None :
            list_hrefs.append(anchor['href'])

    # The first link is not a specific category (the original note describes
    # it as the home page), so it is left out.
    return list_hrefs[1:]

    

In [3]:
def get_features_books(URL_TO_AQUISITION_DATA:str, hrefs:list=None)->None :
    '''
        Request every category page and stop at the first HTTP error.

        Draft version: another function with the same name is defined further
        down in this file and replaces this one once both have been executed.

        Parameters : 
            URL_TO_AQUISITION_DATA : A string with the base URL
            hrefs : A list with the relative links to request; None (the
                    default) is treated as an empty list

        Returns : 
            None
        Raises : 
            requests.HTTPError : when a page answers with an error status
    '''
    for genre in hrefs or [] :
        # Each category has its own page: base URL + relative link
        response = requests.get(URL_TO_AQUISITION_DATA+genre)
        # Raises on an error status; returns None otherwise
        response.raise_for_status()
        

In [4]:
def get_books_genre (soup) : 
    '''
        Read the genre name from the page title.

        Parameters : 
            soup : Parsed page exposing find('title')

        Returns : 
            The title text that comes before the first "|", without
            surrounding whitespace.
    '''
    title_text = soup.find('title').text
    before_bar, _, _ = title_text.partition("|")
    return before_bar.strip()

In [8]:
def get_more_pages (soup) :
    '''
        Look for the link to the next page of a listing.

        Parameters : 
            soup : Parsed page exposing find()

        Returns : 
            The href of the link inside the element with class 'next', or
            None when the page has no such element or it holds no link.
    '''
    next_item = soup.find(class_='next')
    # `type(x) != None` is always true; the identity test is what detects a
    # page without a "next" element.
    if next_item is None :
        return None

    anchor = next_item.find('a',href=True)
    if anchor is None :
        return None
    return anchor['href']

In [31]:
def get_book_genre_and_names(soup) :
    '''
        Pair the page genre with every book name found on the page.

        Parameters : 
            soup : Parsed page

        Returns : 
            Two lists of the same length: the genre (from get_books_genre)
            repeated once per name, and the 'alt' text of every <img> that
            has one.
    '''
    page_genre = get_books_genre(soup)
    book_names = [image['alt'] for image in soup.find_all('img',alt=True)]
    genres = [page_genre] * len(book_names)
    return genres,book_names

In [10]:
def get_book_prices(soup) :
    '''
        Collect the prices shown on the page.

        Parameters : 
            soup : Parsed page exposing find_all()

        Returns : 
            A list of floats, one per element with class 'price_color'. The
            first character of each text (presumably the currency symbol) is
            dropped before the conversion.
    '''
    price_tags = soup.find_all(class_='price_color')
    return [float(tag.text[1:]) for tag in price_tags]

In [11]:
def get_book_ratings(soup) :
    '''
        Collect the star rating written in the markup of each <p> element.

        Parameters : 
            soup : Parsed page exposing find_all()

        Returns : 
            A list with one string per <p> whose markup contains
            'star-rating': the text between that marker and the first '>'
            of the markup, without double quotes and surrounding whitespace.
    '''
    ratings_found = []
    for paragraph in soup.find_all('p') :
        markup = str(paragraph)
        match = re.search('star-rating',markup)
        if match is None :
            continue
        # Slice from the end of the marker up to the first '>' in the markup
        start = match.end()
        stop = markup.index('>')
        ratings_found.append(markup[start:stop].replace('"','').strip())

    return ratings_found
            


In [12]:
def get_book_in_stock(soup) : 
    '''
        Collect the stock status text shown for each book.

        Parameters : 
            soup : Parsed page exposing find_all()

        Returns : 
            A list with the stripped text of every element found with
            class 'instock availability'.
    '''
    availability_tags = soup.find_all(class_='instock availability')
    return [tag.text.strip() for tag in availability_tags]

In [51]:
def _scrape_page(page_url:str) :
    '''
        Download one listing page and extract its book features.

        Returns the parsed page and a tuple of five lists:
        (genres, names, prices, ratings, stock status).
    '''
    response = requests.get(page_url)
    # Fail on an error status instead of parsing an error page as if it
    # were a listing with no books.
    response.raise_for_status()
    soup = bs(response.content, 'html.parser')

    genres,names = get_book_genre_and_names(soup)
    features = (genres, names, get_book_prices(soup),
                get_book_ratings(soup), get_book_in_stock(soup))
    return soup,features


def _page_count(soup)->int :
    '''
        Return the number of pages announced by the element with class
        'current', or 1 when the page has no such element.
    '''
    current = soup.find(class_='current')
    if current is None :
        return 1
    # Take the whole last word, so counts with two or more digits are read
    # in full rather than only their final digit.
    return int(current.text.strip().split()[-1])


def get_features_books(URL_TO_AQUISITION_DATA:str,list_hrefs:list) :
    '''
        Scrape every page of every category into a single dataframe.

        Parameters : 
            URL_TO_AQUISITION_DATA : A string with the base URL; each href is
                                     appended to it as is
            list_hrefs : A list with the relative link of each category's
                         first page

        Returns : 
            A pandas DataFrame with the columns 'genre', 'names', 'prices',
            'ratings' and 'stock_status'. It has no rows when list_hrefs is
            empty.
        Raises : 
            requests.HTTPError : when a page answers with an error status
    '''
    columns = ('genre','names','prices','ratings','stock_status')
    # One accumulator per column, in the same order as `columns`
    collected = [[] for _ in columns]

    for href in list_hrefs :
        soup,features = _scrape_page(URL_TO_AQUISITION_DATA+href)
        for bucket,values in zip(collected,features) :
            bucket.extend(values)

        # Following pages swap 'index.html' for 'page-N.html'
        for page in range(2,_page_count(soup)+1) :
            href_ = href.replace('index.html',f"page-{page}.html")
            _,features = _scrape_page(URL_TO_AQUISITION_DATA+href_)
            for bucket,values in zip(collected,features) :
                bucket.extend(values)

    # Transform in a dataframe: one Series per column, then name the columns
    data_ = pd.concat(
        [pd.Series(values) for values in collected],axis=1).rename(
        columns=dict(enumerate(columns)))

    return data_
    