# TP: Scraping BeautifulSoup & Selenium

## Imports

In [1]:
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

## Constants

In [2]:
# Root URL of the Babelio website.
URL_BOOK_BASE = 'https://www.babelio.com'
# Path template of a popular-books listing page; `{page_number}` is a str.format placeholder.
# NOTE(review): presumably meant to be appended to URL_BOOK_BASE -- confirm when implementing the TODOs.
URL_BOOK_LISTE = '/livrespopulaires_debut.php?p={page_number}'
# Babelio popular-authors page (passed to extract_all further down).
URL_AUTHORS = 'https://www.babelio.com/auteurspopulaires.php'
# Endpoint queried by get_author_info_from_wikipedia.
URL_API_BASE = "https://en.wikipedia.org/w/api.php"

## Exercice n°1 : BeautifulSoup Introduction

In [3]:
def get_html_from_link(page_link, timeout=30):
    '''
        Download a web page and return its parsed HTML.

        :param page_link: URL of the web page to scrape
        :type page_link: string
        :param timeout: maximum number of seconds to wait for the server before
                        requests raises a timeout error (without it a stalled
                        server would block the notebook indefinitely)
        :type timeout: float
        :return: document parsed with the built-in 'html.parser' backend
        :rtype: bs4.BeautifulSoup
        :raises requests.exceptions.RequestException: on connection failure or timeout
    '''

    # The HTTP status is deliberately not checked: as before, error pages are
    # parsed and returned like any other document.
    page = requests.get(page_link, timeout=timeout)
    return BeautifulSoup(page.content, 'html.parser')

In [4]:
def extract_book_info(book_html):
    '''
        Extract the book details from one book element of a listing page.

        Exercise stub: the extraction itself is still to be written (see TODO).
        As written, none of the returned names is assigned inside the function,
        so calling it raises NameError unless they happen to exist as globals.

        :param book_html: BeautifulSoup element that contains the book info
        :type book_html: bs4.element.Tag
        :return:
            - book_links: link to the book page
            - book_title: title of the book
            - book_image_link: link to the image of the book
        :rtype: tuple(string, string, string)
    '''

    # TODO : Get book_links, book_title and book_image_link from book_html and return this tuple

    return book_links, book_title, book_image_link

In [5]:
def extract_author_info(author_html):
    '''
        Extract the author details from one author element of a listing page.

        Exercise stub: the extraction itself is still to be written (see TODO).
        As written, none of the returned names is assigned inside the function,
        so calling it raises NameError unless they happen to exist as globals.

        :param author_html: BeautifulSoup element that contains the author info
        :type author_html: bs4.element.Tag
        :return:
            - author_links: link to the author page
            - author_name: name of the author
        :rtype: tuple(string, string)
    '''

    # TODO : Get author_links, author_name from author_html and return this tuple
    
    return author_links, author_name

In [6]:
def extract_rate_from_book_page(book_link):
    '''
        Extract the rate of a book from its details page.

        Exercise stub: the scraping is still to be written (see TODO). As
        written, `rate` is never assigned inside the function, so calling it
        raises NameError unless a global of that name exists.

        :param book_link: link of the book page to read the rate from
        :type book_link: string
        :return: rate of the book; per the TODO, NaN is expected when the page
                 shows no rate
        :rtype: float

    '''

    # TODO : get html of book page, get rate from parsed html and return rate as float or nan value if there is no rate.

    return rate

In [7]:
def get_info_from_page(page_link):
    '''
        Get the book and author info from a Babelio page that lists books.

        Exercise stub: the scraping is still to be written (see TODO). As
        written, `info_list` is never assigned inside the function, so calling
        it raises NameError unless a global of that name exists.

        :param page_link: link of the web page to scrape
        :type page_link: string
        :return: info_list: one entry per book holding the book info (book_links,
                 book_title, book_image) and the author info (author_links, author_name)
        :rtype: list
    '''

    # TODO : get html from page_link, extract books from html (1), iterate over books and for each book, extract book
    #  info html and author info html, use functions to extract book info and author info and store all these
    #  information in a list (2) and return it
    #  Hints :
    #   (1) Analyze the html code and use BeautifulSoup functions to find elements (take a look at html tag
    #       AND class attributes)
    #   (2) Append a tuple of all information for each book

    return info_list

In [8]:
def collect_all_information_and_save(file_name):
    '''
        "Main function" that collects all information by scraping Babelio and using the Wikipedia API:
            - get info from the list pages (pages 1, 2, 3 and 4)
            - get the rate of each book
            - get author information from the Wikipedia API (bootcamp_cri.api.api_wikipedia)
        then stores all this information in a pandas dataframe with the following columns:
            - links, title, image_link, author_link, author, rate
        and saves it in a CSV file.

        Exercise stub: the body currently holds only this docstring, so calling
        the function does nothing and returns None.

        :param file_name: name of the CSV file
        :type file_name: string

    '''

    # TODO : Iterate over pages, get information for each page and store it in a list and then create a dataframe with
    #  these information then extract rate and put it in a column and finally save the dataframe in a CSV file
    

## Exercice n°2 : Scraping using BeautifulSoup and aggregating with Wikipedia API data

In [9]:
def get_author_info_from_wikipedia(author_name, timeout=10):
    '''
        Fetch the introductory plain-text extract of an author's Wikipedia page.

        The lookup is best effort: any network, decoding or "page not found"
        problem yields None instead of an exception.

        :param author_name: page title to look up (the author's name)
        :type author_name: string
        :param timeout: maximum number of seconds to wait for the API to answer
        :type timeout: float
        :return: 'extract' field of the first page returned by the API, or None
                 when the request fails or the response carries no extract
        :rtype: string or None
    '''
    params = {
        'action': "query",
        'titles': author_name,
        'format': "json",
        'prop': 'extracts|categories',
        'explaintext': True,
        'exintro': True
    }

    try:
        response = requests.get(url=URL_API_BASE, params=params, timeout=timeout)
        pages = response.json()['query']['pages']
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Network failure / timeout, non-JSON body, or a payload without
        # query.pages. Unlike a bare `except:`, KeyboardInterrupt and
        # SystemExit still propagate.
        return None

    if not isinstance(pages, dict) or not pages:
        return None

    # `pages` is keyed by page id; only the first entry is used.
    first_page = next(iter(pages.values()))
    if not isinstance(first_page, dict):
        return None

    # A page entry without an 'extract' field yields None.
    return first_page.get('extract')

In [10]:
def extract_most_read_authors(soup):
    '''
        Collect the most read authors found in a parsed page.

        Exercise stub: the extraction is still to be written (see TODO). As
        written, `authors` is never assigned inside the function, so calling it
        raises NameError unless a global of that name exists.

        :param soup: parsed HTML to search
                     (presumably the BeautifulSoup object of the authors page -- confirm)
        :return: authors: list with one entry per author
        :rtype: list
    '''
    # TODO: Get most read authors by finding html tag & class, add each author in a list and return this list
    
    return authors

In [11]:
def extract_number_of_readers(soup):
    '''
        Collect the number of readers of each author found in a parsed page.

        Exercise stub: the extraction is still to be written (see TODO). As
        written, `nb_readers` is never assigned inside the function, so calling
        it raises NameError unless a global of that name exists.

        :param soup: parsed HTML to search
                     (presumably the BeautifulSoup object of the authors page -- confirm)
        :return: nb_readers: list with one entry per author
        :rtype: list
    '''
    # TODO: Get number of readers by html tag & class, add each number of readers in a list and return this list
    
    return nb_readers

In [12]:
def extract_all(url):
    '''
        Scrape the authors page at `url` and return the result as a dataframe.

        Exercise stub: the body is still to be written (see TODO). As written,
        `df` is never assigned inside the function, so calling it raises
        NameError unless a global of that name exists.

        :param url: link of the page to scrape
        :type url: string
        :return: df: dataframe built from the most read authors and their number of readers
                 (the sample output in this notebook also shows an authors_description
                 column -- presumably filled from the Wikipedia API, confirm)
        :rtype: pandas.DataFrame
    '''
    # TODO: get soup by extracting html from link, extracting most read authors and extracting number of readers
    # then build a dataframe and return it
    
    return df

In [13]:
# Run the author scraping on the Babelio popular-authors page and display the resulting dataframe.
extract_all(URL_AUTHORS)

Unnamed: 0,authors,nb_readers,authors_description
0,Kim Liggett,176,
1,Franck Bouysse,109,
2,Tiffany McDaniel,107,
3,Emmanuel Carrère,107,Emmanuel Carrère (born 9 December 1957) is a F...
4,Stephen King,106,"Stephen Edwin King (born September 21, 1947) i..."
5,Franck Thilliez,99,Franck Thilliez (born 15 October 1973 in Annec...
6,Ken Follett,97,"Kenneth Martin Follett, (born 5 June 1949) is..."
7,Dan Simmons,93,"Dan Simmons (born April 4, 1948) is an America..."
8,Harlan Coben,89,"Harlan Coben (born January 4, 1962) is an Amer..."
9,Amélie Nothomb,85,"Baroness Fabienne-Claire Nothomb, better known..."


## Selenium

In [14]:
# "Videos" tab of the YouTube channel scraped in the Selenium part of the notebook.
URL_YOUTUBE = 'https://www.youtube.com/c/BonEntendeurMusicFr/videos'

### Test with BeautifulSoup

In [15]:
# Fetch the channel page with the plain requests + BeautifulSoup helper.
youtube_soup = get_html_from_link(URL_YOUTUBE)

In [16]:
# Display the parsed document returned by the plain HTTP request.
youtube_soup

<!DOCTYPE doctype html>
<html dir="ltr" gl="FR" lang="fr-FR" style="font-size: 10px;font-family: Roboto, Arial, sans-serif;"><head><meta content="IE=edge" http-equiv="X-UA-Compatible"/><meta content="AhbmRDASY7NuOZD9cFMgQihZ+mQpCwa8WTGdTx82vSar9ddBQbziBfZXZg+ScofvEZDdHQNCEwz4yM7HjBS9RgkAAABneyJvcmlnaW4iOiJodHRwczovL3lvdXR1YmUuY29tOjQ0MyIsImZlYXR1cmUiOiJXZWJDb21wb25lbnRzVjAiLCJleHBpcnkiOjE2MDM0ODY4NTYsImlzU3ViZG9tYWluIjp0cnVlfQ==" data-expires="2020-10-23" data-feature="Web Components V0" http-equiv="origin-trial"/><meta content="Av2+1qfUp3MwEfAFcCccykS1qFmvLiCrMZ//pHQKnRZWG9dldVo8HYuJmGj2wZ7nDg+xE4RQMQ+Ku1zKM3PvYAIAAABmeyJvcmlnaW4iOiJodHRwczovL2dvb2dsZS5jb206NDQzIiwiZmVhdHVyZSI6IldlYkNvbXBvbmVudHNWMCIsImV4cGlyeSI6MTYwMzgzNjc3MiwiaXNTdWJkb21haW4iOnRydWV9" data-expires="2020-10-27" data-feature="Web Components V0" http-equiv="origin-trial"/><meta content="AixUK+8UEShlt6+JX1wy9eg+XL+eV5PYSEDPH3C90JNVbIkE1Rg1FyVUfu2bZ/y6Pm1xbPLzuwHYHjv4uKPNnA4AAABqeyJvcmlnaW4iOiJodHRwczovL2dvb2dsZXByb2QuY2

In [17]:
# Look for the channel video player container in the downloaded HTML.
# NOTE(review): the cell shows no output, so find() presumably returned None here -- confirm.
youtube_soup.find('div', {'class': 'style-scope ytd-channel-video-player-renderer'})

### Test with Selenium

To use Selenium, you need to install a driver for your browser. Here I recommend using Chrome and downloading this driver: https://chromedriver.chromium.org/downloads

**/!\** Make sure to choose the driver version that matches your Chrome version.

#### Imports

In [18]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

In [19]:
def get_content_with_selenium(url, chrome_drive_path='/Users/maxime/Downloads/chromedriver',
                              timeout=20, wait_locator=None):
    '''
        Open `url` in an incognito Chrome window and return the live browser.

        :param url: address of the page to load
        :type url: string
        :param chrome_drive_path: path to the chromedriver executable. The default
                                  is a machine-specific absolute path kept only for
                                  backward compatibility; pass your own location.
        :type chrome_drive_path: string
        :param timeout: maximum number of seconds to wait for the page to be ready
        :type timeout: float
        :param wait_locator: optional (By.<strategy>, value) tuple; when given, the
                             function waits until that element is present in the DOM,
                             otherwise it waits for document.readyState == "complete"
        :type wait_locator: tuple or None
        :return: the browser positioned on `url`, or None when the wait timed out
                 (the browser is closed in that case)
        :rtype: selenium.webdriver.Chrome or None
    '''
    options = webdriver.ChromeOptions()
    # The flag must be spelled with two ASCII hyphens; the previous value was a
    # garbled em dash that is not the Chrome switch.
    options.add_argument("--incognito")

    # `options=` replaces the deprecated `chrome_options=` keyword.
    browser = webdriver.Chrome(executable_path=chrome_drive_path, options=options)

    if wait_locator is not None:
        page_ready = EC.presence_of_element_located(wait_locator)
    else:
        def page_ready(driver):
            return driver.execute_script("return document.readyState") == "complete"

    try:
        browser.get(url)
        # Constructing WebDriverWait alone does not wait: .until() performs the
        # polling and raises TimeoutException when the condition is never met.
        WebDriverWait(browser, timeout).until(page_ready)
    except TimeoutException:
        print("Timed out waiting for page to load")
        browser.quit()
        return None
    except Exception:
        # Do not leave an orphan Chrome process behind on unexpected failures.
        browser.quit()
        raise

    return browser

In [20]:
# Open the channel page in Chrome; the returned session is passed to the extraction cells below.
browser = get_content_with_selenium(URL_YOUTUBE)

In [21]:
def get_titles_of_videos(browser):
    '''
        Collect the titles of the videos shown in the open browser page.

        Exercise stub: the extraction is still to be written (see TODO). As
        written, `titles` is never assigned inside the function, so calling it
        raises NameError unless a global of that name exists.

        :param browser: browser session to read from
                        (presumably the object returned by get_content_with_selenium -- confirm)
        :return: titles: list with one title per video
        :rtype: list
    '''
    # TODO: extract titles of video and add each title in a list then return this list
    
    return titles

In [22]:
# Display the video titles read from the open browser session.
get_titles_of_videos(browser)

['Bon Entendeur Radio invite : Bombyce (Exclusive Mix #20)',
 'Bon Entendeur Radio invite : Delfonic (Exclusive Mix #19)',
 'Bon Entendeur Radio invite Jack Tenis (Exclusive Mix #18)',
 'Bon Entendeur : "l\'Original", Baer, Summer 2020',
 'Bon Entendeur Radio invite : Bosq (Exclusive Mix #17)',
 'Bon Entendeur : "la Gentillesse", Kateb, July 2020',
 'Bon Entendeur - Winter US / CA Tour 2020 🇺🇸 🇨🇦',
 'Arte Concert x Bon Entendeur au Sacré (Paris)',
 'Bon Entendeur Radio invite : Pete Herbert (Exclusive Mix #16)',
 'Bon Entendeur Radio invite : Air Zaïre (Exclusive Mix #15)',
 'Bon Entendeur Radio invite : Ben Spalding (Exclusive Mix #14)',
 'Bon Entendeur Radio invite : Ilya Santana (Exclusive Mix #13)',
 'Bon Entendeur Radio invite : Dicky Trisco (Exclusive Mix #12)',
 'Bon Entendeur Radio invite : Afterclapp (Exclusive Mix #11)',
 'Bon Entendeur Radio invite : Soultronic (Exclusive Mix #10)',
 'Bon Entendeur : "le Voyage", Horn, April 2020',
 'Bon Entendeur : "les Valeur

In [23]:
def get_links_of_video(browser):
    '''
        Collect the links of the videos shown in the open browser page.

        Exercise stub: the extraction is still to be written (see TODO). As
        written, `links` is never assigned inside the function, so calling it
        raises NameError unless a global of that name exists.

        :param browser: browser session to read from
                        (presumably the object returned by get_content_with_selenium -- confirm)
        :return: links: list with one link per video
        :rtype: list
    '''
    # TODO: extract links of each video and add it in a list then return this list
    
    return links

In [24]:
# Display the video links read from the open browser session.
get_links_of_video(browser)

['https://www.youtube.com/watch?v=AzWHEJp8010',
 'https://www.youtube.com/watch?v=YzhKZtqKXwU',
 'https://www.youtube.com/watch?v=0pVScBqBTu8',
 'https://www.youtube.com/watch?v=BzcK82BHMaw',
 'https://www.youtube.com/watch?v=0XeidOeSzE4',
 'https://www.youtube.com/watch?v=evBjde5anQY',
 'https://www.youtube.com/watch?v=keTVP-5B1J8',
 'https://www.youtube.com/watch?v=XpguL_LZZys',
 'https://www.youtube.com/watch?v=ulscr6WYZfc',
 'https://www.youtube.com/watch?v=eMl2Z2yXFM0',
 'https://www.youtube.com/watch?v=eDJU4hZa13I',
 'https://www.youtube.com/watch?v=yiGOx7IlwV4',
 'https://www.youtube.com/watch?v=G5OruvqxwzY',
 'https://www.youtube.com/watch?v=vUx7VXYTr1A',
 'https://www.youtube.com/watch?v=0skAKtpheHM',
 'https://www.youtube.com/watch?v=5fURuyp05TM',
 'https://www.youtube.com/watch?v=R0XTjFnxD64',
 'https://www.youtube.com/watch?v=iY4Z_AiGoxU',
 'https://www.youtube.com/watch?v=xMut2BViyqM',
 'https://www.youtube.com/watch?v=pnm41xGOYpE',
 'https://www.youtube.com/watch?v=Z2YLig