## Lab | Single page scraping

#### 1. import libraries

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [3]:
#chart-position-1 > div.chart-content.col-xs-12.col-sm-8 > p > cite

#### 2. url: we start with the 'second' page. Show that you can start whenever you want


In [4]:
url = "https://www.popvortex.com/music/charts/top-100-songs.php"

#### 3. download html with a get request

In [5]:
# Download the chart page. A timeout (in seconds) is passed explicitly because
# requests does not time out by default and would otherwise wait indefinitely
# on an unresponsive server.
response = requests.get(url, timeout=10)
response.status_code # 200 status code means OK!

200

#### 4.1. parse html (create the 'soup')

In [6]:
soup = BeautifulSoup(response.content, "html.parser")

In [7]:
soup.title

<title>iTunes Top 100 Songs Chart 2022</title>

##### Buscamos el título

In [8]:
#chart-position-1 > div.chart-content.col-xs-12.col-sm-8 > p > cite

In [9]:
soup.select("#chart-position-1 > div.chart-content.col-xs-12.col-sm-8 > p > cite")

[<cite class="title">Unholy</cite>]

In [10]:
soup.select("cite.title")[:3]

[<cite class="title">Unholy</cite>,
 <cite class="title">Eagle (feat. KB)</cite>,
 <cite class="title">I'm Good (Blue)</cite>]

##### Buscamos el artista

In [11]:
#chart-position-1 > div.chart-content.col-xs-12.col-sm-8 > p > em

In [12]:
soup.select("#chart-position-1 > div.chart-content.col-xs-12.col-sm-8 > p > em")

[<em class="artist">Sam Smith &amp; Kim Petras</em>]

In [13]:
soup.select("em.artist")[:3]

[<em class="artist">Sam Smith &amp; Kim Petras</em>,
 <em class="artist">Transformation Worship</em>,
 <em class="artist">David Guetta &amp; Bebe Rexha</em>]

##### Buscamos el ranking

In [14]:
#chart-position-1 > div.cover-art.col-xs-12.col-sm-4 > p

In [15]:
soup.select("#chart-position-1 > div.cover-art.col-xs-12.col-sm-4 > p")

[<p class="chart-position">1</p>]

In [16]:
soup.select("p.chart-position")[:3]

[<p class="chart-position">1</p>,
 <p class="chart-position">2</p>,
 <p class="chart-position">3</p>]

##### Buscamos el género

In [17]:
#chart-position-1 > div.chart-content.col-xs-12.col-sm-8 > ul > li:nth-child(1) > a

In [18]:
soup.select("#chart-position-1 > div.chart-content.col-xs-12.col-sm-8 > ul > li:nth-child(1) > a")

[<a href="/music/charts/top-pop-songs.php">Pop</a>]

In [19]:
# NOTE: "a.Pop" is a tag+class selector: it matches <a> elements whose class is "Pop".
# The genre link returned by the previous cell is
#   <a href="/music/charts/top-pop-songs.php">Pop</a>
# where "Pop" is the link text and there is no class attribute, so this query
# matches nothing and returns an empty list.
# NOTE(review): an attribute selector on the href, e.g. a[href$="top-pop-songs.php"],
# looks like the intended query — confirm against the page markup.
soup.select("a.Pop")

[]

## making beautiful soups into beautiful tables

In [20]:
# Result sets for each field, in the order they appear on the page
rango = soup.select("p.chart-position")
canciones = soup.select("cite.title")
artistas = soup.select("em.artist")

# Number of entries to read: derived from how many titles were found on the
# page instead of hard-coding the size of the chart
num_iter = len(soup.select("cite.title"))

# Pull the text out of the first num_iter elements of every result set.
# Genre is not extracted in this cell.
rank = [rango[i].get_text() for i in range(num_iter)]
title = [canciones[i].get_text() for i in range(num_iter)]
artist = [artistas[i].get_text() for i in range(num_iter)]

#### Scraping/treating Genre and Release date
The website stores each song's details as list items under the chart entry. If the song is a new release,
the first item is "New Release"; otherwise the first item is the genre.
So genre and release date are normally at indices 0 and 1, and at indices 1 and 2 when it is a new release.

In [21]:
# For every chart entry, collect the <li> detail items found under its
# chart-content block. The number of entries follows the titles scraped in the
# previous cell (len(title)) rather than a hard-coded 100, so genre/release
# always have the same length as rank/title/artist when the DataFrame is built.
multi = [
    soup.select("#chart-position-{} > div.chart-content.col-xs-12.col-sm-8 > ul > li".format(position))
    for position in range(1, len(title) + 1)
]

# genre and release date sublists
genre = []
release = []

for element in multi:
    # Genre and release date normally sit at indices 0 and 1; a leading
    # 'New Release' item shifts both one position to the right.
    offset = 1 if element[0].get_text() == 'New Release' else 0
    # Each item's text carries a label prefix; keep only what follows it.
    genre.append(element[offset].get_text().split('Genre: ')[1])
    release.append(element[offset + 1].get_text().split('Release Date: ')[1])

In [22]:
# Assemble the scraped lists into a single table, one row per chart entry.
# Column order follows the order of the names below.
artist_song = pd.DataFrame(
    dict(
        zip(
            ["rank", "title", "artist", "genre", "release_date"],
            [rank, title, artist, genre, release],
        )
    )
)
# Save the table without the index column (the target file name has no extension).
artist_song.to_csv(path_or_buf="top100songs", index=False)

In [23]:
artist_song.head()

Unnamed: 0,rank,title,artist,genre,release_date
0,1,Unholy,Sam Smith & Kim Petras,Pop,"September 22, 2022"
1,2,Eagle (feat. KB),Transformation Worship,Hip-Hop / Rap,"October 7, 2022"
2,3,I'm Good (Blue),David Guetta & Bebe Rexha,Dance,"August 26, 2022"
3,4,Everywhere,Fleetwood Mac,Rock,"April 14, 1987"
4,5,Make It With You,Bread,Soft Rock,"April 1, 1970"


Function for recommendation

In [38]:
from random import randint


def recomendacion_musical(songs=None, input_song=None):
    """Recommend a random chart song when the given song is in the chart.

    The song name is matched as a case-sensitive substring against every
    title in the chart. If any title matches, a title picked uniformly at
    random from the whole chart is printed; otherwise an apology is printed.

    Parameters
    ----------
    songs : mapping or DataFrame, optional
        Object whose ``songs['title']`` yields the chart titles. Defaults to
        the notebook-level ``artist_song`` DataFrame.
    input_song : str, optional
        Song name to look up. When omitted, the user is prompted for it.

    Returns
    -------
    None
        The result is only printed.
    """
    if songs is None:
        songs = artist_song
    if input_song is None:
        input_song = input("Introduce the name of a song: ")

    # Work on a plain list so positions are 0-based regardless of the index
    # of the underlying Series.
    titles = list(songs['title'])

    # Check the whole chart before giving up: returning from inside the loop
    # would decide based on the first title only.
    if any(input_song in chart_title for chart_title in titles):
        # randint includes both endpoints, so the valid positions are
        # 0 .. len(titles) - 1.
        pick = titles[randint(0, len(titles) - 1)]
        return print("Our recomendation for you today is: ", pick)

    return print("Sorry, we do not have a recomendation for you today. Have a nice day!")

In [39]:
recomendacion_musical()

Sorry, we do not have a recomendation for you today. Have a nice day!


## Lab | Multiple page scraping

## Respectful scraping:

In [26]:
from time import sleep
from random import randint
import numpy as np

In [27]:
# for i in range(5):
#     print(i)
#     wait_time = randint(1,4000)
#     print("I will sleep for " + str(wait_time/1000) + " seconds.")
#     sleep(wait_time/1000)

## Assembling the script to send and store multiple requests