#### Lab - Web scraping

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from random import randint
import random
from time import sleep

In [3]:
# Address of the PopVortex top-100 songs chart page that the next cells download and parse
url = "https://www.popvortex.com/music/charts/top-100-songs.php"

In [4]:
# Download the chart page; the trailing expression displays the HTTP status code as the cell output
response = requests.get(url)
response.status_code

200

In [5]:
# Parse the downloaded page body with the "html.parser" backend so it can be queried with CSS selectors
soup = BeautifulSoup(response.content, "html.parser")

Building the song and artist lists from the scraped data

In [6]:
# Output columns, filled in parallel: chart rank, song title, artist name
position = []
song = []
artist = []

# Title nodes (<cite>) and artist nodes (<em>) of every chart entry, in page order
song_list = soup.select("div.chart-content.col-xs-12.col-sm-8 > p > cite")
art_list = soup.select("div.chart-content.col-xs-12.col-sm-8 > p > em")

# Number of chart entries found on the page (one per title node)
num_iter = len(song_list)

# Walk the entries in page order; the rank is the 1-based position on the page
for idx, title_node in enumerate(song_list):
    song.append(title_node.get_text())
    artist.append(art_list[idx].get_text())
    position.append(idx + 1)
    

#### Scraping/treating Genre and Release date
The website stores all details in parent components, but if the song is a new release the first element is 
"New Release"; otherwise it is the genre.
So genre and release date are at indices 0 and 1 normally, and at indices 1 and 2 if it's a new release.

In [7]:
# Detail <li> items for each of the 100 chart positions, in rank order
multi = [
    soup.select(f"#chart-position-{rank} > div.chart-content.col-xs-12.col-sm-8 > ul > li")
    for rank in range(1, 101)
]

# Genre and release date of every chart entry, parallel to `multi`
genre = []
release = []

for details in multi:
    # A leading 'New Release' item shifts the genre and release date one slot to the right
    offset = 1 if details[0].get_text() == 'New Release' else 0
    # Keep only the value that follows the label text
    genre.append(details[offset].get_text().split('Genre: ')[1])
    release.append(details[offset + 1].get_text().split('Release Date: ')[1])


#### Creating a new dataset with the previously generated lists

In [8]:
# Column name -> scraped list; all lists are expected to hold one value per chart entry
top100_columns = {
    "rank": position,
    "song": song,
    "artist": artist,
    "genre": genre,
    "release": release,
}
top100 = pd.DataFrame(top100_columns)

In [9]:
# Preview the first five rows of the assembled chart table
top100.head(5)

Unnamed: 0,rank,song,artist,genre,release
0,1,Unholy,Sam Smith & Kim Petras,Pop,"September 22, 2022"
1,2,Eagle (feat. KB),Transformation Worship,Hip-Hop / Rap,"October 7, 2022"
2,3,Everywhere,Fleetwood Mac,Rock,"April 14, 1987"
3,4,I'm Good (Blue),David Guetta & Bebe Rexha,Dance,"August 26, 2022"
4,5,wait in the truck,HARDY & Lainey Wilson,Country,"August 26, 2022"


#### Scraping other sources and styles

In [132]:
def _extract_chart_rows(soup):
    """Return (songs, artists) text lists from a parsed chart page, or None when it has no '#myTable' element."""
    tables = soup.select('#myTable')
    if not tables:
        return None
    chart = tables[0]
    # Newlines are stripped from the node text, as the chart cells contain line breaks
    songs = [node.get_text().replace('\n','') for node in chart.select(".song a")]
    artists = [node.get_text().replace('\n','') for node in chart.select(".artist")]
    return songs, artists


def scrap_url(main, begin=1970, end= 2021):
    """Scrape one yearly chart page per year and return all rows as a single DataFrame.

    Parameters
    ----------
    main : str
        Base URL of the chart; the year is appended to it to build each page URL.
    begin : int, default 1970
        First year to scrape (inclusive).
    end : int, default 2021
        Last year to scrape (inclusive).

    Returns
    -------
    pandas.DataFrame
        Columns ``rank``, ``song``, ``artist``, ``genre`` and ``year``. Each year's
        rows keep their own 0-based index (the index is not renumbered). An empty
        DataFrame is returned when no year produced any chart.

    Notes
    -----
    * The genre label is read once, from the breadcrumb of the first page that
      contains a chart table, and reused for every year.
    * A year whose request does not return status 200, or whose page has no
      ``#myTable`` element, is reported and skipped instead of aborting the
      whole scrape.
    * Songs and artists are paired by position; pairing stops at the shorter list.
    """
    # Per-year frames are collected here and concatenated once at the end
    # (concatenating inside the loop copies the accumulated data on every pass).
    yearly_frames = []

    genre = ''
    for year in range(begin, end+1):
        # scraping individual link
        url = main + str(year)
        # timeout so a stalled connection cannot hang the scrape indefinitely
        response = requests.get(url, timeout=30)
        print(year, 'request:',response.status_code)

        rows = None
        if response.status_code == 200:
            # parse & store html
            soup = BeautifulSoup(response.content, "html.parser")
            rows = _extract_chart_rows(soup)

        if rows is None:
            print(year, 'skipped: chart table not available')
        else:
            if genre == '':
                genre = soup.select('body > div.wrappercenter > div:nth-child(2) > nav > ol > li:nth-child(3) > a > span')[0].get_text()

            songs, artists = rows
            pairs = list(zip(songs, artists))

            # genre and year are scalars, broadcast to every row of this year's chart
            yearly_frames.append(pd.DataFrame({"rank": list(range(1, len(pairs) + 1)),
                           "song": [title for title, _ in pairs],
                           "artist": [name for _, name in pairs],
                           "genre": genre,
                           "year": year,
                          }))

        # respectful nap between requests:
        wait_time = random.randint(1,3)
        sleep(wait_time)

    if not yearly_frames:
        # pd.concat rejects an empty list; keep returning an empty frame as before
        return pd.DataFrame()
    return pd.concat(yearly_frames, axis = 0)

#### Scraping multiple sources 

In [None]:
# Run only to rebuild the historical top 100 dataset
# One scraped frame per playback.fm chart; scrap_url appends each year to the base URL
df = [
    scrap_url('https://playback.fm/charts/rnb/'),
    scrap_url('https://playback.fm/charts/country/'),
    scrap_url('https://playback.fm/charts/rock/'),
    scrap_url('https://playback.fm/charts/top-100-songs/'),
]

# Stack every scraped chart into one frame. Concatenating the whole list, rather than
# a fixed number of its items, keeps the last chart from being silently left out.
historic_df = pd.concat(df, axis = 0)

##### Saving our scraped data to a csv file for further work

In [138]:
# Write the scraped history to a CSV file, leaving out the DataFrame index
historic_df.to_csv('Top 100 songs 1970-2021.csv',index=False)

In [2]:
# Load the history from the saved CSV file (same file name the save cell writes to)
historic_df = pd.read_csv('Top 100 songs 1970-2021.csv')

#### Joining Top 100 2022 songs, with 2021 top songs by type

Adding a year column to the top 100 dataset and dropping the release date column so it won't interfere with concatenating the other dataset

In [10]:
# Year component of each parsed release date, in row order
release_years = pd.DatetimeIndex(top100['release']).year
# Add the year column and remove the full release date column
top100 = top100.assign(year=release_years).drop(columns='release')

Concatenating the lists into a bigger song dataset

In [11]:
# Rows of the historical dataset that belong to 2021
is_2021 = historic_df['year'] == 2021
# Stack them under the current chart and renumber the rows from 0
top_list = pd.concat([top100, historic_df[is_2021]], axis = 0, ignore_index = True)

In [14]:
# User inserts search term
print('Insert song or search term:')
search=input()

# True when the search string occurs in any searchable column. The match is
# case-insensitive and literal (not a regex); na=False counts missing values as
# non-matches so they cannot break the check.
exist = any(
    top_list[column].str.contains(search, case = False, regex = False, na = False).any()
    for column in ['song', 'artist', 'genre']
)

# if the song or search term exists in the dataset we suggest a random song, otherwise try tomorrow
if exist:
    # randrange excludes its upper bound, so the result is always a valid row position
    # (randint includes both ends and could point one past the last row)
    index = random.randrange(len(top_list))
    print('\nI have a sugestion! \n\nSong: ',top_list['song'].values[index],  '\nArtist: ', top_list['artist'].values[index])
else:
    print("\nI don't have a sugestion, try tomorrow :)")

Insert song or search term:
eagle

I have a sugestion! 

Song:  SugarCrash! 
Artist:  ElyOtto
