#### Lab - Web scraping

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from random import randint
import random
from time import sleep

In [2]:
# Address of the chart page that the following cells request and parse.
url = "https://www.popvortex.com/music/charts/top-100-songs.php"

In [3]:
# Download the chart page. The timeout (in seconds) keeps the cell from
# hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)
# Displayed as the cell output; 200 means the page was delivered.
response.status_code

200

In [4]:
# Parse the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(markup=response.content, features="html.parser")

Building the song and artist lists from the scraped data

In [5]:
# Title (<cite>) and performer (<em>) tags of every chart entry, in page order.
song_list = soup.select("div.chart-content.col-xs-12.col-sm-8 > p > cite")
art_list = soup.select("div.chart-content.col-xs-12.col-sm-8 > p > em")

# The number of titles found drives how many entries are collected.
num_iter = len(song_list)

# Pull the text out of each tag; rank is simply the 1-based page order.
song = [song_list[i].get_text() for i in range(num_iter)]
artist = [art_list[i].get_text() for i in range(num_iter)]
position = list(range(1, num_iter + 1))
    

#### Scraping/treating Genre and Release date
The website stores all details in the parent component, but if the song is a new release the first element is
"New Release"; otherwise it is the genre.
So genre and release date are normally at indices 0 and 1, and at indices 1 and 2 if it's a new release.

In [6]:
# Detail <li> items of every chart entry, one result set per scraped song.
# Iterating over `position` (instead of a fixed 100) keeps these lookups in
# step with the number of songs that were actually found on the page.
multi = [
    soup.select("#chart-position-" + str(rank) + " > div.chart-content.col-xs-12.col-sm-8 > ul > li")
    for rank in position
]

# genre and release date sublists
genre = []
release = []

for element in multi:
    details = [item.get_text() for item in element]

    # A leading "New Release" badge shifts genre / release date one slot to the right.
    if details and details[0] == 'New Release':
        details = details[1:]

    if len(details) >= 2:
        genre.append(details[0].split('Genre: ')[1])
        release.append(details[1].split('Release Date: ')[1])
    else:
        # Keep a placeholder so genre/release stay the same length as
        # song/artist; skipping the entry would shift every later row and
        # make the DataFrame construction fail on unequal lengths.
        genre.append(None)
        release.append(None)


#### Creating a new dataset with the previously generated lists

In [7]:
# Assemble the scraped lists into one frame; keyword order fixes the column order.
top100 = pd.DataFrame(
    dict(rank=position, song=song, artist=artist, genre=genre, release=release)
)

In [10]:
# Preview the first rows (head() shows 5 by default).
# NOTE(review): the saved output below has a 'year' column, which is only
# created by the following cell -- the cells were executed out of order.
top100.head()

Unnamed: 0,rank,song,artist,genre,year
0,1,Unholy,Sam Smith & Kim Petras,Pop,2022.0
1,2,Eagle (feat. KB),Transformation Worship,Hip-Hop / Rap,2022.0
2,3,I'm Good (Blue),David Guetta & Bebe Rexha,Dance,2022.0
3,4,Everywhere,Fleetwood Mac,Rock,1987.0
4,5,wait in the truck,HARDY & Lainey Wilson,Country,2022.0
...,...,...,...,...,...
95,96,Sand In My Boots,Morgan Wallen,Country,2021.0
96,97,No Se Va (En Vivo),Grupo Frontera,Regional Mexicano,2022.0
97,98,Perfectly Loved (feat. TobyMac),Rachael Lampa,Christian & Gospel,2022.0
98,99,How Far I'll Go,Auli'i Cravalho,Soundtrack,2016.0


Making a year column in the top 100 dataset, and dropping the release date column so it won't mess up the concat with the other dataset

In [9]:
# Replace the full release date with its year.
# The guard makes the cell safe to re-run: once 'release' has been dropped,
# a second execution would otherwise fail with a KeyError.
if 'release' in top100.columns:
    top100['year'] = pd.DatetimeIndex(top100['release']).year
    top100 = top100.drop(columns='release')

#### Scraping other sources and styles

In [12]:
def scrap_url(main, begin=1970, end=2021, timeout=30):
    """Scrape one chart page per year and stack the results.

    For every year in ``begin..end`` (inclusive) the page ``main + str(year)``
    is downloaded and the ``#myTable`` element is read for song titles
    (``.song a``) and artists (``.artist``).

    Parameters
    ----------
    main : str
        URL prefix; the year is appended to it.
    begin, end : int
        First and last year to scrape (both included).
    timeout : float, default 30
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        Columns ``rank``, ``song``, ``artist``, ``genre`` and ``year``. The
        per-year frames are concatenated in order and each keeps its own
        0-based index. An empty frame is returned when nothing was scraped.
    """
    yearly_frames = []
    genre = ''

    for year in range(begin, end + 1):
        # scraping individual link
        url = main + str(year)
        response = requests.get(url, timeout=timeout)
        print(year, 'request:', response.status_code)

        # parse the html and look for the chart table
        soup = BeautifulSoup(response.content, "html.parser")
        tables = soup.select('#myTable')

        if tables:
            # The genre label is read once from the breadcrumb and reused
            # for all following years.
            if genre == '':
                genre = soup.select('body > div.wrappercenter > div:nth-child(2) > nav > ol > li:nth-child(3) > a > span')[0].get_text()

            song_list = tables[0].select(".song a")
            art_list = tables[0].select(".artist")
            count = len(art_list)

            yearly_frames.append(pd.DataFrame({
                "rank": list(range(1, count + 1)),
                "song": [song_list[i].get_text().replace('\n', '') for i in range(count)],
                "artist": [art_list[i].get_text().replace('\n', '') for i in range(count)],
                "genre": genre,
                "year": year,
            }))
        else:
            # A failed request or an unexpected page should not throw away
            # the years that were already scraped.
            print(year, 'skipped: chart table not found')

        # respectful nap between requests
        sleep(random.randint(1, 3))

    if not yearly_frames:
        return pd.DataFrame()

    # One concat at the end instead of growing a frame inside the loop.
    return pd.concat(yearly_frames, axis=0)

#### Scraping multiple sources 

In [None]:
# # Run only to rebuild the historical top 100 dataset
# df = []

# df.append(scrap_url('https://playback.fm/charts/rnb/'))
# df.append(scrap_url('https://playback.fm/charts/country/'))
# df.append(scrap_url('https://playback.fm/charts/rock/'))
# df.append(scrap_url('https://playback.fm/charts/top-100-songs/'))

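# historic_df = pd.DataFrame()
# # NOTE: range(3) only concatenates df[0]..df[2]; the 'top-100-songs' frame (df[3]) is scraped but left out.
# for i in range(3):
#     historic_df = pd.concat([historic_df, df[i]], axis = 0)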

#### Scraping more sources

In [13]:
def scrap_popvortex(country_list, full=False, timeout=30):
    """Scrape several popvortex.com song charts and stack them.

    Parameters
    ----------
    country_list : iterable of str
        Path fragments appended to ``https://www.popvortex.com/music/``.
    full : bool, default False
        If False, ``/top-songs.php`` is appended to every fragment; if True,
        the fragment is used as the complete page path.
    timeout : float, default 30
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        Columns ``rank``, ``song``, ``artist``, ``genre`` and ``release``.
        Pages are concatenated in order and each keeps its own 0-based index.
        Entries without readable details get ``None`` for genre and release.
        An empty frame is returned for an empty ``country_list``.
    """
    page_frames = []

    for url in country_list:
        page_url = 'https://www.popvortex.com/music/' + str(url)
        if not full:
            page_url += '/top-songs.php'

        response = requests.get(page_url, timeout=timeout)
        print(url, 'status', response.status_code)

        soup = BeautifulSoup(response.content, "html.parser")

        song_list = soup.select("div.chart-content.col-xs-12.col-sm-8 > p > cite")
        art_list = soup.select("div.chart-content.col-xs-12.col-sm-8 > p > em")
        num_iter = len(song_list)

        position = list(range(1, num_iter + 1))
        song = [song_list[i].get_text() for i in range(num_iter)]
        artist = [art_list[i].get_text() for i in range(num_iter)]

        genre = []
        release = []
        # One detail lookup per scraped song (not a fixed 100), so the
        # genre/release lists always line up with song/artist.
        for rank in position:
            items = soup.select("#chart-position-" + str(rank) + " > div.chart-content.col-xs-12.col-sm-8 > ul > li")
            details = [item.get_text() for item in items]

            # A leading "New Release" badge shifts genre / release date by one.
            if details and details[0] == 'New Release':
                details = details[1:]

            if len(details) >= 2:
                genre.append(details[0].split('Genre: ')[1])
                release.append(details[1].split('Release Date: ')[1])
            else:
                # Placeholder instead of skipping: a skipped entry would
                # make the column lengths differ and break the DataFrame.
                genre.append(None)
                release.append(None)

        page_frames.append(pd.DataFrame({"rank": position,
                                         "song": song,
                                         "artist": artist,
                                         "genre": genre,
                                         "release": release
                                         }))

    if not page_frames:
        return pd.DataFrame()

    # One concat at the end instead of growing a frame inside the loop.
    return pd.concat(page_frames, axis=0)

In [14]:
countries = [
    'austria', 'belgium', 'canada', 'finland', 'germany', 'greece',
    'india', 'ireland', 'italy', 'mexico', 'netherlands', 'norway',
    'philippines', 'poland', 'south-africa', 'spain', 'sweden', 'switzerland',
]

# Charts of all countries above: tag every row with the year 2022 and
# drop the release date column.
tops = (
    scrap_popvortex(countries)
    .assign(year=2022)
    .drop(columns='release')
)

austria status 200
belgium status 200
canada status 200
finland status 200
germany status 200
greece status 200
india status 200
ireland status 200
italy status 200
mexico status 200
netherlands status 200
norway status 200
philippines status 200
poland status 200
south-africa status 200
spain status 200
sweden status 200
switzerland status 200


##### Saving our scraped data to a csv file for further work

In [None]:
# historic_df.to_csv('Top 100 songs 1970-2021.csv',index=False)

In [15]:
# Load the previously saved historical charts from the Data/ folder.
# NOTE(review): the commented-out export above writes the file to the working
# directory, not to Data/ -- confirm the file is moved there after a rebuild.
historic_df = pd.read_csv(filepath_or_buffer='Data/Top 100 songs 1970-2021.csv')

#### Joining Top 100 2022 songs, with 2021 top songs by type

Concatenating the lists into a bigger song dataset

In [16]:
# Stack the top-100 frame, the per-country frames and the 2021 rows of the
# historical data; ignore_index gives the result a fresh 0..n-1 index.
top_list = pd.concat(
    [top100, tops, historic_df.loc[historic_df['year'] == 2021]],
    axis=0,
    ignore_index=True,
)

In [20]:
# Keep the first occurrence of every song title and write the result to disk.
# NOTE(review): duplicates are detected by title only, so different songs that
# share a title are collapsed into one row -- confirm this is intended.
top_list.drop_duplicates(subset=['song']).to_csv('Top 100 songs worldwide.csv', index=False)