## Imports

In [None]:
## Data Management
import pandas as pd
import time
## Web Scraping
import requests
from bs4 import BeautifulSoup
## API Access
import musicbrainzngs
## Detecting language
from langdetect import detect
## Avoiding rate limit
from tqdm import tqdm
## Tokenization and model
from transformers import AutoTokenizer, pipeline

# Warning cleaning
import warnings
warnings.filterwarnings('ignore')
# All columns and rows setting
pd.options.display.max_columns = None
pd.options.display.max_rows = None

  from .autonotebook import tqdm as notebook_tqdm


##
<p style="background-color:#178268; font-family: arial black; color:#FFF9ED; font-size: 300%; text-align: center;">BillBoard DataFrame</p>

### Extracting Top 25 Most Popular Songs at Billboard per Year by Scrapping Wikipedia

In [2]:
def top_songs_billbord(start_year, ending_year, num_songs):
    # Define variables for the loop
    year_list = list(range(start_year, ending_year + 1))
    data = []

    for year in year_list:
        # Paramteres for Web Scraping
        url = f"https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{year}"
        resp = requests.get(url)
        soup = BeautifulSoup(resp.text, "html.parser")
        table = soup.find("table", class_="wikitable")
        # Sometimes the artist is in two consecutive rows, so we have to check rowspan merges
        last_artist = None

        # Extracting the data from the table
        for row in table.find_all("tr")[1:num_songs+1]:
            cols = row.find_all(["th", "td"])
            if not cols:
                continue
            
            # Extracting position, title, and artist. And adding the year to the data
            pos = int(cols[0].get_text(strip=True).replace(".", ""))
            title = cols[1].get_text(strip=True).strip('"')
            
            # Handle missing artist by tracking the last one seen
            if len(cols) > 2:
                artist = cols[2].get_text(strip=True)
                last_artist = artist
            else:
                artist = last_artist
            data.append({"year": year, "pos": pos, "title": title, "artist": artist})
    
    # data to df and sorting
    df = pd.DataFrame(data)
    df = df.sort_values(by=["year", "pos"]).reset_index(drop=True)
    # Tidy artist names
    df['artist'] = df['artist'].str.split(r'featuring|and').str[0].str.strip()
    
    return df

In [3]:
## Tunning all the parameters ##
starting_year = 2005
ending_year = 2024
num_songs_per_year = 25

In [4]:
df = top_songs_billbord(starting_year, ending_year, num_songs_per_year)
df.head(10)

Unnamed: 0,year,pos,title,artist
0,2005,1,We Belong Together,Mariah Carey
1,2005,2,Hollaback Girl,Gwen Stefani
2,2005,3,Let Me Love You,Mario
3,2005,4,Since U Been Gone,Kelly Clarkson
4,2005,5,"1, 2 Step",Ciara
5,2005,6,Gold Digger,Kanye West
6,2005,7,Boulevard of Broken Dreams,Green Day
7,2005,8,Candy Shop,50 Cent
8,2005,9,Don't Cha,The Pussycat Dolls
9,2005,10,Behind These Hazel Eyes,Kelly Clarkson


### Extracting Songs Lyrics by Scraping Genius.com

In [5]:
## Parameters for accesing to Genius API ##
CLIENT_ACCESS_TOKEN = 'DiGSDKH0utn3vuH9FPl99_XxPsUYu2Y0boPOz25AEDHxWqTwNfQ4Ou0pfcgmbvCY' 
BASE_URL = 'https://api.genius.com/search'
headers = {'Authorization': f'Bearer {CLIENT_ACCESS_TOKEN}'}

In [6]:
# Function to get the song URL from Genius API
def get_song_url(title, artist):
    # Parameters for the Genius API search
    params = {'q': f'{title} {artist}'}
    response = requests.get(BASE_URL, headers=headers, params=params)
    response.raise_for_status()
    # JSON with the search results & extracting the first song URL
    hits = response.json()['response']['hits']
    if hits:
        return hits[0]['result']['url']
    return None

In [None]:
# Function to scrape lyrics from a Genius song URL and tidy them
def scrape_lyrics(url):
    # Parameters for the Genius API search
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    containers = soup.select('div[data-lyrics-container="true"]')
    # Aux variable for the lyrics
    lyrics = ''

    # Loop through the containers to extract lyrics and parse the next line (br)
    for c in containers:
        for element in c.children:
            if isinstance(element, str):
                lyrics += element.strip() + '\n'
            elif element.name == 'br':
                lyrics += '\n'
            else:
                lyrics += element.get_text(strip=True) + '\n'
    
    # Before the lyrics, there are some metadata that we want to remove
    # And usually the 1sr verse of lyrics start with '[Intro', '[Verse 1' or '[Chorus'
    labels = ['[Verse 1', '[Intro', '[Chorus']
    indixes = [lyrics.find(l) for l in labels if lyrics.find(l) != -1]
    if indixes:
        idx = min(indixes)
        lyrics = lyrics[idx:]
    
    
    return lyrics.strip()

In [8]:
# Add a 'lyrics' column to the dataframe by fetching lyrics for each song
def fetch_lyrics_for_df(df):
    lyrics_list = []
    for _, row in df.iterrows():
        url = get_song_url(row['title'], row['artist'])
        if url:
            try:
                lyrics = scrape_lyrics(url)
            except Exception:
                lyrics = None
        else:
            lyrics = None
        lyrics_list.append(lyrics)
        time.sleep(0.25)  # To avoid rate limiting
    
    # To df
    df['lyrics'] = lyrics_list
    return df

df = fetch_lyrics_for_df(df)

- Now we calculate the number of songs that we weren't able to get succesfully the lyrics. We are going to use the same labels as we previously used to remove the metadata.

In [9]:
labels = ['[Verse 1', '[Intro', '[Chorus']
not_starting_with_label = df['lyrics'].dropna().apply(lambda x: not any(x.startswith(label) for label in labels))
not_starting_with_label.sum()

np.int64(16)

- As we can appriciate only 16 out of 500 (25 songs * 20 years), around a 3.2% of the songs dont have the lyrics. So we are going to drop them.
- Also tidy the lyrics for the further sentimental analysis

In [10]:
# Drop rows where lyrics do not start with any of the specified labels
df = df.loc[~not_starting_with_label].reset_index(drop=True)
# Tidy the lyrics
df['lyrics'] = df['lyrics'].str.replace('\n', ' ', regex=False)
df['lyrics'] = df['lyrics'].str.replace(r'\[.*?\]', ' ', regex=True)

In [11]:
df.head(10)

Unnamed: 0,year,pos,title,artist,lyrics
0,2005,1,We Belong Together,Mariah Carey,"Sweet love, yeah I didn't mean it when..."
1,2005,2,Hollaback Girl,Gwen Stefani,"Uh-huh, this my shit All the girls, stomp ..."
2,2005,3,Let Me Love You,Mario,"Mmm, ah Mmm, yeah Mmm Yeah, yeah, yeah ..."
3,2005,4,Since U Been Gone,Kelly Clarkson,"Here's the thing, we started out friendsI..."
4,2005,5,"1, 2 Step",Ciara,"Ladies and gentlemen (Ladies and gentlemen,..."
5,2005,6,Gold Digger,Kanye West,"She take my money when I'm in needYeah, she's..."
6,2005,7,Boulevard of Broken Dreams,Green Day,I walk a lonely roadThe only one that I h...
7,2005,8,Candy Shop,50 Cent,"Yeah, uh-huhSo seductive I'll take y..."
8,2005,9,Don't Cha,The Pussycat Dolls,"Okay ( Ahh ) Yeah ( Ahh ) Oh, we about to..."
9,2005,10,Behind These Hazel Eyes,Kelly Clarkson,"Oh-oh, oh, oh, oh Oh-oh, oh, oh Seems..."


### Adding the genre of the artist with musicbrainzngs

In [12]:
# Initialize API
musicbrainzngs.set_useragent("GenreScript", "1.0")

# Function to get the genre of an artist using MusicBrainz API
def get_genre(artist):
    # Search for the artist in MusicBrainz
    result = musicbrainzngs.search_artists(artist=artist, limit=1)
    # If artist found, get the genre from the first result
    if result['artist-list']:
        artist_id = result['artist-list'][0]['id']
        detalles = musicbrainzngs.get_artist_by_id(artist_id, includes=['tags'])
        tags = detalles['artist'].get('tag-list', [])
        if tags:
            # Return the first genre tag
            return tags[0]['name']
    return "unknown"

In [13]:
# Add a 'genre' column to the df
def find_genres(df):
    genres = []
    for artist in tqdm(df['artist'], desc="Fetching genres"): # To avoid rate limiting
        genre = get_genre(artist)
        genres.append(genre)
        #time.sleep(1)  
    df['genre'] = genres
    return df

df = find_genres(df)

Fetching genres: 100%|██████████| 484/484 [16:07<00:00,  2.00s/it]


In [14]:
df.genre.nunique()

81

- Now we are going to map some genres, since we don't want to have that much number of genres, so we are going to condense mannualy them into bigger groups.

In [15]:
genre_map = {
    "2010s": "2010s","2000s": "2000s","contemporary r&b": "r&b","hip hop": "hip-hop","alternative rock": "rock","alternative pop": "pop","alternative r&b": "r&b",
    "2020s": "2020s","barbadian": "pop","dance-pop": "pop","1960s": "pop","2008 universal fire victim": "pop","english": "rap","country": "country","unknown": "unknown",
    "blues": "blues","acoustic rock": "rock","calypso": "pop","dance": "electronic","dirty south": "hip-hop","art rock": "rock","pop": "pop","music for soccer moms": "pop",
    "alternative hip hop": "hip-hop","folk pop": "pop","contemporary country": "country","alliteration": "blues","emo rap": "rap","east coast hip hop": "hip-hop","1990s": "pop",
    "alternative electronic": "electronic","art pop": "pop","british": "pop","complextro": "electronic","ballad": "pop","latin": "pop","composer": "pop","death by murder": "rap",
    "electronic": "electronic","adult contemporary": "pop","algerian": "rap","italo-disco": "pop","boy band": "pop","club/dance": "electronic","electro house": "electronic",
    "boom bap": "rap","audiobook": "rap","hip hop rnb and dance hall": "hip-hop","80s": "rock","afrobeat": "electronic","blue-eyed soul": "blues","g-funk": "rap","piano rock": "rock",
    "american": "blues","bro-country": "country","bass": "electronic","anti vax": "pop","american idol": "pop","classic pop and rock": "pop","alternative metal": "rock",
    "indie": "pop","crunkcore": "pop","british soul": "rap","alternative": "pop","brostep": "electronic","_consistency": "pop","black gospel": "blues","afrobeats": "electronic",
    "bitpop": "pop","indie pop": "pop","dance-rock": "rock","pop rock": "pop","edm": "electronic","celtic": "pop","audio drama": "rap","hard rock": "rock","pop rap": "rap",
    "country pop": "country","1970s": "pop","chicago drill": "rap","spike": "rap", "comedy rap": "rap", "indie rock":'rock', "indie folk":'pop', 'alt rock':'rock', 'aor': 'rap','90s': 'pop', "'":"pop", "_fix whosampled dupe url":'pop', "1980s": 'pop',
}

df["genre"] = df["genre"].map(genre_map).fillna(df["genre"])

### Detecting languages

In [16]:
# Function to detect language
def detect_language(text):
    try:
        return detect(text)
    except:
        return None

In [17]:
df['language'] = df['lyrics'].apply(lambda x: detect_language(x) if detect_language(x) else False)

In [18]:
df.value_counts('language')

language
en    480
es      3
sw      1
Name: count, dtype: int64

- Since, there are only 4 songs that are not English, it doesn't make sense to use a model from Hugging face to translate them. So we are going to drop this rows

In [19]:
# Drop rows where the language is not English
df = df[df['language'] == 'en'].reset_index(drop=True)
# Drop the 'language' column
df = df.drop(columns=['language'])

### Applying the model & Adding and tidying columns

- We are going to apply directly the model without more data engineering like lemmatization or stemming, since transformers are trained on raw, natural language. The model was pretrained on large corpora of naturally occurring text (books, social media, etc.) without lemmatization or stemming.

In [20]:
## Classifier model
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)
tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")

# Function to truncate text to the token limit
def truncate_text(text, max_tokens=512):
	# Use tokenizer.encode to handle truncation and special tokens
	encoded = tokenizer.encode(text, truncation=True, max_length=max_tokens, add_special_tokens=True)
	return tokenizer.decode(encoded, skip_special_tokens=True)

# Apply the classifier to lyrics
emotion_scores = df['lyrics'].apply(lambda x: classifier(truncate_text(x)))
# Labels to columns and then to df to concat
emotion_scores = emotion_scores.apply(lambda x: {d['label']: d['score'] for d in x[0]})
emotion_scores = pd.DataFrame(list(emotion_scores))
df = pd.concat([df, emotion_scores], axis=1)




Device set to use cpu


In [21]:
## New columns for emotions
df['top_emotion'] = df[['anger', 'joy', 'sadness']].idxmax(axis=1)
## Drop unnecessary columns & round
df = df.drop(columns=['disgust','neutral','fear','surprise'])
df[['anger', 'joy', 'sadness']] = (df[['anger', 'joy', 'sadness']] * 100).round(2)
# Add a column with the number of words in each song's lyrics
df['num_words'] = df['lyrics'].str.split().apply(len)
df.head(10)

Unnamed: 0,year,pos,title,artist,lyrics,genre,anger,joy,sadness,top_emotion,num_words
0,2005,1,We Belong Together,Mariah Carey,"Sweet love, yeah I didn't mean it when...",pop,2.34,3.08,79.6,sadness,465
1,2005,2,Hollaback Girl,Gwen Stefani,"Uh-huh, this my shit All the girls, stomp ...",pop,16.38,6.33,13.48,anger,455
2,2005,3,Let Me Love You,Mario,"Mmm, ah Mmm, yeah Mmm Yeah, yeah, yeah ...",unknown,10.44,33.3,18.47,joy,492
3,2005,4,Since U Been Gone,Kelly Clarkson,"Here's the thing, we started out friendsI...",pop,8.33,16.04,14.95,joy,256
4,2005,5,"1, 2 Step",Ciara,"Ladies and gentlemen (Ladies and gentlemen,...",r&b,19.6,35.38,8.97,joy,487
5,2005,6,Gold Digger,Kanye West,"She take my money when I'm in needYeah, she's...",rap,15.95,9.82,18.2,sadness,738
6,2005,7,Boulevard of Broken Dreams,Green Day,I walk a lonely roadThe only one that I h...,rock,1.71,1.12,45.68,sadness,248
7,2005,8,Candy Shop,50 Cent,"Yeah, uh-huhSo seductive I'll take y...",2000s,5.73,42.15,6.62,joy,548
8,2005,9,Don't Cha,The Pussycat Dolls,"Okay ( Ahh ) Yeah ( Ahh ) Oh, we about to...",pop,11.44,27.39,12.05,joy,704
9,2005,10,Behind These Hazel Eyes,Kelly Clarkson,"Oh-oh, oh, oh, oh Oh-oh, oh, oh Seems...",pop,1.55,1.24,77.48,sadness,258


In [22]:
# Save the DataFrame to a CSV file
df.to_csv('billboard_top25_songs.csv', index=False)

##
<p style="background-color:#178268; font-family: arial black; color:#FFF9ED; font-size: 300%; text-align: center;">Spotify DataFrame</p>

### Extracting Top 100 Most History Stremed Songs by Scrapping Wikipedia

In [23]:
def top_songs_spotify():
    data = []
    # Paramteres for Web Scraping
    url = f"https://en.wikipedia.org/wiki/List_of_Spotify_streaming_records"
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")
    table = soup.find("table", class_="wikitable")
    

    # Extracting the data from the table
    for row in table.find_all("tr")[1:100+1]:
        cols = row.find_all(["th", "td"])
        if not cols:
            continue
        
        # Extracting position, title, and artist. And adding the year to the data
        pos = int(cols[0].get_text(strip=True).replace(".", ""))
        title = cols[1].get_text(strip=True).strip('"')
        artist = cols[2].get_text(strip=True)
        
        data.append({"pos": pos, "title": title, "artist": artist})
    
    # data to df and sorting
    df = pd.DataFrame(data)
    df = df.sort_values(by=["pos"]).reset_index(drop=True)
    # Tidy artist names
    df['artist'] = df['artist'].str.split(r'featuring|and|&|with').str[0].str.strip()
    
    return df

In [24]:
df_spotify = top_songs_spotify()
df_spotify.head(10)

Unnamed: 0,pos,title,artist
0,1,Blinding Lights,The Weeknd
1,2,Shape of You,Ed Sheeran
2,3,Starboy,The Weeknd
3,4,Someone You Loved,Lewis Capaldi
4,5,As It Was,Harry Styles
5,6,Sunflower,Post Malone
6,7,Sweater Weather,The Neighbourhood
7,8,One Dance,Drake
8,9,Stay,The Kid Laroi
9,10,Perfect,Ed Sheeran


### Extracting Songs Lyrics by Scraping Genius.com

- We are going to do the same process as the previous dataframe of discarding the songs we weren't able to get succesfully the lyrics.

In [25]:
df_spotify = fetch_lyrics_for_df(df_spotify)

# Filter df_spotify to keep only the songs with lyrics
not_starting_with_label = df_spotify['lyrics'].dropna().apply(lambda x: not any(x.startswith(label) for label in labels))
not_starting_with_label.sum()
# Drop rows where lyrics do not start with any of the specified labels
df_spotify = df_spotify.loc[~not_starting_with_label].reset_index(drop=True)
# Tidy the lyrics
df_spotify['lyrics'] = df_spotify['lyrics'].str.replace('\n', ' ', regex=False)
df_spotify['lyrics'] = df_spotify['lyrics'].str.replace(r'\[.*?\]', ' ', regex=True)

df_spotify.head(10)

Unnamed: 0,pos,title,artist,lyrics
0,1,Blinding Lights,The Weeknd,Yeah I've been tryna call I've been...
1,2,Shape of You,Ed Sheeran,A club isn't the best place to find a lover...
2,3,Starboy,The Weeknd,Ayy I'm tryna put you in the worst m...
3,4,Someone You Loved,Lewis Capaldi,"I'm going under, and this time, I fear ther..."
4,5,As It Was,Harry Styles,"Come on, Harry, we wanna say goodnight to y..."
5,6,Sunflower,Post Malone,"Ayy, ayy, ayy, ayy (Ooh) Ooh, ooh, ooh, oo..."
6,7,Sweater Weather,The Neighbourhood,And all I am is a manI want the world in my...
7,8,One Dance,Drake,"Grips on your waist, front way, back wayYou..."
8,9,Stay,The Kid Laroi,I do the same thing I told you that I never...
9,10,Perfect,Ed Sheeran,"I found a love for meOh, darlin', just dive..."


### Adding the genre of the artist with musicbrainzngs

- Same process as in the earlier DataFrame

In [26]:
df_spotify = find_genres(df_spotify)
# Map genres to a common set
df_spotify["genre"] = df_spotify["genre"].map(genre_map).fillna(df_spotify["genre"])

Fetching genres: 100%|██████████| 99/99 [03:17<00:00,  1.99s/it]


### Dropping songs that aren't in English

In [27]:
df_spotify['language'] = df_spotify['lyrics'].apply(lambda x: detect_language(x) if detect_language(x) else False)
# Drop rows where the language is not English
df_spotify = df_spotify[df_spotify['language'] == 'en'].reset_index(drop=True)
# Drop the 'language' column
df_spotify = df_spotify.drop(columns=['language'])

### Applying the model & Adding and tidying columns

In [28]:
# Apply the classifier to lyrics
emotion_scores = df_spotify['lyrics'].apply(lambda x: classifier(truncate_text(x)))
# Labels to columns and then to df to concat
emotion_scores = emotion_scores.apply(lambda x: {d['label']: d['score'] for d in x[0]})
emotion_scores = pd.DataFrame(list(emotion_scores))
df_spotify = pd.concat([df_spotify, emotion_scores], axis=1)

## New columns for emotions
df_spotify['top_emotion'] = df_spotify[['anger', 'joy', 'sadness']].idxmax(axis=1)
## Drop unnecessary columns & round
df_spotify = df_spotify.drop(columns=['disgust','neutral','fear','surprise'])
df_spotify[['anger', 'joy', 'sadness']] = (df_spotify[['anger', 'joy', 'sadness']] * 100).round(2)
# Add a column with the number of words in each song's lyrics
df_spotify['num_words'] = df_spotify['lyrics'].str.split().apply(len)

df_spotify.head(10)

Unnamed: 0,pos,title,artist,lyrics,genre,anger,joy,sadness,top_emotion,num_words
0,1,Blinding Lights,The Weeknd,Yeah I've been tryna call I've been...,2010s,6.22,2.92,53.13,sadness,241
1,2,Shape of You,Ed Sheeran,A club isn't the best place to find a lover...,2010s,7.42,17.78,9.16,joy,672
2,3,Starboy,The Weeknd,Ayy I'm tryna put you in the worst m...,2010s,16.2,9.6,45.86,sadness,429
3,4,Someone You Loved,Lewis Capaldi,"I'm going under, and this time, I fear ther...",blues,0.91,1.02,4.55,sadness,314
4,5,As It Was,Harry Styles,"Come on, Harry, we wanna say goodnight to y...",pop,6.15,9.77,31.24,sadness,233
5,6,Sunflower,Post Malone,"Ayy, ayy, ayy, ayy (Ooh) Ooh, ooh, ooh, oo...",2010s,7.01,5.96,31.07,sadness,299
6,7,Sweater Weather,The Neighbourhood,And all I am is a manI want the world in my...,pop,10.28,9.48,38.64,sadness,338
7,8,One Dance,Drake,"Grips on your waist, front way, back wayYou...",r&b,6.33,17.46,24.53,sadness,405
8,9,Stay,The Kid Laroi,I do the same thing I told you that I never...,rap,7.52,1.83,54.16,sadness,399
9,10,Perfect,Ed Sheeran,"I found a love for meOh, darlin', just dive...",2010s,2.89,28.91,33.1,sadness,281


In [29]:
# Save the DataFrame to a CSV file
df_spotify.to_csv('spotify_most_stremed_songs.csv', index=False)