In [None]:
# Mount Google Drive at /content/drive so the CSV paths configured in the next cell resolve.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime -- confirm
# before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [2]:
# Set paths here -- YOU WILL NEED TO CHANGE THESE TO WORK ON YOUR SYSTEM.
# Both files live in the same Drive folder, so the folder is spelled out once.
_PROJECT_FOLDER = '/content/drive/MyDrive/musicProfanity (1)/'

# Input: the Billboard Hot 100 chart CSV that is read with pandas further down.
BILLBOARD_HOT_100_1946_2022 = _PROJECT_FOLDER + 'Billboard_Hot100_Songs_Spotify_1946-2022.csv'

# Output: 'billboard_top_100_1946_2022_lyrics.csv' will be the name of the saved file.
PATH_FOR_SAVING_DATA_SET = _PROJECT_FOLDER + 'billboard_top_100_1946_2022_lyrics.csv'


In [3]:
#import libraries
import requests
from bs4 import BeautifulSoup
import json
import re
import pandas as pd
import ast
import time
import random
import unicodedata

In [None]:
def getURL(artist,song):
  """Build the lyricfind lyrics-page URL for an artist/song pair.

  Spaces inside both names are turned into hyphens, then the artist slug and
  the song slug are joined with a hyphen and appended to the fixed base URL.
  No other characters are altered here; callers clean the names beforehand.
  """
  slug = "-".join(part.replace(" ", "-") for part in (artist, song))
  return 'https://lyrics.lyricfind.com/lyrics/' + slug


def remove_accents(input_str):
    """Return input_str with accents (Unicode combining marks) stripped.

    The text is NFKD-decomposed so that an accented letter becomes a base
    letter followed by combining marks; every combining code point is then
    dropped and the remaining characters are joined back together.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    base_characters = filter(lambda ch: not unicodedata.combining(ch), decomposed)
    return ''.join(base_characters)


def remove_special_characters(name):
    """Reduce a name to lowercase ASCII letters, digits and single spaces.

    Apostrophes are deleted outright so that contractions stay one word
    ("Don't" -> "dont"). This covers the straight apostrophe, the curly
    quotes U+2019 / U+2018, and the mis-decoded form of U+2019 ('â€™').
    Every other character that is not a letter, digit or whitespace becomes a
    space; runs of whitespace are collapsed to one space and the result is
    stripped and lower-cased.

    Args:
        name: the artist or song name to clean.

    Returns:
        The cleaned, lower-cased name (possibly an empty string).
    """
    # Delete apostrophes before the regex runs; otherwise a curly apostrophe
    # would be turned into a space and split the word in two ("don t").
    for apostrophe in ("'", '\u2019', '\u2018', 'â€™'):
        name = name.replace(apostrophe, '')
    name = re.sub(r'[^a-zA-Z0-9\s]', ' ', name)  # replace other special characters with a space
    name = re.sub(r'\s+', ' ', name)  # replace multiple spaces with just one
    return name.strip().lower()


def remove_unnecessary_song_info(song_name):
    """Trim trailing version/credit information from a song title.

    Exactly one rule is applied, in this priority order:
      1. keep only the text before the first " - "
      2. keep only the text before the first " featuring "
      3. keep only the text before the first " remix "
      4. otherwise delete every "[...]" group (a no-op when there is none)

    The separators are matched case-sensitively against the title as given.
    Whichever rule fires, the result is stripped and lower-cased, so every
    path returns the same normalised form (previously only the fall-through
    path did, and the bracket rule could leave trailing whitespace).

    Args:
        song_name: the raw song title.

    Returns:
        The trimmed, stripped, lower-cased title.
    """
    for separator in (" - ", " featuring ", " remix "):
        if separator in song_name:
            trimmed = song_name.split(separator)[0]
            break
    else:
        # The pattern needs both "[" and "]", so titles without a complete
        # square-bracket group pass through unchanged.
        trimmed = re.sub(r'\[[^\]]*\]', '', song_name)
    return trimmed.strip().lower()

Load the Billboard CSV into a pandas DataFrame called `df`.

In [None]:
file_path = BILLBOARD_HOT_100_1946_2022
df = pd.read_csv(file_path)

# Get the lyrics dataframe, dropping the extra Spotify information.
# .copy() makes lyrics_df an independent frame, so the column assignments
# below modify it directly instead of a possible view of df
# (avoids pandas' SettingWithCopyWarning).
lyrics_df = df.loc[:, ['Song', 'Artist Names', 'Hot100 Ranking Year', 'Hot100 Rank']].copy()

# Add a column for the lyrics and fill its cells with None (object dtype).
lyrics_df['Lyrics'] = None


def normalize_artist_name(artist_name):
    """Turn one artist name into the form used when building the lyrics URL.

    Accents are stripped, '$' and '!' are mapped to the letters they stand in
    for, remaining special characters are removed, and a standalone " and "
    is replaced with a single space.
    """
    artist_name = remove_accents(artist_name)
    artist_name = artist_name.replace('$', 's')  # A$AP -> asap
    artist_name = artist_name.replace('!', 'i')  # p!nk -> pink
    artist_name = remove_special_characters(artist_name)
    # Need the space before and after to avoid removing the first part of a name like andrew.
    return artist_name.replace(" and ", " ")


# The CSV stores each artist list as a string that looks like a Python list:
# parse it into a real list and normalise every name in it.
lyrics_df['Artist Names'] = lyrics_df['Artist Names'].apply(
    lambda raw_names: [normalize_artist_name(artist_name) for artist_name in ast.literal_eval(raw_names)]
)

# One list of cleaned artist names per song, in the same order as lyrics_df.
artists = lyrics_df['Artist Names']

In [None]:
# Song titles exactly as they appear in the chart data.
songs = lyrics_df['Song']

# For every title: first cut off certain character sequences and everything after
# them (remove_unnecessary_song_info), then strip accents from what is left.
processed_songs = [
    remove_accents(remove_unnecessary_song_info(title))
    for title in songs
]


The code below fetches the lyrics. It takes about 1h30 to try to get lyrics for nearly 7000 songs.

In [None]:
# Value of the embedded response code that the page data carries when lyrics were found.
LYRICS_FOUND_CODE = 101
# Upper bound (seconds) for one HTTP request so a stalled connection cannot hang the run.
REQUEST_TIMEOUT_SECONDS = 30


def fetch_lyrics(artist, title):
    """Request the lyrics page for one artist/title pair and extract the lyrics.

    The page embeds its data as JSON in the <script id="__NEXT_DATA__"> tag.
    When that JSON reports LYRICS_FOUND_CODE, the lyrics text is stripped of
    everything except letters, digits and whitespace, lower-cased and split
    into a list of words.

    Returns:
        The list of lyric words, or None when the request fails, the page has
        no usable embedded JSON, or the response code is not a success.
    """
    url = getURL(artist, title)
    try:
        page = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as error:
        # One unreachable page must not abort a run that takes over an hour.
        print('request failed for', url, '-', error)
        return None

    soup = BeautifulSoup(page.content, "html.parser")
    script_tag = soup.find('script', id='__NEXT_DATA__')  # script tag holding the page data
    if script_tag is None or not script_tag.string:
        return None

    try:
        song_data = json.loads(script_tag.string)['props']['pageProps']['songData']
        if song_data['response']['code'] != LYRICS_FOUND_CODE:
            return None
        lyrics_text = re.sub(r'[^a-zA-Z0-9\s]', '', song_data['track']['lyrics'])
    except (ValueError, KeyError, TypeError):
        # Malformed JSON or an unexpected page structure: treat as "not found".
        return None

    # Convert the entire string to lowercase, then split into a list of words.
    return lyrics_text.lower().split()


def keep_single_bracket(song_title, bracket_matches, keep_position):
    """Drop every "(...)" group except one, whose contents are kept without the brackets."""
    pieces = []
    cursor = 0
    for position, bracket in enumerate(bracket_matches):
        pieces.append(song_title[cursor:bracket.start()])
        if position == keep_position:
            pieces.append(' ' + bracket.group(1) + ' ')
        cursor = bracket.end()
    pieces.append(song_title[cursor:])
    return ''.join(pieces)


def candidate_titles(song_title):
    """List the cleaned title spellings to try for a song, in priority order.

    A title without a complete "(...)" group yields just its cleaned form
    (this also covers a stray, unbalanced bracket). Otherwise the variants are:
    bracket groups removed, brackets removed but contents kept, only the
    first group's contents kept, only the last group's contents kept, and the
    first group's contents moved to the end. Variants that clean to the same
    string are listed once, so the same URL is not requested twice.
    """
    bracket_matches = list(re.finditer(r'\((.*?)\)', song_title))
    if not bracket_matches:
        return [remove_special_characters(song_title)]

    without_groups = re.sub(r'\((.*?)\)', '', song_title)
    variants = [
        without_groups,
        re.sub(r'\((.*?)\)', r'\1', song_title),
        keep_single_bracket(song_title, bracket_matches, 0),
        keep_single_bracket(song_title, bracket_matches, len(bracket_matches) - 1),
        without_groups.strip() + ' ' + bracket_matches[0].group(1),
    ]
    # Clean only now: the brackets had to survive until the variants were built.
    cleaned_variants = [remove_special_characters(variant) for variant in variants]
    return list(dict.fromkeys(cleaned_variants))  # de-duplicate, keeping order


def find_lyrics(artist_names, titles):
    """Try every artist with every title variant; return the first lyrics found, else None."""
    for artist in artist_names:
        for title in titles:
            lyrics = fetch_lyrics(artist, title)
            if lyrics is not None:
                return lyrics
    return None


# Only songs that still have no lyrics are attempted, so the cell can be re-run to retry misses.
# NOTE: i is used both as a row label and as a list position; this relies on
# lyrics_df having the default 0..n-1 index produced by read_csv.
for i in lyrics_df[pd.isna(lyrics_df['Lyrics'])].index:
    print(i)  # prints only the song index so the user can see progress

    lyrics = find_lyrics(artists.iloc[i], candidate_titles(processed_songs[i]))
    if lyrics is not None:
        # .at writes the list into the single cell of lyrics_df itself; the chained
        # form lyrics_df['Lyrics'][i] = ... may write to a temporary copy instead.
        lyrics_df.at[i, 'Lyrics'] = lyrics



In [None]:
# Save the dataset (the chart columns plus the 'Lyrics' column) to PATH_FOR_SAVING_DATA_SET,
# i.e. billboard_top_100_1946_2022_lyrics.csv. index=False leaves the DataFrame index out of the file.
# NOTE(review): the 'Lyrics' cells hold Python lists, which are presumably written as their
# string form -- confirm how they are parsed back when this CSV is loaded again.
lyrics_df.to_csv(PATH_FOR_SAVING_DATA_SET, index=False)


In [None]:
# Summary of the scraping run: a song counts as missed when its 'Lyrics' cell is still empty.
total_songs = len(lyrics_df)
missed_songs = len(lyrics_df[pd.isna(lyrics_df['Lyrics'])].index)

print('total songs in dataset: ', total_songs)

missed_percent = missed_songs / total_songs * 100
print('Percent songs obtained: ', (100 - missed_percent), "%")
print('number of songs obtained: ', total_songs - missed_songs)
print('Percent songs missed: ', missed_percent, "%")
print('number of songs missed: ', missed_songs)