In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
from time import sleep
import csv
import json
import re

In [2]:
# file containing artists - songs mapping
songs_json = "Artists-Songs Mapping.json"
songs_dict = {}

with open(songs_json) as file:
    songs_dict = json.load(file)
    

In [3]:
# artist for which the lyrics need to be written
artist = "michaeljackson"
songs = songs_dict[artist]
processed_songs = []

# preprocessing the songs name for scraping
for song in songs:
    numbers_in_brackets_removed = re.sub(r'\(.*\)',"",song)
    processed_song = re.sub(r'\W+', '', numbers_in_brackets_removed).lower()
    processed_songs.append(processed_song)
    
print(len(processed_songs))

# Removing duplicate songs
processed_songs = list(set(processed_songs))
print(len(processed_songs))
print(processed_songs[:20])

298
276
['lovesgonebad', 'iwannabewhereyouaredallasaustinremix', 'willyoubethere', 'blameitontheboogiethejacksons', 'earthsong', 'lovingyou', 'allthethingsyouare', 'whoslookingforalover', 'thisplacehotelthejacksons', 'wevehadenough', 'getonthefloor', 'peoplemaketheworldgoround', 'justalittlebitofyou', 'healtheworld', 'mygirl', '2bad', 'itsthefallinginlove', 'dangerous', 'hereiam', 'girlfriend']


In [4]:
# url to scrape the lyrics from
base_url = "https://www.azlyrics.com/lyrics/{}/{}.html"

# file in which the lyrics would be saved
lyrics_file = "lyrics_scraped9.txt"

lyrics_not_found_for = []

# delay after each execution of call for not exceeding the requests count and also not to overburden the server
delay = 10

with open(lyrics_file, "w") as file:
    
    for song in processed_songs:
        final_url = base_url.format(artist,song)

        try:
            html_page = urlopen(final_url)
            soup = BeautifulSoup(html_page, 'html.parser')

            html_pointer = soup.find('div', attrs={'class':'ringtone'})
            song_name = html_pointer.find_next('b').contents[0].strip()
            lyrics = html_pointer.find_next('div').text.strip()

            file.write("###"+song_name+"###")
            file.write("\n\n")
            file.write(lyrics)
            file.write("\n\n")
            
            print("Lyrics successfully written to file for : " + song_name)
            
        except:
            print("Lyrics not found for : " + song) 
            lyrics_not_found_for.append(song)
            
        finally:
            sleep(delay)
            

Lyrics not found for : lovesgonebad
Lyrics not found for : iwannabewhereyouaredallasaustinremix
Lyrics successfully written to file for : "Will You Be There"
Lyrics not found for : blameitontheboogiethejacksons
Lyrics successfully written to file for : "Earth Song"
Lyrics successfully written to file for : "Loving You"
Lyrics successfully written to file for : "All The Things You Are"
Lyrics not found for : whoslookingforalover
Lyrics not found for : thisplacehotelthejacksons
Lyrics successfully written to file for : "We've Had Enough"
Lyrics successfully written to file for : "Get On The Floor"
Lyrics successfully written to file for : "People Make The World Go Round"
Lyrics successfully written to file for : "Just A Little Bit Of You"
Lyrics successfully written to file for : "Heal The World"
Lyrics successfully written to file for : "My Girl"
Lyrics successfully written to file for : "2 Bad"
Lyrics successfully written to file for : "It's The Falling In Love"
Lyrics successfully wri

In [5]:
print("Total count : ", len(songs),"songs")
print("Lyrics successfully scraped for : ", len(songs)-len(lyrics_not_found_for), "songs")
print("Lyrics not found for :", len(lyrics_not_found_for),"songs\n")
print("\nHere's the list :\n")
print(lyrics_not_found_for)

Total count :  298 songs
Lyrics successfully scraped for :  178 songs
Lyrics not found for : 120 songs


Here's the list :

['lovesgonebad', 'iwannabewhereyouaredallasaustinremix', 'blameitontheboogiethejacksons', 'whoslookingforalover', 'thisplacehotelthejacksons', 'hereiam', 'isitscarythreatened', 'dancingmachine', 'beatitstateofshock', 'bloodonthedancefloorxdangerousthewhitepandamash', 'dancingmachinesteveaokiremix', 'givemehalfachance', 'sunsetdriverdemo', 'maybetomorrowsturkenrogersremix', 'vincentprice', 'healingtheworld', 'planetearthearthsong', 'theloveyousave', 'motowncalling', 'dancingmachinesingleversion', 'interviewwithquincyjonesandrodtemperton', 'nevercansaygoodbyetheneptunesremix', 'nevercansaygoodbyethejackson5', 'youarenotaloneijustcantstoplovingyou', 'workindayandnight', 'wearetheworld', 'itstoolatetochangethetimethejackson5', 'lonelyteardrops', 'billiejean2008withkanyewest', 'greatestshowonearth', 'pyt2008withmichaeljacksonandwilliampreviouslyunreleasedtrackfo', 'who