In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
from time import sleep
import csv
import json
import re

In [2]:
# file containing artists - songs mapping
songs_json = "Artists-Songs Mapping.json"

# Reset first so a failed load cannot leave a mapping from an earlier run of
# this cell in the kernel.
songs_dict = {}

# JSON text is UTF-8 by specification; naming the encoding keeps the load from
# depending on the platform's locale default (e.g. cp1252 on Windows).
with open(songs_json, encoding="utf-8") as mapping_file:
    songs_dict = json.load(mapping_file)
    

In [3]:
# artist for which the lyrics need to be written (key into songs_dict)
artist = "edsheeran"
songs = songs_dict[artist]

# preprocessing the songs name for scraping:
#   1. drop every parenthesised group, e.g. "(feat. X)" or "(Live)"
#   2. remove all non-word characters (spaces, punctuation)
#   3. lower-case the result
# The bracket pattern matches one group at a time ([^()]* cannot cross a
# bracket), so a title with two groups keeps the words between them instead
# of losing everything from the first "(" to the last ")".
processed_songs = []
for song in songs:
    without_brackets = re.sub(r'\([^()]*\)', "", song)
    processed_songs.append(re.sub(r'\W+', '', without_brackets).lower())

print(len(processed_songs))

# Removing duplicate songs.
# dict.fromkeys keeps the first occurrence of each slug in its original
# position, so the order is identical on every kernel restart (a set of
# strings is iterated in a hash-dependent order that changes between runs).
processed_songs = list(dict.fromkeys(processed_songs))
print(len(processed_songs))
print(processed_songs[:20])

70
70
['belikeyou', 'barcelona', 'bibiabeyeye', 'takeitbacksuperstition', 'coldcoffee', 'savemyself', 'theman', 'allofthestars', 'goodbyetoyou', 'uni', 'imamess', 'happier', 'nightmares', 'heartsdontbreakaroundhere', 'newman', 'howwouldyoufeel', 'littlelady', 'fall', 'givemelove', 'nancymulligan']


In [4]:
# url to scrape the lyrics from; filled with (artist, song slug)
base_url = "https://www.azlyrics.com/lyrics/{}/{}.html"

# file in which the lyrics would be saved
lyrics_file = "lyrics_scraped2.txt"

# slugs for which no lyrics could be written (any failure while fetching or parsing)
lyrics_not_found_for = []

# delay after each execution of call for not exceeding the requests count and also not to overburden the server
delay = 10

# upper bound (seconds) for a single HTTP request, so one stalled connection
# cannot hang the whole run
request_timeout = 30

# Explicit UTF-8: lyrics may contain characters the platform's default
# encoding cannot represent.
with open(lyrics_file, "w", encoding="utf-8") as file:

    for song in processed_songs:
        final_url = base_url.format(artist, song)

        try:
            # The context manager closes the HTTP response once it is parsed.
            with urlopen(final_url, timeout=request_timeout) as html_page:
                soup = BeautifulSoup(html_page, 'html.parser')

            # The title and the lyrics are looked up relative to the
            # "ringtone" div: the next <b> and the next <div> after it.
            html_pointer = soup.find('div', attrs={'class': 'ringtone'})
            song_name = html_pointer.find_next('b').contents[0].strip()
            lyrics = html_pointer.find_next('div').text.strip()

            # Build the whole entry first and write it in one call so a
            # failure can never leave a title header without its lyrics.
            file.write("###" + song_name + "###" + "\n\n" + lyrics + "\n\n")

            print("Lyrics successfully written to file for : " + song_name)

        except Exception as error:
            # Best effort: record the song and continue with the next one.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit stop the run; the exception type is printed so an
            # HTTP error can be told apart from an unexpected page layout.
            print("Lyrics not found for : " + song + " (" + type(error).__name__ + ")")
            lyrics_not_found_for.append(song)

        # Throttle after every attempt, successful or not. Kept outside a
        # `finally` so an interrupt is not delayed by another `delay` seconds.
        sleep(delay)
            

Lyrics successfully written to file for : "Be Like You"
Lyrics successfully written to file for : "Barcelona"
Lyrics successfully written to file for : "Bibia Be Ye Ye"
Lyrics not found for : takeitbacksuperstition
Lyrics successfully written to file for : "Cold Coffee"
Lyrics successfully written to file for : "Save Myself"
Lyrics successfully written to file for : "The Man"
Lyrics successfully written to file for : "All Of The Stars"
Lyrics successfully written to file for : "Goodbye To You"
Lyrics successfully written to file for : "UNI"
Lyrics successfully written to file for : "I'm A Mess"
Lyrics successfully written to file for : "Happier"
Lyrics successfully written to file for : "Nightmares"
Lyrics successfully written to file for : "Hearts Don't Break Around Here"
Lyrics successfully written to file for : "New Man"
Lyrics not found for : howwouldyoufeel
Lyrics successfully written to file for : "Little Lady"
Lyrics successfully written to file for : "Fall"
Lyrics successfully 

In [5]:
# Summary of the scraping run.
# The counts are based on processed_songs (the de-duplicated list the scraping
# loop iterated over), not on the raw songs list: any duplicates collapsed
# during preprocessing were never requested and must not count as scraped.
attempted_count = len(processed_songs)
failed_count = len(lyrics_not_found_for)

print("Total count : ", attempted_count,"songs")
print("Lyrics successfully scraped for : ", attempted_count-failed_count, "songs")
print("Lyrics not found for :", failed_count,"songs\n")
print("\nHere's the list :\n")
print(lyrics_not_found_for)

Total count :  70 songs
Lyrics successfully scraped for :  55 songs
Lyrics not found for : 15 songs


Here's the list :

['takeitbacksuperstition', 'howwouldyoufeel', 'dontgobreakingmyheart', 'layitallonme', 'bloodstream', 'radio', 'she', 'this', 'wakemeup', 'dont', 'firefly', 'onenight', 'littlebird', 'smallbump', 'nina']
