# AZLyrics Scraper

With this code you can scrape individual lyrics pages, or whole artist pages, for some of their basic data. We start by importing the relevant libraries:

In [None]:
import sys
import csv
import requests
import re
import time
import random
from bs4 import BeautifulSoup

Then we define helper functions that download a page from a given URL and extract the text of a given element within it:

In [None]:
def load_page(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole scrape.

    Returns:
        The decoded response body (``Response.text``). The HTTP status code
        is not checked, so error pages are returned like any other page.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    # Using the response as a context manager closes it once the text is read.
    with requests.get(url, timeout=timeout) as response:
        return response.text

def get_element_text(element):
    """Return the whitespace-stripped ``.text`` of *element*.

    If the attribute access or the ``strip`` call raises ``AttributeError``
    (for example because *element* is ``None``), the error is reported on
    stderr and an empty string is returned instead.
    """
    try:
        raw_text = element.text
        cleaned = raw_text.strip()
    except AttributeError as err:
        message = 'Element not found, error: {}'.format(err)
        print(message, file=sys.stderr)
        return ''
    else:
        return cleaned

## Getting the individual lyrics
Next we define a function that extracts the basic information about a song from its AZLyrics song page:

In [None]:
def get_song_info(url):
    """Scrape album, release year, credits and lyrics from one song page.

    Args:
        url: Address of the song page to download and parse.

    Returns:
        A dict with the keys ``'album'``, ``'album release'``, ``'credits'``
        and ``'lyrics'`` (all strings, empty when the matching element is
        missing), or an empty dict when the page has no
        ``container main-page`` element.
    """
    song_page = BeautifulSoup(load_page(url), 'lxml')
    interesting_html = song_page.find(class_='container main-page')
    if not interesting_html:
        print('No information availible for song at {}'.format(url), file=sys.stderr)
        return {}

    # Look the album element up once and slice the same text twice, so a
    # missing album is reported a single time rather than once per field.
    album_text = get_element_text(interesting_html.find(class_='songinalbum_title'))
    # NOTE(review): the fixed offsets presumably assume text shaped like
    # `album: "Title" (1970)` -- confirm against the current page markup.
    album = album_text[8:-8]
    album_released = album_text[-5:-1]

    # Drops the first 11 characters, presumably a label such as
    # "Writer(s): " -- confirm against the current page markup.
    credits = get_element_text(interesting_html.find('small'))[11:]
    # The lyrics are taken from the first <div> that has no class attribute.
    lyrics = get_element_text(interesting_html.find('div', {'class': None}))
    return {
        'album': album,
        'album release': album_released,
        'credits': credits,
        'lyrics': lyrics,
    }

The previous functions can be tested by using the following code:

In [None]:
# Any AZLyrics song page should work here; swap in the link of your favorite song.
song_url = 'https://www.azlyrics.com/lyrics/genesis/wherethesourturnstosweet.html'
song_info = get_song_info(song_url)
# Change 'lyrics' to any other key of the returned dictionary to print that field instead.
field = 'lyrics'
if field in song_info:
    print(song_info[field])

## Getting all artist songs


In [None]:
def get_songs(url):
    """Collect the title and absolute link of every song on an artist page.

    Args:
        url: Address of the artist index page to download and parse.

    Returns:
        A list of ``{'song': title, 'link': absolute_url}`` dicts in page
        order, one per ``listalbum-item`` row that contains a link.

    Raises:
        SystemExit: If the page has no element with id ``listAlbum``.
    """
    from urllib.parse import urljoin

    index_page = BeautifulSoup(load_page(url), 'lxml')
    items = index_page.find(id="listAlbum")
    if not items:
        print('Something went wrong!', file=sys.stderr)
        # Exit with a non-zero status so the failure is not reported as success.
        sys.exit(1)

    data = []
    for row in items.find_all(class_='listalbum-item'):
        anchor = row.find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # Rows without a link have no song page to scrape; skipping them
            # avoids both a crash and a bogus ".../None" URL.
            continue
        data.append({
            'song': get_element_text(anchor),
            # Resolve the href against the index page, as a browser would,
            # so root-relative, "../"-relative and absolute links all work.
            'link': urljoin(url, href),
        })
    return data

## Scraping

The following code scrapes AZLyrics for the data on all of the given artist's songs. This may take a while, depending on the number of songs the artist has released.

In [None]:
# Artist index page to scrape; replace it with the AZLyrics link of the band you want.
index_url = 'https://www.azlyrics.com/g/genesis.html'
song_data = get_songs(index_url)
for row in song_data:
    # Progress output, which can also be useful for debugging.
    print('Scraping info on {}.'.format(row['song']))
    song_info = get_song_info(row['link'])
    # Merge the scraped fields into the song's row in place.
    row.update(song_info)
    # Pause for a random 4-16 seconds before the next request.
    time.sleep(random.uniform(4,16))


## Writing data into CSV

In this last part we write the data we have just scraped to a CSV file and then load it into a table using Python's pandas module. From the resulting dataframe we can access the data easily and perform operations on it.

In [None]:
# The csv module expects its file to be opened with newline=''. Without it,
# line endings inside quoted fields (the multi-line lyrics) are not preserved
# correctly and platforms that use \r\n get an extra \r on every row.
with open('songs.csv', 'w', encoding='utf-8', newline='') as f:
    fieldnames = ['song', 'album', 'album release', 'credits', 'lyrics']
    writer = csv.DictWriter(
        f,
        fieldnames=fieldnames,
        delimiter=',',
        quotechar='"',
        quoting=csv.QUOTE_NONNUMERIC,
        # Silently drop keys that are not columns (e.g. 'link') instead of
        # filtering every row by hand.
        extrasaction='ignore',
    )
    writer.writeheader()
    # Columns missing from a row are written as empty strings (the default
    # restval), which covers songs whose page could not be parsed.
    writer.writerows(song_data)

## Optional: Creating a Dataframe from the CSV file

In [None]:
import pandas as pd

# Load songs.csv (the file written by the CSV export step) into a DataFrame.
dataset = pd.read_csv('songs.csv') 
# Optional clean-up steps, left disabled; uncomment the ones you need:
#dataset = dataset.dropna()
#dataset['album release'] = dataset['album release'].astype(int)
#dataset['lyrics'] = dataset['lyrics'].astype('string')
# A bare expression on the last line of a notebook cell displays the DataFrame.
dataset

 