# Metrolyrics (Pagination, scraping 1x per row)
Link : http://www.metrolyrics.com/rage-against-the-machine-lyrics.html

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
# http://www.metrolyrics.com/rage-against-the-machine-alpage-1.html

In [3]:
rows = []

for page_num in range(1,4):
    url = "http://www.metrolyrics.com/nirvana-alpage-" + str(page_num)
    
    response = requests.get(url)
    doc = BeautifulSoup(response.text)

    songs = doc.find("tbody").find_all('tr')
    for song in songs:
        row = {}

        row['title'] = song.find_all('td')[1].find('a').text.strip()
        row['slug'] = song.find_all('td')[1].find('a')['href']
        row['year'] = song.find_all('td')[2].text.strip()
        row['popularity'] = song.find_all('span')[1].get('style')

        rows.append(row)
        
print(rows)

[{'title': '(new Wave) Polly [mark Goodier Session] Lyrics', 'slug': 'http://www.metrolyrics.com/new-wave-polly-mark-goodier-session-lyrics-nirvana.html', 'year': '2006', 'popularity': 'width:6%;'}, {'title': '02 Lyrics', 'slug': 'http://www.metrolyrics.com/02-lyrics-nirvana.html', 'year': '2017', 'popularity': 'width:6%;'}, {'title': 'About A Girl Lyrics', 'slug': 'http://www.metrolyrics.com/about-a-girl-lyrics-nirvana.html', 'year': '1991', 'popularity': 'width:78.283044535628%;'}, {'title': 'Aero Zeppelin Lyrics', 'slug': 'http://www.metrolyrics.com/aero-zeppelin-lyrics-nirvana.html', 'year': '2006', 'popularity': 'width:6%;'}, {'title': "Ain't It A Shame Lyrics", 'slug': 'http://www.metrolyrics.com/aint-it-a-shame-lyrics-nirvana.html', 'year': '2007', 'popularity': 'width:16.945767347523%;'}, {'title': 'Alcohol / High On The Hog Lyrics', 'slug': 'http://www.metrolyrics.com/alcohol-high-on-the-hog-lyrics-nirvana.html', 'year': '2006', 'popularity': 'width:6%;'}, {'title': 'All Apolo

In [3]:
df = pd.DataFrame(rows, columns=['title', 'year', 'popularity', 'slug'])
df.head()

Unnamed: 0,title,year,popularity,slug
0,(new Wave) Polly [mark Goodier Session] Lyrics,2006,width:6%;,http://www.metrolyrics.com/new-wave-polly-mark...
1,02 Lyrics,2017,width:6%;,http://www.metrolyrics.com/02-lyrics-nirvana.html
2,About A Girl Lyrics,1991,width:78.283044535628%;,http://www.metrolyrics.com/about-a-girl-lyrics...
3,Aero Zeppelin Lyrics,2006,width:6%;,http://www.metrolyrics.com/aero-zeppelin-lyric...
4,Ain't It A Shame Lyrics,2007,width:16.945767347523%;,http://www.metrolyrics.com/aint-it-a-shame-lyr...


In [4]:
df.title = df.title.str.extract("(.*)\sLyrics")
df.year = df.year.astype(int)

## Bonus
df.popularity = df.popularity.str.extract(":(.*)%").astype(float)

df.slug = df.slug.str.extract(".com(.*)")

In [5]:
df.head()

Unnamed: 0,title,year,popularity,slug
0,(new Wave) Polly [mark Goodier Session],2006,6.0,/new-wave-polly-mark-goodier-session-lyrics-ni...
1,02,2017,6.0,/02-lyrics-nirvana.html
2,About A Girl,1991,78.283045,/about-a-girl-lyrics-nirvana.html
3,Aero Zeppelin,2006,6.0,/aero-zeppelin-lyrics-nirvana.html
4,Ain't It A Shame,2007,16.945767,/aint-it-a-shame-lyrics-nirvana.html


In [6]:
df.to_csv("slugs.csv", index=False)

In [7]:
df = pd.read_csv("slugs.csv")
df.head()

Unnamed: 0,title,year,popularity,slug
0,(new Wave) Polly [mark Goodier Session],2006,6.0,/new-wave-polly-mark-goodier-session-lyrics-ni...
1,02,2017,6.0,/02-lyrics-nirvana.html
2,About A Girl,1991,78.283045,/about-a-girl-lyrics-nirvana.html
3,Aero Zeppelin,2006,6.0,/aero-zeppelin-lyrics-nirvana.html
4,Ain't It A Shame,2007,16.945767,/aint-it-a-shame-lyrics-nirvana.html


In [14]:
rows = []

def scrape_page(row):
    url = "http://www.metrolyrics.com" + row['slug']
    
    response = requests.get(url)
    doc = BeautifulSoup(response.text)
    
    page = {}

    try:
        for lyrics in doc.find(id="lyrics-body-text").find_all(class_="verse"):
            page['lyrics'] = lyrics.text.strip()
    except: pass
    
    rows.append(page)

    return pd.Series(page)

In [15]:
lyrics_df = df.apply(scrape_page, axis=1)

In [16]:
all_df = df.merge(lyrics_df, left_index=True, right_index=True)
all_df.head(10)

Unnamed: 0,title,year,popularity,slug,lyrics
0,(new Wave) Polly [mark Goodier Session],2006,6.0,/new-wave-polly-mark-goodier-session-lyrics-ni...,Polly says her back hurts\nAnd she's just as b...
1,02,2017,6.0,/02-lyrics-nirvana.html,
2,About A Girl,1991,78.283045,/about-a-girl-lyrics-nirvana.html,I need an easy friend\nI do with an ear to len...
3,Aero Zeppelin,2006,6.0,/aero-zeppelin-lyrics-nirvana.html,Hey ...\nHey ...\nHey ...\nHey ...\nHey ...\nH...
4,Ain't It A Shame,2007,16.945767,/aint-it-a-shame-lyrics-nirvana.html,Ain't it a shame to go fishin on a Sunday\nAin...
5,Alcohol / High On The Hog,2006,6.0,/alcohol-high-on-the-hog-lyrics-nirvana.html,My baby taught me how to choke\nMy baby taught...
6,All Apologies,1991,78.186338,/all-apologies-lyrics-nirvana.html,All in all is all we are\nAll in all is all we...
7,Aneurism,2006,6.0,/aneurism-lyrics-nirvana.html,She keeps it pumpin' straight to my heart\nShe...
8,Aneurysm,2009,50.255169,/aneurysm-lyrics-nirvana.html,She keeps it pumpin' straight to my heart\nShe...
9,Aneurysm [mark Goodier Session],2006,6.0,/aneurysm-mark-goodier-session-lyrics-nirvana....,She keeps it pumpin' straight to my heart\nShe...


In [17]:
all_df.lyrics = all_df.lyrics.str.split('\n')
all_df.head()

Unnamed: 0,title,year,popularity,slug,lyrics
0,(new Wave) Polly [mark Goodier Session],2006,6.0,/new-wave-polly-mark-goodier-session-lyrics-ni...,"[Polly says her back hurts, And she's just as ..."
1,02,2017,6.0,/02-lyrics-nirvana.html,
2,About A Girl,1991,78.283045,/about-a-girl-lyrics-nirvana.html,"[I need an easy friend, I do with an ear to le..."
3,Aero Zeppelin,2006,6.0,/aero-zeppelin-lyrics-nirvana.html,"[Hey ..., Hey ..., Hey ..., Hey ..., Hey ..., ..."
4,Ain't It A Shame,2007,16.945767,/aint-it-a-shame-lyrics-nirvana.html,"[Ain't it a shame to go fishin on a Sunday, Ai..."


In [18]:
all_df.to_csv('metrolyrics.csv', index=False)