In [95]:
import sys
!{sys.executable} -m pip install beautifulsoup4
!{sys.executable} -m pip install lyricsgenius
import requests
import re
import lyricsgenius
import csv
from bs4 import BeautifulSoup

def get_billboard_song_titles_for_year(year):
    """
    The Year-end Wiki scrapper
    https://github.com/awcooper/spotify_billboard_playlist_generator/blob/master/playlist_generators/hot_100_scraper_v1.py
    :param year: the year to scrap for
    :return: List of billboard songs and artists in a tuple '(song, artist)'
    """
    billboard_page = "https://www.wikizero.com/en/Billboard_Year-End_Hot_100_singles_of_"
    page = requests.get(billboard_page + str(year))
    soup = BeautifulSoup(page.content, 'html.parser')
    doc = soup.find("table", {"class": "wikitable"})
    year_data = []
    for row in doc.find_all(["tr"])[1:]:
        # The th is required because ~2000+ uses that format instead
        row_data = [cell.text.strip() for cell in row.findAll(["td", "th"])]
        if len(row_data) != 3:
            print("Error Processing Row: ", row)
        else:
            year_data.append(tuple(row_data))
    return year_data

def get_main_artist(content):
    """
    Extracts the main artist for a song
    :param content: artist + feats and others
    :return: main artist
    """
    for split_token in [" & ", " \\ ", " feat ", " featuring ", " and "]:
        content = content.partition(split_token)[0]
    return content

def clean_lyrics(lyrics):
    """
    Takes lyrics and removes annotations and empty lines
    :param lyrics: original lyrics
    :return: cleaned lyrics
    """
    #Remove annotations
    lyrics =  re.sub(r'[\(\[].*?[\)\]]', '',lyrics)
    lyrics = re.sub(r"[^a-zA-Z0-9]+", ' ', lyrics)
    #Remove empty lines and commas
    lyrics = [line.replace(',','') for line in lyrics.split('\n') if line.strip() != '']
    lyrics = "\n".join(lyrics)
    
    return lyrics

def write_lyrics(year):
    """
    Writes a year's lyrics into a .csv file
    :param year: specified year
    """
    token = "MY_bmSPzzEQDrlRLp9xS77g-vQqtCtimk8Yw4toCkyduF7eUbrPI-IPxXATgAESi"
    genius = lyricsgenius.Genius(token)
    
    with open('lyrics/'+year+'.csv', 'w', newline="",encoding="utf-8") as csvfile:
        filewriter = csv.writer(
            csvfile,
            delimiter=',',
            quotechar='|',
            quoting=csv.QUOTE_MINIMAL
        )
        
        filewriter.writerow(['Rank', 'Song', 'Artist', 'Year','Lyrics'])
        year_songs = get_billboard_song_titles_for_year(year)
        for song in year_songs:
            song_rank = song[0]
            song_name = song[1]
            song_artist = get_main_artist(song[2])
            song_lyrics = ""
            try:
                fetched_lyrics = genius.search_song(song_name, song_artist)
                if fetched_lyrics is not None and fetched_lyrics.lyrics !="instrumental":
                    song_lyrics = clean_lyrics(fetched_lyrics.lyrics)
            except:
                fetched_lyrics = ""
            try:
                song_lyrics = song_lyrics.encode('utf-8')
            except:
                song_lyrics = "".encode('utf-8')
            try:
                filewriter.writerow([song_rank, song_name.replace(',',''), song_artist.replace(',',''), year,song_lyrics])
            except:
                print(song_lyrics)
            
def combine_files(first_year,last_year):
    """
    Combines years' .csv files
    :param first_year: first year
    :param last_year: last year
    """
    with open('lyrics/bh100.csv', 'w', newline="",encoding="utf-8") as csvfile:
        filewriter = csv.writer(
            csvfile,
            delimiter=',',
            quotechar='|',
            quoting=csv.QUOTE_MINIMAL
        )
        for number in range(first_year,last_year+1):
            with open("lyrics/"+str(number)+".csv", 'r') as bh100:
                bh_reader = csv.reader(bh100, delimiter=',')
                #sktpping the header
                if number != first_year:
                    next(bh_reader)
                try:
                    for line in bh_reader:
                        filewriter.writerow([line[0], line[1], line[2], line[3],line[4]])
                except:
                    print(line)
    

Collecting beautifulsoup4==4.6.0 (from lyricsgenius)
  Using cached https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl


No metadata found in c:\users\sdvii\miniconda3\lib\site-packages
Could not install packages due to an EnvironmentError: [Errno 2] No such file or directory: 'c:\\users\\sdvii\\miniconda3\\lib\\site-packages\\beautifulsoup4-4.7.1.dist-info\\METADATA'

Could not install packages due to an EnvironmentError: [Errno 2] No such file or directory: 'c:\\users\\sdvii\\miniconda3\\lib\\site-packages\\beautifulsoup4-4.7.1.dist-info\\METADATA'



In [69]:
def main():
    """
    Main Method
    """
    # The splitting via years was made in case of request timeouts
    first_year = 2006
    last_year = 2015
    for year in range(first_year,last_year+1):
        write_lyrics(str(year))
    
main()


Searching for ""Bad Day"" by Daniel Powter...
Done.
Searching for ""Temperature"" by Sean Paul...
Done.
Searching for ""Promiscuous"" by Nelly Furtado...
Done.
Searching for ""You're Beautiful"" by James Blunt...
Done.
Searching for ""Hips Don't Lie"" by Shakira...
Done.
Searching for ""Unwritten"" by Natasha Bedingfield...
Done.
Searching for ""Crazy"" by Gnarls Barkley...
Done.
Searching for ""Ridin'"" by Chamillionaire...
Done.
Searching for ""SexyBack"" by Justin Timberlake...
Done.
Searching for ""Check on It"" by Beyoncé...
Done.
Searching for ""Be Without You"" by Mary J. Blige...
Done.
Searching for ""Grillz"" by Nelly, Paul Wall...
Done.
Searching for ""Over My Head (Cable Car)"" by The Fray...
Done.
Searching for ""Me & U"" by Cassie...
Done.
Searching for ""Buttons"" by Pussycat Dolls...
Done.
Searching for ""Run It!"" by Chris Brown...
Done.
Searching for ""So Sick"" by Ne-Yo...
Done.
Searching for ""It's Goin' Down"" by Yung Joc...
Done.
Searching for ""SOS"" by Rihanna...

In [96]:
first_year = 1965
last_year = 2018
combine_files(first_year,last_year)


['28', "Baby Don't Forget My Number", 'Milli Vanilli', '1989', "b' Babe don t be shy When you re holding my hand Cause this time goes back You got to understand It s you Ba ba ba baby In your eyes I see it so clearly That our love it s so strong And you never go wrong I got the best for you So I m waiting down If you need someone Baby call my line Call me anytime I ll be there for you You you I ve been searching high High high I ve been searching low Ba ba ba ba baby Don t forget my number Baby don t be stronger than a thunder Ba ba ba ba baby Don t forget my number Love will see you through I ve been searching high I ve been searching low I want to spend spend my life with you Ba ba ba ba ba ba ba ba My desper youth Ba ba ba ba ba ba ba ba Love will see you through Ba ba baby In your eyes I see it so clearly '"]
