In [19]:
import requests
import re
import time
import os
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np

import nltk
from nltk.corpus import stopwords

In [2]:
def grab_songs(url):
    """grab_songs(url) 
    Takes the url of an artist on https://www.lyrics.com, 
    grabs the urls of the song pages 
    and returns the urls and song names as dataframes
    """
    response = requests.get(url)
    assert response.status_code == 200 
    if url[-1].isnumeric():
        artist = url.rsplit('/')[-2]
        artist = artist.replace('-',' ')
    else:
        artist = url.rsplit('/')[-1]
        artist = artist.replace('-',' ')
    artist_page = response.text
    url_pattern = r'href="(/lyric.+?)">'
    url_endings = re.findall(url_pattern, artist_page)
    url_beginning = 'https://www.lyrics.com'
    song_url = [url_beginning + ending for ending in url_endings]
    artists = [artist for items in url_endings]
    
    from urllib.parse import unquote
    song_names = [unquote(x.rsplit('/')[-1], 'utf-8').replace('+', ' ') for x in url_endings]
    df = pd.DataFrame(list(zip(song_url, song_names, artists)),
               columns =['song_url', 'song_names', 'artist'])
    df.drop_duplicates(subset = 'song_names', inplace=True)
    return df

In [3]:
def save_songfiles(dataframe):
    '''
    Takes a dataframe containing song_url, song_name and artist
    safes a file of the web page for each song named by the artist and the song name in the folder 'songfiles'
    '''
    # create songfiles folder if needed
    directory_path = os.getcwd()
    folderpath = directory_path + "/songfiles"
    if os.path.isdir(folderpath) == False:
        os.mkdir(folderpath)
        print('songfolder created')
    else:
        print('songfolder already exists')
    
    #loop through rows of dataframe and safe web page of songs to folder
    for index, row in dataframe.iterrows():
        # extract row values
        url = row['song_url']
        song_name = row['song_names']
        artist = row['artist']
        
        # check if web page response is positiv and extract web page content
        response = requests.get(url)
        if response.status_code == 200:
            song_page = response.text
            
            # define filepath
            filename = artist + '_' + song_name + '.html'
            filepath = folderpath + "/" + filename
            
            # create file with web page content
            open(filepath, 'w', encoding = 'utf8').write(song_page)
            print(filename,' saved')
            time.sleep(10)
    print('done')


In [4]:
def extract_songtext(dataframe):
    '''
    Takes a dataframe containing song_url, song_name and artist
    extends the dataframe by the songtext extracted from the web page saved in the folder 'songfiles'
    '''
    # check if songfiles folder exists
    directory_path = os.getcwd()
    folderpath = directory_path + "/songfiles"
    try:
        os.path.exists(folderpath)
    except OSError as error: 
        print(error)
    
    # then extract songtext
    else:
        print('songfolder exists')
        
        # expand dataframe by new column
        dataframe['songtext'] = ""
        
        # loop through rows of dataframe and extract songtest from saved files
        for index, row in dataframe.iterrows():
            # extract row values
            url = row['song_url']
            song_name = row['song_names']
            artist = row['artist']

            # read web page content from file
            filename = artist + '_' + song_name + '.html'
            filepath = folderpath + "/" + filename
            try:
                songpage = open(filepath, 'r', encoding = 'utf8').read()

                # extract song text from web page content
                lyric_body_pattern = r'<pre id="lyric-body-text".+<\/pre>'
                if len(re.findall(lyric_body_pattern, songpage, re.DOTALL)) != 0:
                    # extract lyric-body-text from web page
                    lyric_body = re.findall(lyric_body_pattern, songpage, re.DOTALL)[0]
                    # get rid of hyperlinks in lyric-body-text
                    hyperlink_pattern = r'<.+?>'
                    lyrics = re.sub(hyperlink_pattern, '', lyric_body)
                else: 
                    lyrics = ""

                # write song text in dataframe
                dataframe.at[index,'songtext'] = lyrics
                print(filename,' song text extracted')
            except IOError:
                print(filename," doesn't exist")
                dataframe.at[index,'songtext'] = np.NaN
        dataframe = dataframe.dropna(axis = 0, inplace = False)
        print('done')
        # return dataframe
        return dataframe

In [5]:
def download_songfiles(urls):
    for url in urls:
        save_songfiles(grab_songs(url))


In [6]:
def grab_songtext(urls):
    '''
    Takes multiple urls and grabs the songtext from saved files (download_songfiles() needs to run before)
    Returns a dataframe
    '''
    directory_path = os.getcwd()
    folderpath = directory_path + "/songfiles"
    filepath = folderpath + "/lyrics.csv"
    df_lyrics = pd.DataFrame(columns = ['song_url', 'song_names', 'artist', 'songtext'])
    for url in urls:
        df_temp = extract_songtext(grab_songs(url).iloc[:100,:])
        df_lyrics = pd.concat([df_lyrics,df_temp])
    df_lyrics.reset_index(drop=True, inplace=True)
    df_lyrics.to_csv(filepath)
    return df_lyrics

In [7]:
def load_lyrics():
    directory_path = os.getcwd()
    folderpath = directory_path + "/songfiles"
    filepath = folderpath + "/lyrics.csv"
    df = pd.read_csv(filepath)
    return df

# run functions with test size
#url2 = 'https://www.lyrics.com/artist/Pashanim/2137847388'
#url1 = 'https://www.lyrics.com/artist/Yin-Kalle/2137849384'
#urls = [url1, url2]

#grab_songtext(urls)
#download_songfiles(urls)

In [8]:
# Run Functions
url2 = 'https://www.lyrics.com/artist/Pashanim/2137847388'
url1 = 'https://www.lyrics.com/artist/Yin-Kalle/2137849384'
urls = [url1, url2]

grab_songtext(urls)
download_songfiles(urls)
grab_songtext(urls)


songfolder exists
Yin Kalle_Doppel K.html  song text extracted
Yin Kalle_44BABY.html  song text extracted
Yin Kalle_Abischnitt.html  song text extracted
Yin Kalle_LESH.html  song text extracted
Yin Kalle_Müde.html  song text extracted
done
songfolder exists
Pashanim_Airwaves.html  song text extracted
Pashanim_Hauseingang.html  song text extracted
Pashanim_HENTAI.html  song text extracted
Pashanim_sportback.html  song text extracted
Pashanim_istanbul freestyle.html  song text extracted
Pashanim_Sommergewitter.html  song text extracted
Pashanim_Homicides.html  song text extracted
Pashanim_junge ceos.html  song text extracted
Pashanim_Shababs botten.html  song text extracted
Pashanim_paris freestyle - skrilla remix.html  song text extracted
done
songfolder already exists
Yin Kalle_Doppel K.html  saved
Yin Kalle_44BABY.html  saved
Yin Kalle_Abischnitt.html  saved
Yin Kalle_LESH.html  saved
Yin Kalle_Müde.html  saved
done
songfolder already exists
Pashanim_Airwaves.html  saved
Pashanim_Haus

KeyboardInterrupt: 

In [9]:
load_lyrics()

Unnamed: 0.1,Unnamed: 0,song_url,song_names,artist,songtext
0,0,https://www.lyrics.com/lyric/37464360/Yin+Kall...,Doppel K,Yin Kalle,"(Jaynbeats)\n\nKasi, Kalle, Doppel-K, kill, ki..."
1,1,https://www.lyrics.com/lyric/38188151/Yin+Kall...,44BABY,Yin Kalle,"Ah-ah-ah\nWoah, ah-ah\nIch ficke mein' Kopf we..."
2,2,https://www.lyrics.com/lyric/38401250/Yin+Kall...,Abischnitt,Yin Kalle,"(Sizzy)\n(Macloud, Miksu, pass out)\nAh\n\nRin..."
3,3,https://www.lyrics.com/lyric-lf/2766757/Yin+Ka...,LESH,Yin Kalle,"(Y-Y-You know who's on da beats)\n(Ayo, George..."
4,4,https://www.lyrics.com/lyric-lf/3334381/Yin+Ka...,Müde,Yin Kalle,"Ja, ich bin müde, brr\nJa, ich bin müde (for r..."
5,5,https://www.lyrics.com/lyric/37136158/Pashanim...,Airwaves,Pashanim,"(Stickle)\nEy, ey, ey\n\nHinterhof im Schatten..."
6,6,https://www.lyrics.com/lyric/37464362/Pashanim...,Hauseingang,Pashanim,"Ey, Pasha, Stickle\nEy, ey, ey\n\nEy, yo, ich ..."
7,7,https://www.lyrics.com/lyric-lf/2270056/Pashan...,HENTAI,Pashanim,"Brra\n\nBEK, BEK (ey, ja), Digga\nKasimir und ..."
8,8,https://www.lyrics.com/lyric-lf/2472400/Pashan...,sportback,Pashanim,"Ja, ahh, yeah-ah, ja\n(Yeah, Kevin made this b..."
9,9,https://www.lyrics.com/lyric-lf/2463116/Pashan...,istanbul freestyle,Pashanim,"Stickle\n\nIch trink' Sodas aus Türkei, ich bi..."


In [14]:
df = pd.read_csv("lyrics.csv")

In [16]:
df.head()

Unnamed: 0.1,Unnamed: 0,song_url,song_names,artist,songtext
0,0,https://www.lyrics.com/lyric/37464360/Yin+Kall...,Doppel K,Yin Kalle,"(Jaynbeats)\n\nKasi, Kalle, Doppel-K, kill, ki..."
1,1,https://www.lyrics.com/lyric/38188151/Yin+Kall...,44BABY,Yin Kalle,"Ah-ah-ah\nWoah, ah-ah\nIch ficke mein' Kopf we..."
2,2,https://www.lyrics.com/lyric/38401250/Yin+Kall...,Abischnitt,Yin Kalle,"(Sizzy)\n(Macloud, Miksu, pass out)\nAh\n\nRin..."
3,3,https://www.lyrics.com/lyric-lf/2766757/Yin+Ka...,LESH,Yin Kalle,"(Y-Y-You know who's on da beats)\n(Ayo, George..."
4,4,https://www.lyrics.com/lyric-lf/3334381/Yin+Ka...,Müde,Yin Kalle,"Ja, ich bin müde, brr\nJa, ich bin müde (for r..."


In [None]:
vectorizer = TfidfVectorizer(stop_words = stop_words, min_df = 0.1)
tfidf = vectorizer.fit_transfortm(songs['lyrics'])