In [10]:
import os
import re
import pandas as pd

import requests
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB

## 1. Download all song lyrics of the artists

In [11]:
def create_artist_file(filename, url):    
    """
    get the page of the artist and save the HTML content to a file
    """
    req = requests.get(url)
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(req.text)

In [12]:
def get_songs_list(filename):
    """
    opens the file of an artist and extracts the 1st 400 song links and adds them to a list
    returns that list
    """
    # open the artist file
    with open(filename, 'r', encoding='utf-8') as file:
        text = file.read()
    # extract all song links from the file. 
    pattern = 'href="/lyric/([0-9]+.+?)"'
    links_dirty = re.findall(pattern , text)
    # add the correct start of each link to the first 400 links and return them in a list
    return ['https://www.lyrics.com/lyric/' + text for text in links_dirty[:400]]

In [13]:
def create_directory(root_dir, artist):
    """
    creates a directory with the artist name in the root directory, if the directory doesn't yet exists
    """
    directory = f"{root_dir}{artist}/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    return directory

In [14]:
def get_song_text(links, directory, artist):
    """
    opens the links from the argument list and browses the page for title and lyrics.
    saves each song in a file with a cleaned title with the lyrics as content
    jumps over songs for which a file has already been created (criteria: title of the song)
    """
    songs_included = set()
    counter = 0
    for url in links:
        # get the content of the URL
        req_song_html = requests.get(url=url).text 
        song_soup = BeautifulSoup(markup=req_song_html, features='html.parser')
        # get the title and lyrics
        title = song_soup.find(name='h1', attrs={'class':'lyric-title'})
        lyrics = song_soup.find(name='pre', attrs={'class':'lyric-body'})
        # if title or lyrics are None just continue with next url.
        # else save file with lyrics into the artist's folder
        if title is not None and lyrics is not None:
            title = title.text.replace('/', '_')
            title = title.lower()
            lyrics = lyrics.text
            # check if song has already been downloaded. if not create a file
            if f"{artist}{title}" not in songs_included:
                # add song to the song set (for duplicate checking)
                songs_included.add(f"{artist}{title}")        
                # write lyrics to file
                filename = f"{artist}_{counter}_{title}.txt"
                with open(os.path.join(directory, filename), 'w') as file:
                    file.write(lyrics)
                counter += 1

In [15]:
def get_lyrics_list(root_dir, artist_list):
    """
    returns 2D list: 1. Column: lyrics 2. Column: artist
    iterates over the files in each artist folder to extract the lyrics
    """
    master_list = []
    for artist in artist_list:
        for file_name in os.listdir(f"{root_dir}{artist}"):
            text = open(f"{root_dir}{artist}/{file_name}").read()
            # replacing \n in the text with whitespace
            text = text.replace('\n', ' ')
            text = text.lower()
            master_list.append([text, artist])
    return master_list


In [16]:
def create_dataframe(data):
    """
    creates a Dataframe out of 2D-list with 1. column: lyrics, 2nd column: artist and returns it
    """
    return pd.DataFrame(data, columns=['lyrics_X', 'artist_y'])
    

In [20]:
def create_tfidf_vectorizer(df_text):
    '''returns the matrix with tfidf values and the vectorizer itself'''
    tv = TfidfVectorizer(stop_words='english', ngram_range=(1,1))
    vectorized_tfidf = tv.fit_transform(df_text['lyrics_X'])
    return vectorized_tfidf, tv

## 2. Create all song files

In [18]:
# get all lyrics in seperate files
# main url list for the 7 artists
URL = [ 'https://www.lyrics.com/artist/Johnny-Cash/1548',
        'https://www.lyrics.com/artist/Madonna/64565',
        'https://www.lyrics.com/artist/Eminem/347307',
        'https://www.lyrics.com/artist/Amy-Winehouse/612371',
        'https://www.lyrics.com/artist/The-Kooks/762797',
        'https://www.lyrics.com/artist/Frank-Sinatra/3150',
         'https://www.lyrics.com/artist/Bob-Marley/2907'] * True

# directory where to save the files
root_dir = '../data/songs3/'
for url in URL:
    # extract the artist name from the url-string
    pattern = 'st\/(.*)\/'
    artist = str(re.findall(pattern, url)[0]).replace('-','_')
    # create a file per artist with the content of the url
    filename = f"{artist}.txt"
    create_artist_file(filename, url)
    # create a list with the first 400 links from the artist file
    link_list = get_songs_list(filename)
    # create files containing the lyrics for each song of the artist
    directory = create_directory(root_dir, artist)
    get_song_text(link_list, directory, artist)


## 3. Build models to predict the artist
(NOT NEEDED ANYMORE SEE FILE 4_5_lyrics_pipeline.ipnyb)

In [21]:
# get all directory-names (artist-names)
artist_list = os.listdir(root_dir)
# get a 2D list. axis 1: list with Lyrics & artist for each song. axis 0: all songs of all artists
artist_lyrics_list = get_lyrics_list(root_dir, artist_list)
df_text = create_dataframe(artist_lyrics_list)
# instantiate and apply tfidf-vectorizer on the lyrics data
vectorized_tfidf, tv = create_tfidf_vectorizer(df_text)
# create the target labels
labels = df_text['artist_y']
df_text.head(5)

Unnamed: 0,lyrics_X,artist_y
0,"(dre) oh, so i'm out the game, huh (eminem) yo...",Eminem
1,i can feel the heat rising everything is on fi...,Eminem
2,"i roll over and go to reach for you, you're go...",Eminem
3,"you sit there stone-faced, as if i'm not here ...",Eminem
4,"'cause sometimes you just feel tired, you feel...",Eminem
...,...,...
1147,how i love the kisses of dolores aye-aye-aye d...,Frank_Sinatra
1148,"adeste fidelis laeti triumphantes venite, ven...",Frank_Sinatra
1149,(they've got an awful lot of coffe down in bra...,Frank_Sinatra
1150,everybody's going out and having fun i'm just...,Frank_Sinatra


In [22]:
# Train the random forest classifier
mod_RFC = RandomForestClassifier(max_depth=20)
mod_RFC.fit(vectorized_tfidf, labels)
print(mod_RFC.score(vectorized_tfidf, labels))

0.9331597222222222


In [23]:
# Train the Naive Bayes Classifier
mod_NB = MultinomialNB()
mod_NB.fit(vectorized_tfidf, labels)
print(mod_NB.score(vectorized_tfidf, labels))

0.8645833333333334


## 4. Test the models with lyrics

In [24]:
# create Testdata
corpus_test = [['oh, and your sweet and pretty face in such an ugly way something so beautiful', 'x'], 
                ['this is my invitation i got the special vacation i need your concentration just to feel your vibration At that soul shakedown party tonight we gonna have a soul shakedown party tonight!', 'x'],
                ['you know that we are living in a material world and i am a material girl', 'x'],
                ['I fell into a burning ring of fire I went down down down the flames they went higher and it burns burns burns', 'x']]
df_test = create_dataframe(corpus_test)
# transform the lyrics into a tfidf-vector
vec_test = tv.transform(df_test['lyrics_X'])
# print the prediction result
print(f"RFC - classes: {mod_RFC.classes_}")
print(f"RFC - predictions: {mod_RFC.predict(vec_test)}")
print(f"RFC: {mod_RFC.predict_proba(vec_test)}")
print(f"NB - classes: {mod_NB.classes_}")
print(f"NB - predictions: {mod_NB.predict(vec_test)}")
print(f"NB: {mod_NB.predict_proba(vec_test)}")

RFC - classes: ['Amy_Winehouse' 'Bob_Marley' 'Eminem' 'Frank_Sinatra' 'Johnny_Cash'
 'Madonna' 'The_Kooks']
RFC - predictions: ['Frank_Sinatra' 'Frank_Sinatra' 'Frank_Sinatra' 'Frank_Sinatra']
RFC: [[0.07364347 0.12305967 0.01462897 0.28630232 0.18965349 0.12524678
  0.1874653 ]
 [0.07801903 0.16659704 0.03173424 0.22267863 0.19489304 0.15567087
  0.15040716]
 [0.08048353 0.10014176 0.0259184  0.27945081 0.19828176 0.14772064
  0.16800309]
 [0.08051944 0.11097858 0.01757613 0.3260986  0.22780244 0.13220462
  0.10482018]]
NB - classes: ['Amy_Winehouse' 'Bob_Marley' 'Eminem' 'Frank_Sinatra' 'Johnny_Cash'
 'Madonna' 'The_Kooks']
NB - predictions: ['The_Kooks' 'Madonna' 'Madonna' 'Johnny_Cash']
NB: [[0.08637549 0.05641348 0.09540077 0.18140543 0.1533026  0.14658999
  0.28051223]
 [0.10755523 0.19503093 0.12555465 0.14680958 0.12956719 0.21485101
  0.08063142]
 [0.06871083 0.06896274 0.09881563 0.1189619  0.11377432 0.42974993
  0.10102465]
 [0.09375014 0.07040193 0.1440173  0.16749266 0.29