In [1]:
import requests
from bs4 import BeautifulSoup
import re
import os, os.path
import pandas as pd
import numpy as np

import nltk
import nltk.data
from nltk.tokenize import TreebankWordTokenizer 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split


In [2]:
def get_url(artist):
    """
    Get the url link for the artist in www.lyrics.com 
    """
    return f"https://www.lyrics.com/artist/{artist}"
    
kl = get_url('Kendrick-Lamar')
sia = get_url('Sia')


In [3]:
kl

'https://www.lyrics.com/artist/Kendrick-Lamar'

In [4]:
def get_html(url):
    """
    """
    response = requests.get(url)
    return response.text

kl_html = get_html(kl)
    

In [5]:
def get_beautiful_soup(url, artist_name):
    """
    """
    file_name = f"{artist_name}_html"
    with open (file_name, "w") as f:
        f.write(get_html(url))

    with open(file_name) as f:
        return BeautifulSoup(f, 'html.parser')


In [6]:
soup_kl = get_beautiful_soup(kl, 'Kendrick_Lamar')
soup_sia = get_beautiful_soup(sia, 'Sia')


In [7]:
def song_names(soup):
    """
    Get the names of the songs from the 
    beautiful soup object of the artist
    """
    return [t.text for t in soup.find_all(class_ = 'tal qx')]

In [8]:
song_names(soup_kl)

['Hair Down',
 'All the Stars',
 'Rearview',
 'Momma I Hit a Lick',
 'Nile',
 'Humble',
 'HUMBLE.',
 'All the Stars',
 'Black Panther',
 'All the Stars',
 "King's Dead",
 'Big Shot',
 'Pray for Me',
 'X',
 'Black Panther',
 'All the Stars',
 'X',
 "King's Dead",
 'Big Shot',
 'Pray for Me',
 'The Mantra',
 'The Mantra',
 'All the Stars',
 'Pray for Me',
 'Dedication',
 'Hustla’s Story',
 "King's Dead",
 'All the Stars',
 'The Recipe',
 'Element',
 'Compton',
 'Power',
 'These Walls',
 'FEEL.',
 'Humble',
 'Pray for Me',
 'Tints',
 'Something Dirty/Pic Got Us',
 'Pray for Me',
 'Blood',
 'Pride',
 "King's Dead",
 'Wow Freestyle',
 'Love',
 'Mona Lisa',
 'DNA.',
 'Goosebumps',
 'New Freezer',
 'Love',
 'Dedication',
 "Don't Wanna Know",
 "Don't Wanna Know",
 'Humble',
 "Don't Wanna Know",
 'DNA.',
 'HUMBLE.',
 'DNA.',
 'Yeah Right',
 'Power',
 'Yeah Right',
 "Don't Wanna Know",
 'LOVE.',
 "Don't Wanna Know",
 'Doves in the Wind',
 'BLOOD.',
 'DNA.',
 'YAH.',
 'ELEMENT.',
 'FEEL.',
 'LOYA

In [9]:
def get_links(soup, artist):
    """Get the complete link for each song of the artist"""
    links = f'links_{artist}' 
    links = []
    for td in soup.find_all('td'):
        if "tal" in td.get('class',[]):
              links.append('https://www.lyrics.com'+td.find('a')['href'])
    return links

In [10]:
links_kl = get_links(soup_kl, 'kendrick_lamar')
links_kl

['https://www.lyrics.com/lyric/36524029/Kendrick+Lamar/Hair+Down',
 'https://www.lyrics.com/lyric/35948717/Kendrick+Lamar/All+the+Stars',
 'https://www.lyrics.com/lyric/36459542/Kendrick+Lamar/Rearview',
 'https://www.lyrics.com/lyric/36036989/Kendrick+Lamar/Momma+I+Hit+a+Lick',
 'https://www.lyrics.com/lyric/36400714/Kendrick+Lamar/Nile',
 'https://www.lyrics.com/lyric/35121898/Kendrick+Lamar/Humble',
 'https://www.lyrics.com/lyric/34694292/Kendrick+Lamar/HUMBLE.',
 'https://www.lyrics.com/lyric/34719776/Kendrick+Lamar/All+the+Stars',
 'https://www.lyrics.com/lyric/34793551/Kendrick+Lamar/Black+Panther',
 'https://www.lyrics.com/lyric/34793550/Kendrick+Lamar/All+the+Stars',
 'https://www.lyrics.com/lyric/34793543/Kendrick+Lamar/King%27s+Dead',
 'https://www.lyrics.com/lyric/34793539/Kendrick+Lamar/Big+Shot',
 'https://www.lyrics.com/lyric/34793538/Kendrick+Lamar/Pray+for+Me',
 'https://www.lyrics.com/lyric/34817812/Kendrick+Lamar/X',
 'https://www.lyrics.com/lyric/34818173/Kendrick+La

In [11]:
def scrape_song_lyrics(url):
    page = requests.get(url)
    html = BeautifulSoup(page.text, 'html.parser')
    lyrics = html.find(class_='lyric-body').text
    #remove identifiers like chorus, verse, etc
    lyrics = re.sub(r'[\(\[].*?[\)\]]', '', lyrics)
    #remove empty lines
    lyrics = os.linesep.join([s for s in lyrics.splitlines() if s])
    
    return lyrics

In [12]:
def get_lyrics(soup, artist):  
    """ write a file with the lyrics of the artist"""
    c = 0 
    for link in get_links(soup, artist)[:5]:
        try:
            with open (f'{artist}_{song_names(soup_kl)[c].replace("/","")}.txt', 'w') as f:
                s = scrape_song_lyrics(link)
                f.write(s)
                c += 1
                print(f"Songs grabbed:{len(s)}")
        except AttributeError:  
               print(f"some exception at {link}: {c}")

get_lyrics(soup_kl,'Kendrick_Lamar')
get_lyrics(soup_sia, 'Sia')

Songs grabbed:2084
Songs grabbed:2448
Songs grabbed:934
Songs grabbed:2425
Songs grabbed:1411
Songs grabbed:1344
Songs grabbed:1260
Songs grabbed:1574
Songs grabbed:1261
Songs grabbed:2297


In [13]:
path = os.path.expanduser('/home/xrusa/Documents/euclidean-eukalyptus/work_in_progress/week_4/nltk_data/corpora')
path in nltk.data.path


False

In [14]:
row_list = []

for subdir in ['Kendrick_Lamar', 'Sia']:
    for folder, subfolders, filenames in os.walk('/home/xrusa/Documents/euclidean-eukalyptus/work_in_progress/week_4/nltk_data/corpus/'+subdir):
        for file in filenames:
            d = {'artist':subdir}  # assign the name of the subdirectory to the label field
            with open('/home/xrusa/Documents/euclidean-eukalyptus/work_in_progress/week_4/nltk_data/corpus/'+subdir+'/'+file) as f:
                if f.read():      # handles the case of empty files, which become NaN on import
                    f.seek(0)
                    d['lyrics'] = f.read()  # assign the contents of the file to the review field
            row_list.append(d)
        break


In [15]:
df = pd.DataFrame(row_list)

In [16]:
df.dropna(inplace=True)
df.isna().sum()

artist    0
lyrics    0
dtype: int64

In [17]:
df['lyrics'] = df['lyrics'].str.replace('\n', ' ')
df['lyrics'] = df['lyrics'].str.replace('\r', ' ')
df['lyrics'] = df['lyrics'].str.replace(r'[0-9]', ' ')



  df['lyrics'] = df['lyrics'].str.replace(r'[0-9]', ' ')


In [18]:
df.head(-5)

Unnamed: 0,artist,lyrics
0,Kendrick_Lamar,Wop Ear Drummers Mike WiLL Made-It Gi-gimme so...
1,Kendrick_Lamar,Waterfront Sunshine A little weed A little red...
2,Kendrick_Lamar,Lord God I come to you a sinner And I humbly r...
3,Kendrick_Lamar,"Everybody, everybody Everybody Everybody sit y..."
4,Kendrick_Lamar,shots And we'll take that ride Across the b...
...,...,...
212,Sia,"I wanted you to know That I am ready to go, he..."
213,Sia,Watching haters wonder why Gambino got the gam...
214,Sia,Miiri man ɲi! Aa! Dennin! K'an bɛn sɔgɔma! Mii...
215,Sia,Uh Someone made a mess in my account Someone ...


In [19]:
X = df['lyrics']
y = df['artist']

In [20]:
X

0      Wop Ear Drummers Mike WiLL Made-It Gi-gimme so...
1      Waterfront Sunshine A little weed A little red...
2      Lord God I come to you a sinner And I humbly r...
3      Everybody, everybody Everybody Everybody sit y...
4         shots And we'll take that ride Across the b...
                             ...                        
217    You feel like summertime You took this heart o...
218    Woo, uh, uh Yeah, yeah come on What's good?  Y...
219    Don't go, gotta know Please don't run away I'm...
220    One time I was at the beach And we were lookin...
221    Picking out my fro with my fist comb I got a t...
Name: lyrics, Length: 221, dtype: object

In [21]:
nltk.download("wordnet") 

[nltk_data] Downloading package wordnet to /home/xrusa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/xrusa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [23]:
STOPWORDS = stopwords.words('english')

In [24]:
vectorizer = TfidfVectorizer()


In [25]:
def tokenize_corpus(CORPUS):
    """tokenize and lemmatize the corpus"""
    CORPUS = [s.lower() for s in CORPUS]
    CLEAN_CORPUS = []
    tokenizer = TreebankWordTokenizer()
    lemmatizer = WordNetLemmatizer()
    for doc in CORPUS:
        tokens = tokenizer.tokenize(text=doc)
        clean_doc = " ".join(lemmatizer.lemmatize(token) for token in tokens)
        CLEAN_CORPUS.append(clean_doc)
    return CLEAN_CORPUS

In [26]:
NEW_CORPUS = tokenize_corpus(X)

In [27]:
def dummy_fun(doc):
    return doc

steps = [('tf-idf', TfidfVectorizer(stop_words=STOPWORDS, max_df=0.8, analyzer='word', tokenizer=dummy_fun)),
         ('MNB', MultinomialNB())
        ]
          

pipeline = Pipeline(steps)

In [28]:
X_train, X_test, y_train,y_test = train_test_split(NEW_CORPUS, y, random_state=0)

In [29]:
X_test

["yeah nigga , it 's me again i 'm still around , i guess i win you hear me in your sister 's room like all the time gambino girl forever on that bracelet that she bought online to dude who could n't take it , might i say , `` hasta luega '' cause i 'm taking on your city like i 'm carmen san diego i 'm smoking , i 'm on fire i am blazin ' , do n't get hurt to my white this is a concert to my black girl this is church from that nyu dorm to the emirate everyone hated me , i 'm more hip hop than you 'll ever be i 'm bustin ' , bustin ' bustin ' , bustin ' on these nigga man and nothin ' , nothin ' nothin ' , nothin ' is the same again you lookin dumb a hell man for ever sleeepin ' on him your girl is not around , she busy sleepin ' on him will he fail ? now that 's the wrong conversation terry gross on the mic , i 'm the talk of the nation , ha hey , what it do bitch ? why these other rapper do shit stupid ? while they be sleepin ' i be onto that new shit while they be sleepin ' i be ont

In [30]:
pipeline.fit(X_train, y_train)



Pipeline(steps=[('tf-idf',
                 TfidfVectorizer(max_df=0.8,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function dummy_fun at 0x7f1472278f70>)),
                ('MNB', MultinomialNB())])

In [31]:
pipeline.score(X_test, y_test)

0.6785714285714286

In [32]:
# print training set accuracy
train_acc = round(pipeline.score(X_train, y_train) * 100, 2)
print("Prediction accuracy (Training Set):", train_acc, "%\n")

# Check and print prediction accuracy and model parameters
test_acc=round(pipeline.score(X_test, y_test) * 100,2)
print("Prediction accuracy (Test Set):", test_acc, "%\n")



Prediction accuracy (Training Set): 74.55 %

Prediction accuracy (Test Set): 67.86 %

