In [8]:
import requests
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd
import time
import spacy
import string
import re

nlp = spacy.load('en_core_web_sm')

from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

In [9]:
# Fetch the genre index page. The timeout keeps a stalled connection from hanging the
# notebook, and raise_for_status() reports an HTTP failure here rather than as a
# confusing IndexError/KeyError from the positional slices below.
song_lyrics_page = requests.get('http://www.songlyrics.com/musicgenres.php', timeout=30)
song_lyrics_page.raise_for_status()
html = song_lyrics_page.text
# NOTE(review): no parser is named, so BeautifulSoup uses whichever one is installed;
# the hard-coded anchor positions below depend on that choice and on the page layout.
sp = bs(html)

# hrefs of the 14 anchors at positions 210-223 (same length as genre_names).
new_genres_and_urls = [val['href'] for val in sp.find_all('a')[210:224]]
# hrefs of every third anchor from position 191 through 206 (six links), then
# reordered by hand.
ng = [val['href'] for val in sp.find_all('a')[191:207:3]]
ng = [ng[5],ng[4],ng[1],ng[2],ng[3],ng[0]]
# Display names, positionally matched to the URLs assembled in the next cell.
genre_names = ["All Time", "Tens", "Aughts","Nineties","Eighties","Seventies","Sixties","Fifties","Rock","R&B","Country","Hip Hop/Rap","Pop","Christian"]

In [10]:
def point_mixer(ng, new_genres_and_urls):
    """Merge two link lists into one, position by position.

    Walks ``new_genres_and_urls`` in order. An entry whose first character is
    ``'h'`` is kept as is; any other entry is replaced by the next not-yet-used
    element of ``ng`` (taken from the front).

    Parameters
    ----------
    ng : sequence
        Replacement values, consumed in order.
    new_genres_and_urls : iterable
        Entries to keep or replace; each must be indexable at position 0.

    Returns
    -------
    list
        One element per entry of ``new_genres_and_urls``.

    Raises
    ------
    IndexError
        If an entry is empty, or if ``ng`` has fewer elements than there are
        entries needing replacement.
    """
    merged = []
    next_replacement = 0  # index into ng of the next unused replacement
    for link in new_genres_and_urls:
        if link[0] != 'h':
            merged.append(ng[next_replacement])
            next_replacement += 1
        else:
            merged.append(link)
    return merged

# Merge the two scraped link lists into a single list of URLs.
mixed_points = point_mixer(ng,new_genres_and_urls)

def genre_tuples_maker(genre_names, genre_urls):
    """Pair every genre name with the URL at the same position.

    Parameters
    ----------
    genre_names : sequence
        Genre display names.
    genre_urls : sequence
        URLs, indexed in parallel with ``genre_names``; extra trailing URLs
        are ignored.

    Returns
    -------
    list of tuple
        ``(name, url)`` pairs in ``genre_names`` order.

    Raises
    ------
    IndexError
        If ``genre_urls`` is shorter than ``genre_names``.
    """
    return [(name, genre_urls[position]) for position, name in enumerate(genre_names)]

# Pair each genre name with the URL at the same position.
genre_tuple_list = genre_tuples_maker(genre_names, mixed_points)

In [11]:
# Entries whose URL already starts with 'h' are kept unchanged; any other URL is
# treated as a relative path and gets the site's top-genres prefix.
cleaned_list = [
    pair if pair[1][0] == 'h' else (pair[0], 'http://songlyrics.com/news/top-genres' + pair[1])
    for pair in genre_tuple_list
]
cleaned_list

[('All Time', 'http://www.songlyrics.com/news/top-songs/all-time/'),
 ('Tens', 'http://www.songlyrics.com/news/top-songs/2011/'),
 ('Aughts', 'http://www.songlyrics.com/news/top-songs/2000/'),
 ('Nineties', 'http://www.songlyrics.com/news/top-songs/1990/'),
 ('Eighties', 'http://www.songlyrics.com/news/top-songs/1980/'),
 ('Seventies', 'http://www.songlyrics.com/news/top-songs/1970/'),
 ('Sixties', 'http://www.songlyrics.com/news/top-songs/1960/'),
 ('Fifties', 'http://www.songlyrics.com/news/top-songs/1950/'),
 ('Rock', 'http://www.songlyrics.com/news/top-genres/rock/'),
 ('R&B', 'http://www.songlyrics.com/news/top-genres/rhythm-blues/'),
 ('Country', 'http://www.songlyrics.com/news/top-genres/country-music/'),
 ('Hip Hop/Rap', 'http://www.songlyrics.com/news/top-genres/hip-hop-rap/'),
 ('Pop', 'http://www.songlyrics.com/news/top-genres/pop/'),
 ('Christian', 'http://www.songlyrics.com/news/top-genres/christian/')]

In [13]:
def song_name_scraper(genre_tuple_list):
    '''Collect the song-page links listed on each genre's chart page.

    Takes a list of (genre name, chart URL) pairs from songlyrics.com and
    fetches each chart page in turn, pausing 3 seconds after every request.

    Returns a dict mapping genre name -> list of href values taken from every
    second anchor in a fixed 200-anchor window of the page. An entry is None
    when the anchor has no href attribute.
    '''
    song_page_dict = {}
    for position, (genre, url) in enumerate(genre_tuple_list):
        response = requests.get(url)
        time.sleep(3)
        anchors = bs(response.text).find_all('a')
        # The first eight pages start their window one anchor later than the rest.
        # NOTE(review): presumably those pages carry one extra leading link —
        # verify against the live page layout.
        first = 93 if position <= 7 else 92
        song_page_dict[genre] = [anchor.get('href') for anchor in anchors[first:first + 200:2]]
    return song_page_dict  

# Scrape from cleaned_list (the URL-normalized pairs built in the previous cell)
# rather than the raw genre_tuple_list, so an entry that was a relative link is
# requested with its full URL.
page_dict = song_name_scraper(cleaned_list)

def lyric_dictionary_creator(page_dict):
    """Download the lyrics text for every song link, grouped by genre.

    Parameters
    ----------
    page_dict : dict
        Maps genre name -> iterable of song-page URLs. Entries that are empty
        or None are skipped.

    Returns
    -------
    dict
        Maps genre name -> list of lyric strings, one per page that contained
        an element with id ``songLyricsDiv``. Pages without that element are
        skipped silently.
    """
    corpus_dict = {}
    for genre, song_urls in page_dict.items():
        lyrics = []
        corpus_dict[genre] = lyrics
        for url in song_urls:
            # Anchors without an href yield None, which would make requests.get
            # raise; treat None the same as an empty link.
            if not url:
                continue
            response = requests.get(url)
            time.sleep(.5)
            # Look the lyrics container up once and reuse the result.
            lyrics_div = bs(response.text).find(id='songLyricsDiv')
            if lyrics_div is not None:
                lyrics.append(lyrics_div.text)
    return corpus_dict

# Download the lyrics for every collected song link (one HTTP request per song).
corpus_dict = lyric_dictionary_creator(page_dict)

In [183]:
# Punctuation characters, kept as a single string.
punc = string.punctuation
# spaCy's English stop words copied into a list, plus two corpus-specific tokens.
stop_words = list(spacy.lang.en.stop_words.STOP_WORDS)
stop_words.extend(['lyrics', '...'])
# English language object used by spacy_tokenizer to split text into tokens.
parser = English()

def spacy_tokenizer(lyrics):
    """Turn a lyric string into a list of cleaned tokens.

    Each token is replaced by its lower-cased, stripped lemma, except tokens
    whose lemma is "-PRON-", which use their lower-cased surface text instead.
    Tokens found in ``stop_words`` or in ``punc`` are dropped.

    Code adapted from that found at dataquest.io/blog/tutorial-text-classification-in-python-using-spacy/
    """
    kept = []
    for token in parser(lyrics):
        if token.lemma_ == "-PRON-":
            text = token.lower_
        else:
            text = token.lemma_.lower().strip()
        # punc is a plain string, so `in punc` is a substring test; it also
        # discards tokens that became empty after strip().
        if text in stop_words or text in punc:
            continue
        kept.append(text)
    return kept

In [210]:
# Module-level vectorizers shared by the *_dtm_maker helpers; both tokenize with
# spacy_tokenizer.
cv = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))
# NOTE(review): min_df/max_df are integers here, which scikit-learn interprets as
# absolute document counts rather than proportions — confirm 3 and 30 suit the
# size of each genre's corpus.
tfidf = TfidfVectorizer(tokenizer = spacy_tokenizer, min_df = 3, max_df = 30)

def cv_dtm_maker(corpus_dict):
    """Build a raw-count document-term frame for every genre.

    The module-level ``cv`` vectorizer is refit on each genre's documents, so
    every genre gets its own vocabulary and ``cv`` is left fitted on the last
    genre processed.

    Parameters
    ----------
    corpus_dict : dict
        Maps genre name -> list of lyric strings. Not modified.

    Returns
    -------
    dict
        Maps genre name -> DataFrame with one row per document and one column
        per vocabulary term.
    """
    cv_dtm_dict = corpus_dict.copy()
    for genre, documents in corpus_dict.items():
        counts = cv.fit_transform(documents)
        cv_dtm_dict[genre] = pd.DataFrame(counts.toarray(), columns=cv.get_feature_names())
    return cv_dtm_dict

def tfidf_dtm_maker(corpus_dict):
    """Build a TF-IDF document-term frame for every genre.

    The module-level ``tfidf`` vectorizer is refit on each genre's documents,
    so every genre gets its own vocabulary and ``tfidf`` is left fitted on the
    last genre processed.

    Parameters
    ----------
    corpus_dict : dict
        Maps genre name -> list of lyric strings. Not modified.

    Returns
    -------
    dict
        Maps genre name -> DataFrame with one row per document and one column
        per vocabulary term.
    """
    tfidf_dtm_dict = corpus_dict.copy()
    for genre, documents in corpus_dict.items():
        weights = tfidf.fit_transform(documents)
        tfidf_dtm_dict[genre] = pd.DataFrame(weights.toarray(), columns=tfidf.get_feature_names())
    return tfidf_dtm_dict

def spacy_dict_maker(corpus_dict):
    """Run the module-level ``nlp`` pipeline over every lyric, grouped by genre.

    Parameters
    ----------
    corpus_dict : dict
        Maps genre name -> iterable of lyric strings.

    Returns
    -------
    dict
        Maps genre name -> list of the objects returned by ``nlp``, in the
        same order as the input lyrics.
    """
    return {
        genre: [nlp(lyric) for lyric in lyrics]
        for genre, lyrics in corpus_dict.items()
    }

In [211]:
# Shared two-component NMF estimator used by nmf_generator; random_state is
# fixed so repeated runs give the same factorization.
nmf = NMF(n_components = 2, random_state = 321)

def nmf_generator(dtm_dict):
    """Fit the module-level ``nmf`` estimator to each genre's document-term frame.

    Parameters
    ----------
    dtm_dict : dict
        Maps genre name -> document-term DataFrame (as produced by
        ``tfidf_dtm_maker`` or ``cv_dtm_maker``) whose columns are the
        vocabulary terms.

    Returns
    -------
    nmf_dict : dict
        Maps genre name -> an independent fitted copy of the estimator.
    stat_dict : dict
        Maps genre name -> DataFrame of ``components_`` with one row per
        topic and one column per term.
    """
    from copy import deepcopy

    nmf_dict = {}
    stat_dict = {}
    for genre, dtm in dtm_dict.items():
        # Fit the shared estimator, so the global `nmf` still ends up fitted
        # on the last genre, exactly as before.
        nmf.fit(dtm)
        # fit() returns the estimator itself; storing it directly would make
        # every genre point at the same object, which holds only the last
        # genre's model. Snapshot it instead.
        nmf_dict[genre] = deepcopy(nmf)
        # The frame already carries its term names, so there is no need to
        # refit the global vectorizer on the global corpus to recover them.
        stat_dict[genre] = pd.DataFrame(nmf.components_, columns=dtm.columns)
    return nmf_dict, stat_dict

In [212]:
# Build the per-genre TF-IDF document-term frames from the scraped lyrics.
test = tfidf_dtm_maker(corpus_dict)

In [213]:
# test2: genre -> NMF estimator returned by nmf_generator;
# test3: genre -> DataFrame of topic-by-term component weights.
test2, test3 = nmf_generator(test)



In [164]:
# Inspect the topic-by-term weights for a single genre.
test3['All Time']

Unnamed: 0,'em,...,afraid,ah,away,babe,bad,beat,bed,believe,....1,west,whoa,wind,wonder,word,words,world,wrong,yes,young
0,0.044864,0.066238,0.03232,0.022116,0.138116,0.037713,0.12369,0.040738,0.06181,0.082362,...,0.029923,0.056411,0.0887,0.131935,0.040253,0.058073,0.144093,0.118783,0.132922,0.065667
1,0.001864,0.0,0.0,0.015359,0.0,0.0,0.0,0.020297,0.0,0.0,...,0.0,5.7e-05,0.0,0.0,0.002354,0.0,0.00109,0.0,0.0,0.002002


In [153]:
def topic_printer(nmf_comp):
    """Print the ten largest term weights of every topic, genre by genre.

    Code adapted from that found at predictivehacks.com/topic-modeling-with-nmf-in-python

    Parameters
    ----------
    nmf_comp : dict
        Maps genre name -> DataFrame with one row per topic and one column
        per term.

    Returns
    -------
    None
        Output goes to stdout only.
    """
    for df, weights in nmf_comp.items():
        n_topics = weights.shape[0]
        for topic in range(n_topics):
            row = weights.iloc[topic]
            print(f'For topic {topic+1} in genre {df} the words with the highest values are:')
            print(row.nlargest(10))
            print('\n')

In [214]:
# Print the ten highest-weighted terms for every topic in every genre.
topic_printer(test3)

For topic 1 in genre All Time the words with the highest values are:
day           0.387072
said          0.331121
night         0.289404
walk          0.266164
let           0.256559
time          0.255443
long          0.215201
girl          0.214293
little        0.195228
california    0.175921
Name: 0, dtype: float64


For topic 2 in genre All Time the words with the highest values are:
hey       0.563697
good      0.552495
feel      0.290631
wanna     0.277471
right     0.227063
miss      0.159143
let       0.146592
wrong     0.143043
hold      0.123902
better    0.118821
Name: 1, dtype: float64


For topic 1 in genre Tens the words with the highest values are:
night    0.233167
thing    0.230585
feel     0.213853
good     0.208765
rain     0.204455
long     0.199950
tell     0.195172
body     0.193545
look     0.190338
hold     0.188297
Name: 0, dtype: float64


For topic 2 in genre Tens the words with the highest values are:
tonight    0.712927
run        0.639988
la         0.2

In [194]:
# Display the raw lyric strings scraped for one genre.
# NOTE(review): this dumps the genre's entire list of lyrics into the output.
corpus_dict['Aughts']

["I can feel the magic floating in the air\nBeing with you get's me that way\nI watch the sunlight dance across your face\nAnd I've never been this swept away\n\nAll my thoughts just seem to settle on the breeze\nWhen I'm lying wrapped up in your arms\nThe whole world just fades away\nThe only thing I hear\nIs the beating of your heart\n\n'Cause I can feel you breathe\nIt's washing over me\nAnd suddenly I'm melting into you\nThere's nothing left to prove\nBaby, all we need is just to be\nCaught up in the touch\nSlow and steady rush\nBaby, isn't that the way that love's suppose to be\nI can feel you breathe, just breathe\n\nIn a way I know my heart is waking up\nAs all the walls come tumbling down\nCloser than I've ever felt before\nAnd I know, and you know\nThere's no need for words right now\n\nI can feel you breathe\nWashing over me\nAnd suddenly I'm melting into you\nThere's nothing left to prove\nBaby, all we need is just to be\nCaught up in the touch\nSlow and steady rush\nBaby, i