In [1]:
#import libraries

import requests
import re
import genius_token as gt
from bs4 import BeautifulSoup
from textblob import TextBlob 
import SongLyrics_crendentials as cred
import nltk
import numpy as np
import math

In [2]:
def artist_song_page_lookup(artist_id,token,page):
    """Fetch one page of an artist's songs from the Genius API.

    Issues a GET to ``/artists/<artist_id>/songs`` (sorted by popularity,
    for the requested ``page``) using ``token`` as a bearer token.

    Returns the ``['response']['songs']`` entry of the decoded JSON reply.
    """
    endpoint = 'https://api.genius.com' + '/artists/{}/songs'.format(artist_id)
    auth_header = {'Authorization': 'Bearer ' + token}
    # 'page' selects which page of results is returned
    query = {'sort':'popularity','page': page}
    reply = requests.get(endpoint, headers=auth_header, params=query)
    payload = reply.json()
    return payload['response']['songs']

In [3]:
#looks up the artist into genius.com's API
def artist_lookup(artist_name,token):
    """Search the Genius API for ``artist_name``.

    Issues a GET to ``/search`` with ``token`` as a bearer token and the
    artist name as the ``q`` search term.

    Returns the ``['response']['hits']`` entry of the decoded JSON reply.
    """
    base_url = 'https://api.genius.com'
    # the bearer token authorises the request
    headers = {'Authorization': 'Bearer ' + token}
    search_url = base_url + '/search'
    # Send the search term in the URL query string (params=) rather than as a
    # GET request body (data=): this matches artist_song_page_lookup and does
    # not rely on the server or intermediaries honouring a body on GET.
    params = {'q': artist_name}
    response = requests.get(search_url, params=params, headers=headers).json()['response']['hits']
    return response

In [4]:
#pulls the Urls and song title out of the API's output
def song_details(response):
    """Split song records into parallel title, URL and artist-name lists.

    ``response`` is an iterable of dict-like song records, each providing
    ``'title'``, ``'url'`` and ``['primary_artist']['name']``.

    Returns a tuple ``(songs, urls, artist)`` of three lists that share the
    same ordering as ``response``.
    """
    songs = []
    urls = []
    artist = []
    # single pass so that one-shot iterables (e.g. generators) are supported
    for song in response:
        songs.append(song['title'])
        urls.append(song['url'])
        artist.append(song['primary_artist']['name'])
    return songs,urls,artist

In [5]:
#scrapes the lyrics from the website, using the given urls
def lyric_scraper(urls):
    """Download each page in ``urls`` and extract its cleaned lyric text.

    For every URL the first ``<div>`` whose class matches ``^lyrics$`` or
    contains ``Lyrics__Root`` is located; its text is passed through
    ``lyrics_cleaner``. Pages without such a div contribute the placeholder
    string ``'None'``.

    Returns a list with one entry per URL, in the same order.
    """
    lyrics_class = re.compile("^lyrics$|Lyrics__Root")
    scraped = []
    for url in urls:
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        container = soup.find('div', class_=lyrics_class)
        if container is None:
            # placeholder string marks pages where no lyric container was found
            scraped.append('None')
            continue
        scraped.append(lyrics_cleaner(container.get_text()))
    return scraped

In [6]:
#pulls the verse labels out of the lyrics
def lyrics_cleaner(song):
    """Strip labels and page residue from scraped lyric text.

    Steps, in order:
      1. remove anything enclosed in round or square brackets, e.g. verse labels
         such as ``[Chorus]`` (non-greedy, within a single line);
      2. remove the ``EmbedShare URLCopyEmbedCopy`` text, with or without a
         leading number (apparently left over from the page's embed widget);
      3. insert a space before every capital ASCII letter, which separates
         lines that were glued together when the HTML was flattened to text.

    Returns the cleaned string.
    """
    # raw string: "\(" and "\[" are invalid escapes in a normal string literal
    verse_labels_removed = re.sub(r"[\(\[].*?[\)\]]", "", song)
    hyperlinks_removed = re.sub(r"[0-9]+EmbedShare URLCopyEmbedCopy", '', verse_labels_removed)
    hyperlinks_removed = re.sub(r"EmbedShare URLCopyEmbedCopy", '', hyperlinks_removed)
    cleaned_song = re.sub(r"([A-Z])", r" \1", hyperlinks_removed)
    return cleaned_song

In [7]:
#runs a sentiment analysis on the lyrics of each song; -1 is completely negative sentiment, 1 is completely positive
def artist_sentiment(artist_name,token):
    """Compute a sentiment polarity score for each of an artist's songs.

    The previous body called ``lyric_scraper(artist_name, token)``, but
    ``lyric_scraper`` accepts a single list of URLs, so every call raised
    ``TypeError``. The artist is now resolved through ``artist_lookup`` and
    the titles/lyrics are gathered with ``song_lists``.

    Returns a list of ``[song_title, polarity]`` pairs, where polarity is the
    first field of TextBlob's sentiment result. An empty list is returned
    when the search yields no hits.
    """
    hits = artist_lookup(artist_name,token)
    if not hits:
        return []
    # NOTE(review): assumes the first search hit belongs to the requested
    # artist and follows the Genius search schema
    # (hit['result']['primary_artist']['id']) -- confirm against the API.
    artist_id = hits[0]['result']['primary_artist']['id']
    titles, _urls, _artists, lyrics = song_lists(artist_id,token)
    sentiments = []
    for title, text in zip(titles, lyrics):
        polarity = TextBlob(text).sentiment[0]
        sentiments.append([title, polarity])
    return sentiments

In [8]:
def get_all_artist_songs(artist_id,token):
    """Collect an artist's songs across every page of API results.

    Pages are requested starting at 1 via ``artist_song_page_lookup``; each
    page is filtered with ``remastered_song_remover`` before being added.
    Paging stops after the first page that comes back empty.

    Returns the accumulated list of filtered song records.
    """
    collected = []
    page_number = 1
    while True:
        page_songs = artist_song_page_lookup(artist_id,token,page_number)
        collected.extend(remastered_song_remover(page_songs,artist_id))
        page_number += 1
        # an empty page means the previous page was the last one
        if len(page_songs) == 0:
            break
    return collected

In [9]:
# Module-level handles looked up on the client object exposed by the
# credentials module: the "songs" entry and, inside it, the "song" entry.
# NOTE(review): presumably cred.client is a database client (e.g. MongoDB),
# making these a database and a collection -- confirm in the credentials module.
songdb = cred.client["songs"]
songcol = songdb["song"]

In [71]:
def song_lists(artist_id,token):
    """Build parallel lists describing every usable song of an artist.

    Pipeline: fetch all song records (``get_all_artist_songs``), split them
    into titles/URLs/artist names (``song_details``), scrape the lyrics for
    each URL (``lyric_scraper``), then drop entries without lyrics
    (``blank_song_remover``).

    Returns the tuple ``(songs, urls, artist, lyrics)``.
    """
    titles, links, performers = song_details(get_all_artist_songs(artist_id,token))
    scraped = lyric_scraper(links)
    titles, links, performers, scraped = blank_song_remover(titles, links, performers, scraped)
    return titles, links, performers, scraped

In [89]:
def remastered_song_remover(page_songs, artist_id):
    """Filter a page of song records down to the artist's original songs.

    A record is kept only when all of the following hold:
      * its ``'url'`` ends with ``-lyrics``;
      * no duplicate flag (remix, live, demo, ...) occurs in the URL as a
        whole hyphen-delimited word;
      * ``['primary_artist']['id']`` equals ``artist_id``.

    Flags used to be matched as raw substrings, which also discarded
    unrelated titles ("believer" contains "live", "demons" contains "demo",
    "radioactive" contains "radio"). Matching is now anchored to word
    boundaries of the URL slug. Matching stays case-sensitive, as before.

    Returns a new list of the kept records, in their original order.
    """
    duplicate_flags = ['remix','sound track', 'live','music-video','version', 'grammys', 'mix', 'edit', 'vma', 'acoustic', 'demo', 'statement', 'radio', 'session', 'awards', 'extended', 'setlist']
    # URL slugs separate words with hyphens, so a flag written with a space
    # ('sound track') is converted to its hyphenated form before matching.
    flag_alternatives = '|'.join(re.escape(flag.replace(' ', '-')) for flag in duplicate_flags)
    # A flag must start right after '-' or '/' (or the string start) and be
    # followed by '-' or the end of the string.
    flag_pattern = re.compile(r'(?:^|[-/])(?:' + flag_alternatives + r')(?=-|$)')
    unique_songs = [
        song for song in page_songs
        if song['url'].endswith('-lyrics')
        and flag_pattern.search(song['url']) is None
        and song['primary_artist']['id'] == artist_id
    ]
    return unique_songs

In [77]:
def blank_song_remover(songs,urls,artist,lyrics):
    """Drop every song whose lyrics entry is the placeholder ``'None'``.

    The four lists are parallel: index ``i`` of each describes the same song.
    Entries are removed in place from all four lists, and the same list
    objects are returned as the tuple ``(songs, urls, artist, lyrics)``.

    The previous loop advanced the index after each removal, which skipped
    the element that had just shifted into the vacated slot, so consecutive
    placeholder entries were left behind.
    """
    # Walk backwards so that removing index i never shifts an unvisited index.
    for i in reversed(range(len(lyrics))):
        if lyrics[i] == 'None':
            songs.pop(i)
            urls.pop(i)
            artist.pop(i)
            lyrics.pop(i)
    return songs,urls,artist,lyrics

In [12]:
def stopword_remover(song):
    """Lower-case ``song`` and return its whitespace-separated words,
    excluding any word found in NLTK's English stop-word list.

    Order and repetitions of the remaining words are preserved.
    """
    english_stops = set(nltk.corpus.stopwords.words('english'))
    kept_words = []
    for token in song.lower().split():
        if token in english_stops:
            continue
        kept_words.append(token)
    return kept_words
        
    

In [29]:
def word_counter(lyrics):
    """Count how often each non-stop-word occurs in ``lyrics``.

    The text is tokenised by ``stopword_remover``; the result maps each
    remaining word to its number of occurrences.
    """
    tally = {}
    for token in stopword_remover(lyrics):
        tally[token] = tally.get(token, 0) + 1
    return tally

In [30]:
def cos_sim(vector1, vector2):
    """Cosine similarity of two word vectors.

    Each vector is an indexable triple ``(counts, words, length)``: a mapping
    of word -> count, the set of its words, and the vector's length (norm),
    i.e. the shape returned by ``word2vec``.

    Returns the dot product over the shared words divided by the product of
    the two lengths, or ``0`` when that product is zero.
    """
    # only words present in both vectors contribute to the dot product
    shared_words = vector1[1].intersection(vector2[1])
    dot_product = 0
    for token in shared_words:
        dot_product += vector1[0][token] * vector2[0][token]
    norm_product = vector1[2] * vector2[2]
    if norm_product == 0:
        # at least one vector is empty
        return 0
    return dot_product/norm_product

In [31]:
def word2vec(song_lyrics):
    """Turn lyric text into a bag-of-words vector.

    Returns the triple ``(counts, words, length)``: the word -> count mapping
    from ``word_counter``, the set of its keys, and the Euclidean length of
    the count vector.
    """
    frequencies = word_counter(song_lyrics)
    vocabulary = set(frequencies)
    squared_total = sum(count*count for count in frequencies.values())
    return frequencies, vocabulary, math.sqrt(squared_total)

In [32]:
def get_similarity(lyrics,similarity_threshold,songs):
    """Find pairs of songs whose lyrics are at least ``similarity_threshold`` similar.

    ``lyrics`` and ``songs`` are parallel sequences (lyric text and title).
    Every lyric is vectorised with ``word2vec`` and each unordered pair
    ``i < j`` is scored with ``cos_sim``.

    Returns a list of ``[songs[i], songs[j], score, i, j]`` entries for the
    pairs that meet the threshold.

    The score is clamped to 1.0 before comparison: the vector lengths come
    from square roots, so identical lyrics can score marginally above 1.0 and
    were previously rejected by the ``1 >= score`` upper bound.
    """
    results = []
    vector_list = [word2vec(song) for song in lyrics]
    for i, vector_1 in enumerate(vector_list):
        for j in range(i+1,len(vector_list)):
            # cosine similarity cannot exceed 1; anything above is rounding noise
            similarity_score = min(cos_sim(vector_1,vector_list[j]), 1.0)
            if similarity_score >= similarity_threshold:
                results.append([songs[i], songs[j], similarity_score,i,j])
    return results