# imports and functions

In [35]:
import pandas as pd
from os import chdir
import re
import requests
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from bs4 import BeautifulSoup
from bs4 import element
from tqdm import tqdm

In [43]:

def _lyricsHelper(html, lyrics_list):
    """Collect lyric text from the children of a non-text node (<i>, <a>, ...).

    Every direct child of ``html`` is handled in document order:
      * a plain text node is stripped and passed to ``_handleLyricAppend``;
      * a ``<br>`` starts a new, empty line unless the current last line is
        already empty;
      * any other child is descended into only when ``html`` itself is an
        ``<a>`` tag, and is ignored otherwise.

    ``lyrics_list`` is modified in place; nothing is returned.
    """
    for child in html.childGenerator():
        # Exact type check on purpose: subclasses of NavigableString are not treated as lyric text.
        if type(child) is element.NavigableString:
            _handleLyricAppend(lyrics_list, child.text.strip())
            continue
        if child.name == 'br' and lyrics_list[-1] != '':
            lyrics_list.append('')
            continue
        # NOTE(review): this tests the parent's tag name, not the child's, so nested
        # tags under anything other than <a> are dropped - confirm that is intended.
        if html.name == 'a':
            _lyricsHelper(child, lyrics_list)

def _getLyricsFromHTML(html):
    """Return the lyrics contained in ``html`` as a list of lines.

    Every ``<div data-lyrics-container="true">`` is located and its direct
    children are walked in order: plain text is appended to the current line,
    other nodes are delegated to ``_lyricsHelper``, and a ``<br>`` closes the
    current line when that line is not empty.

    The result is seeded with one empty line, so a page without any lyrics
    container yields ``['']``.
    """
    lines = ['']
    containers = html.findAll("div", {"data-lyrics-container" : "true"})
    for container in containers:
        for node in container.childGenerator():
            if type(node) is element.NavigableString:
                _handleLyricAppend(lines, node.strip())
            else:
                _lyricsHelper(node, lines)
            # A <br> sitting directly inside the container ends the current line.
            if node.name == 'br' and lines[-1] != '':
                lines.append('')
    return lines

#Helper function to handle appending and manipulating lyrics_list. A new line is generated only for </br> tags
def _handleLyricAppend(lyrics_list, lyric):
    if lyric is not None:
        last_index = len(lyrics_list) - 1
        #Handle special cases (parenthesis and symbols stick with words for instance)
        if lyrics_list[last_index] != '' and (lyrics_list[last_index][-1] in ['(','[','{','<'] or lyric in [')',']','}','>','!','?',',','.']):
            lyrics_list[last_index] += lyric
        else:
            lyrics_list[last_index] += " " + lyric if lyrics_list[last_index] != '' else lyric

#Determines whether a song will need to be translated (returns the link if it does, otherwise returns None)
def _getSongTranslationLink(html):
    translation_tags = html.findAll('a', class_='TextButton-sc-192nsqv-0 hVAZmF LyricsControls__TextButton-sghmdv-6 hXTHjZ')
    for tag in translation_tags:
        if "english-translations" in tag['href']:
            return tag['href']
    return None

#Determines whether a page exists
def _pageExists(html):
    return html.find('div', class_='render_404') == None
        
def scrapeLyrics(artistnames, songname, timeout=10):
    """Scrape the lyrics of a song from genius.com.

    One URL of the form ``https://genius.com/<artist>-<song>-lyrics`` (spaces
    replaced by dashes) is tried per artist, in order, until a page is found
    that ``_pageExists`` accepts. If that page links to an English translation,
    the translation page is scraped instead. Otherwise the lyrics are returned
    only when one of the page's ``<script>`` tags marks the language as English.

    Args:
        artistnames: list of artist names to try, in order.
        songname: title of the song.
        timeout: seconds to wait for each HTTP request before giving up.

    Returns:
        The lyrics as a list of lines, or ``[]`` when no page was found or the
        song is neither English nor translated.

    Raises:
        requests.exceptions.RequestException: on network errors, including a
            timeout (without a timeout a stalled connection would hang forever).
    """
    song_slug = songname.replace(' ', '-')
    html = None
    found = False
    for artistname in artistnames:
        page_url = 'https://genius.com/' + artistname.replace(' ', '-') + '-' + song_slug + '-' + 'lyrics'
        page = requests.get(page_url, timeout=timeout)
        html = BeautifulSoup(page.text, 'html.parser')
        if _pageExists(html):
            found = True
            break
    if not found:
        return []

    # Prefer the English translation when the page offers one.
    translation_url = _getSongTranslationLink(html)
    if translation_url is not None:
        page = requests.get(translation_url, timeout=timeout)
        html = BeautifulSoup(page.text, 'html.parser')
        return _getLyricsFromHTML(html)

    # No translation: keep the lyrics only if the page declares itself English.
    english = any("language\\\":\\\"en" in str(script) for script in html.findAll('script'))
    return _getLyricsFromHTML(html) if english else []
    


In [33]:
def getTitlesAndArtists(sp, track_ids):
    """Look up title and artist names for a list of Spotify tracks.

    Args:
        sp: client exposing ``tracks(list_of_ids)`` that returns a dict with a
            ``'tracks'`` list (a ``spotipy.Spotify`` instance in this notebook).
        track_ids: list of track ids/URIs to look up.

    Returns:
        ``{track id: {'title': str, 'artist(s)': [str, ...]}}``. A trailing
        ``" (feat. ...)"`` suffix is removed from the title. Tracks the API
        could not resolve are left out.
    """
    # The tracks endpoint is queried in chunks of 50 ids per request.
    batch_size = 50
    # Trailing " (feat. X)" only; the dot is escaped so it matches a literal ".".
    feat_suffix = re.compile(r' \(feat\. .*\)$')

    titleArtistPairs = {}
    for start in range(0, len(track_ids), batch_size):
        response = sp.tracks(track_ids[start:start + batch_size])
        for track in response['tracks']:
            # Unknown ids come back as None placeholders; skip them instead of crashing.
            if track is None:
                continue
            title = feat_suffix.sub('', track['name'])
            artists = [artist['name'] for artist in track['artists']]
            titleArtistPairs[track['id']] = {'title': title, 'artist(s)': artists}

    return titleArtistPairs

def getScrapedLyrics(scraperInputs):
    """Scrape lyrics for every track in ``scraperInputs``.

    Args:
        scraperInputs: ``{track id: {'title': str, 'artist(s)': [str, ...]}}``,
            the shape returned by ``getTitlesAndArtists``.

    Returns:
        ``{track id: [lyric line, ...]}`` holding only the tracks for which
        some lyric text was actually scraped.
    """
    all_lyrics_dict = {}
    for track_id, songInfo in tqdm(scraperInputs.items()):
        #maybe add a sleep or something to prevent getting blocked
        lyrics = scrapeLyrics(songInfo['artist(s)'], songInfo['title'])
        # _getLyricsFromHTML seeds its result with '', so "page found but no text"
        # arrives as [''] - a length check would store that as if it were lyrics.
        if any(lyrics):
            all_lyrics_dict[track_id] = lyrics

    return all_lyrics_dict

# retrieving songs

In [12]:
# Credentials are taken from the SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET environment
# variables (what SpotifyClientCredentials reads when called without arguments) so that
# no secret is stored in the notebook.
# NOTE: the id/secret pair previously hard-coded in this cell has been exposed and should be rotated.
sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())


In [14]:
# Switch the working directory to the folder holding the emotions CSV read in the next cell.
# NOTE(review): absolute, machine-specific path - the notebook only runs on this machine as is;
# consider a configurable data directory instead.
chdir('C:/Users/mlar5/OneDrive/Desktop/Code Folder/Python Projects/IRL projects/Aspire - Affective Computing Project/Playlists Data/Audio Features/emotion joint data')

In [15]:
# Load the merged emotions table; the relative path is resolved against the current working directory.
# Later cells rely on its 'mood' and 'uri' columns.
emotionsDF = pd.read_csv('Merged Emotions Data4.csv')

In [20]:
# Distinct values of the 'mood' column; the cells below iterate over them to work mood by mood
mood_ids = emotionsDF['mood'].unique()

In [None]:
# Build a balanced frame that keeps at most MAX_SONGS_PER_MOOD songs for each mood.
MAX_SONGS_PER_MOOD = 100

balanced_parts = []
for mood in mood_ids:
    mood_rows = emotionsDF[emotionsDF['mood'] == mood]
    # Over-represented moods are down-sampled; smaller moods are kept whole (shuffled).
    # The same fixed seed is used on both paths so a re-run selects the same rows.
    sample_size = min(len(mood_rows), MAX_SONGS_PER_MOOD)
    balanced_parts.append(mood_rows.sample(n=sample_size, random_state=42))

# Concatenate once instead of growing the frame inside the loop; fall back to an
# empty frame with the same columns when there are no moods at all.
if balanced_parts:
    balancedDF = pd.concat(balanced_parts)
else:
    balancedDF = pd.DataFrame(columns=emotionsDF.columns)

balancedDF['mood'].value_counts()

In [27]:
# Map every mood to the list of 'uri' values of its rows in balancedDF
ids_dict = {
    mood: balancedDF.loc[balancedDF['mood'] == mood, 'uri'].tolist()
    for mood in mood_ids
}


In [29]:
# Replace each mood's id list with the {track id: {'title', 'artist(s)'}} mapping from Spotify.
# NOTE(review): ids_dict is overwritten in place, so this cell cannot be run twice in a row.
for mood, track_ids in ids_dict.items():
    ids_dict[mood] = getTitlesAndArtists(sp, track_ids)

In [30]:
# Inspect the titles and artists collected for the 'sad' mood
ids_dict['sad']

{'419w8oZaqevgyZfdvFZEHa': {'title': 'Te Pintaron Pajaritos',
  'artist(s)': ['Yandar & Yostin', 'Andy Rivera']},
 '5rKvVyqtNT1EvrNuF9tLoF': {'title': "Llamé Pa' Verte",
  'artist(s)': ['Nio Garcia', 'Darell']},
 '5qFeMRz4TvetPmzjre9Dq7': {'title': 'No Es Justo',
  'artist(s)': ['J Balvin', 'Zion & Lennox']},
 '79aj4SR6l5FKOJqy1TdtSv': {'title': 'Fall In Love - Acoustic',
  'artist(s)': ['Bailey Zimmerman']},
 '6AzKhCHOms83jvNVLsz0Bt': {'title': 'Djadja (feat. Maluma) - Remix',
  'artist(s)': ['Aya Nakamura', 'Maluma']},
 '0QN1rkBy3D6j9Yw3Czlcsc': {'title': 'Ella Quiere Hmm... Haa... Hmm... - Yayo Remix',
  'artist(s)': ['Leka el Poeta', 'Mishelle Master Boys', 'DJ Yayo']},
 '4epozEKOwPtszj2zaKeVIP': {'title': 'with you', 'artist(s)': ['The Rose']},
 '2Q7KYjCv7XaLc0KRGY9m50': {'title': 'Bonita', 'artist(s)': ['Jeeiph']},
 '2BJy4svtrGACqRB5BFLOK6': {'title': 'Way of the Triune God - Hallelujah Version',
  'artist(s)': ['Tyler Childers']},
 '2bPEmMFmuV7Szif2UmK1t8': {'title': 'Congratula

In [36]:
# Replace each mood's title/artist mapping with the lyrics scraped for those tracks.
# NOTE(review): ids_dict is overwritten in place again, so this cell is not re-runnable either.
for mood, scraper_inputs in ids_dict.items():
    ids_dict[mood] = getScrapedLyrics(scraper_inputs)

100%|██████████| 100/100 [00:47<00:00,  2.11it/s]
100%|██████████| 100/100 [00:40<00:00,  2.48it/s]
100%|██████████| 100/100 [00:58<00:00,  1.72it/s]
100%|██████████| 100/100 [01:01<00:00,  1.61it/s]
100%|██████████| 100/100 [00:52<00:00,  1.90it/s]
100%|██████████| 100/100 [01:58<00:00,  1.18s/it]
100%|██████████| 100/100 [01:08<00:00,  1.46it/s]
100%|██████████| 100/100 [00:54<00:00,  1.83it/s]


In [37]:
# Display the scraped lyrics for every mood (the full dictionary, so the output is very long)
ids_dict

{'angry': {'1SSv8SA2OHfOUwLgb8yOum': ['[Chorus]',
   'Catch me on your block',
   "Always mobbin' with my niggas",
   "Countin' hunnids up to the light",
   'Hunnid niggas in the ceiling, yeah',
   'Bout the fuckin guap!',
   "I don't think they even get it",
   "Pockets stacked up like it's lunch",
   'Pull up nev-',
   '[Intro]',
   'Weh the bloodclat do you, siddung?',
   '(Tokyo Ghoul intro)',
   '[Verse 1]',
   'Cheated on the code, I can see it clearly now (Yeah)',
   "Thought you would've known bout how it flows out of my mouth (Yeah)",
   "They don't wanna see me when my jeans got on the pouch (Yeah)",
   'Baby wants to snort from a nigga out of town',
   '[Chorus]',
   'Catch me on your block',
   "Always mobbin' with my niggas",
   "Countin' hunnids up to the light",
   'Hunnid niggas in the ceiling, yeah',
   'Bout the fuckin guap!',
   "I don't think they even get it",
   "Pockets stacked up like it's lunch",
   "Pull up never causin' racket, fight",
   '[Verse 2]',
   'Dam

In [None]:
# Switch the working directory to the 'Track Lyrics' folder.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory instead.
chdir('C:/Users/mlar5/OneDrive/Desktop/Code Folder/Python Projects/IRL projects/Aspire - Affective Computing Project/Playlists Data/Track Lyrics/')

In [40]:
# Display the mood labels used as keys of ids_dict
mood_ids

array(['angry', 'calm', 'content', 'depressed', 'energetic', 'excited',
       'happy', 'sad'], dtype=object)

In [41]:
# Inspect the lyrics scraped for the 'sad' mood
ids_dict['sad']

{'5qFeMRz4TvetPmzjre9Dq7': ['[Letra de "No Es Justo"]',
  '[Intro: Zion]',
  '(Oh yeah)',
  'Woh-oh-oh',
  '(Eh-yeh-yeh, yeh-yeh-yeh, yeh)',
  '[Coro: J Balvin, Zion & Lennox]',
  'Cuando empiezas a bailar, no es justo, no es justo',
  'Y lo noto en tu mirar, te gusto, te gusto (Baby)',
  'Sonando esta canción y yo viéndote',
  'Si te acercas a mí, no pares (Pares; ah)',
  'Y si te digo "Está linda" una y otra vez',
  'Eso te gusta y lo sabes (Yah, yah, yah)',
  'Cuando empiezas a bailar (Sup, sup)',
  'No es justo, no es justo (Mamacita ; eh-yeah)',
  'Y lo noto en tu mirar , te gusto, te gusto (Sup; dile Z)',
  'Sonando esta canción y yo viéndote',
  'Si te acercas a mí, no pares',
  'Y si te digo, "Está\' linda" una y otra vez (Wow-wow-wow-wow)',
  'Eso te gusta y lo sabes',
  '[Verso 1: Lennox]',
  'Sin ti no existe la discoteca (Discoteca)',
  'Ya te he dado mucho y poco tú me das (Dame más; yeah)',
  'De la fiesta eres la pieza que falta (Sí, sí)',
  "Si pa' pasarla rico sólo fal

In [44]:
# Fetch title/artists for a single track ('No Es Justo' in the 'sad' listing above) to re-check the scraper on it
spanish_song = getTitlesAndArtists(sp, ['5qFeMRz4TvetPmzjre9Dq7'])

In [45]:
# Scrape that single track and display the resulting {track id: lyric lines} dictionary
getScrapedLyrics(spanish_song)

100%|██████████| 1/1 [00:00<00:00,  3.56it/s]


{'5qFeMRz4TvetPmzjre9Dq7': ['[Letra de "No Es Justo"]',
  '[Intro: Zion]',
  '(Oh yeah)',
  'Woh-oh-oh',
  '(Eh-yeh-yeh, yeh-yeh-yeh, yeh)',
  '[Coro: J Balvin, Zion & Lennox]',
  'Cuando empiezas a bailar, no es justo, no es justo',
  'Y lo noto en tu mirar, te gusto, te gusto (Baby)',
  'Sonando esta canción y yo viéndote',
  'Si te acercas a mí, no pares (Pares; ah)',
  'Y si te digo "Está linda" una y otra vez',
  'Eso te gusta y lo sabes (Yah, yah, yah)',
  'Cuando empiezas a bailar (Sup, sup)',
  'No es justo, no es justo (Mamacita ; eh-yeah)',
  'Y lo noto en tu mirar , te gusto, te gusto (Sup; dile Z)',
  'Sonando esta canción y yo viéndote',
  'Si te acercas a mí, no pares',
  'Y si te digo, "Está\' linda" una y otra vez (Wow-wow-wow-wow)',
  'Eso te gusta y lo sabes',
  '[Verso 1: Lennox]',
  'Sin ti no existe la discoteca (Discoteca)',
  'Ya te he dado mucho y poco tú me das (Dame más; yeah)',
  'De la fiesta eres la pieza que falta (Sí, sí)',
  "Si pa' pasarla rico sólo fal