# Group 8 Final Project

### Setup Imports and API Authentication

In [52]:
import os
import pprint as pp

import requests

# Sign up to get an API key: https://developer.musixmatch.com/signup
# Prefer supplying the key through the MUSIXMATCH_API_KEY environment variable so it
# never has to live in the notebook. The literal below is only a fallback that keeps
# existing runs working.
# NOTE(review): this fallback key is committed in plain text; rotate it and drop the fallback.
api_key = os.environ.get("MUSIXMATCH_API_KEY", "56037d6d758ee92cb000255101e08477")

### Retrieve Song List

In [176]:
# Base URL of the Musixmatch API; each request in this notebook appends an endpoint name and query string to it
api_base_url = "https://api.musixmatch.com/ws/1.1/"

def get_songs(chart_name='top', num_songs='10', country='US'):
    """Fetch up to ``num_songs`` chart entries from the Musixmatch ``chart.tracks.get`` endpoint.

    API reference: https://developer.musixmatch.com/documentation/api-reference/track-chart-get

    Args:
        chart_name: Name of the chart to query (sent as ``chart_name``).
        num_songs: How many songs to retrieve. May be an int or a numeric string;
            the default is the string ``'10'``, so the value is coerced with ``int()``.
        country: Country code sent as ``country``.

    Returns:
        A list with the ``track_list`` entries of every fetched page, truncated to
        ``num_songs`` items. Empty when ``num_songs`` is zero or negative.

    Raises:
        ValueError: If ``num_songs`` cannot be converted to an int.
        requests.HTTPError: If the API answers with an HTTP error status.
    """
    max_page_size = 100  # the API caps page_size at 100
    remaining = int(num_songs)
    if remaining <= 0:
        return []

    # Keep the page size constant across all pages. Shrinking it on the last page
    # would shift that page's offset (assuming the usual (page - 1) * page_size
    # offset) and return tracks that were already fetched.
    page_size = min(remaining, max_page_size)

    track_list = []
    page_num = 1
    while remaining > 0:
        res = requests.get(
            api_base_url + "chart.tracks.get",
            params={
                "apikey": api_key,
                "chart_name": chart_name,
                "page": page_num,
                "page_size": page_size,
                "country": country,
            },
        )
        res.raise_for_status()
        tracks = res.json()['message']['body']['track_list']
        if not tracks:
            # The chart has no more entries; further pages would be empty too.
            break
        # Trim the final page so that no more than the requested total is returned.
        track_list += tracks[:remaining]
        remaining -= page_size
        page_num += 1
    return track_list

# Build the data set: the first 1000 entries of the US "mxmweekly" chart
list_of_songs = get_songs('mxmweekly', 1000, 'US')

# Sanity check: total number of tracks, then (artist, title) for the first 10
print(len(list_of_songs))
pp.pprint([
    (entry['track']['artist_name'], entry['track']['track_name'])
    for entry in list_of_songs
][:10])

1000
[('TheFatRat feat. Laura Brehm', 'Monody (feat. Laura Brehm)'),
 ('Adam Sandler', 'The Thanksgiving Song'),
 ('Kristen Bell feat. Agatha Lee Monn & Katie Lopez',
  'Do You Want to Build a Snowman?'),
 ('Cloverton', 'A Hallelujah Christmas'),
 ('Leonard Cohen', 'Hallelujah'),
 ('Rihanna', 'California King Bed'),
 ('Rick Astley', 'Never Gonna Give You Up'),
 ('Taylor Swift', 'Anti-Hero'),
 ('Chris Tomlin', 'How Great Is Our God'),
 ('MC L da Vinte feat. MC Gury', 'Parado No Bailão')]


### Retrieve Song Metadata

In [177]:
# NOTE: Mood API not included in free API plan :(

def get_song_mood(song):
    """Call the ``track.lyrics.mood.get`` endpoint for one chart entry and return the decoded JSON.

    ``song`` is a chart entry whose ``song['track']`` dict supplies the
    ``commontrack_id`` and ``track_id`` values placed in the query string.
    """
    # NOTE(review): `track_id` is sent as the `track_isrc` parameter; an ISRC is
    # presumably a different identifier — confirm against the API reference.
    mood_url = api_base_url + f"track.lyrics.mood.get?apikey={api_key}&commontrack_id={song['track']['commontrack_id']}&track_isrc={song['track']['track_id']}"
    return requests.get(mood_url).json()

### Scrape Lyric Data

In [178]:
from bs4 import BeautifulSoup
import re


def scrape_song_lyrics(artistname, songname):
    """Scrape the lyrics of one song from its Genius page.

    The page URL is guessed as ``https://genius.com/{artist}-{song}-lyrics`` after
    turning both names into hyphen-separated slugs.

    Args:
        artistname: Artist name as returned by the chart API.
        songname: Track title as returned by the chart API.

    Returns:
        The text of every lyrics container, each followed by a single space, with
        bracketed annotations and parenthesised background lyrics removed. An empty
        string is returned when the page cannot be fetched successfully or no
        lyrics container is found.
    """
    # Format artist name for the URL. Only the standalone word "feat" is replaced,
    # so names that merely contain those letters (e.g. "Defeater") stay intact.
    artistname = re.sub(r'\bfeat\b', 'and', artistname)
    artistname = artistname.replace('&', 'and')
    artistname = artistname.replace(' ', '-')
    artistname = re.sub(r'[^\w\s-]', '', artistname)

    # Format song name for the URL
    songname = songname.replace('...', '-')
    songname = songname.replace(' ', '-')
    songname = re.sub(r'[^\w\s-]', '', songname)

    request_url = f'https://genius.com/{artistname}-{songname}-lyrics'
    page = requests.get(request_url)

    # If the guessed URL does not resolve to a normal page (404, or any other
    # non-200 answer), skip this song rather than parsing an error page.
    if page.status_code != 200:
        return ""

    # Parse lyrics from HTML
    html = BeautifulSoup(page.text, 'html.parser')
    # NOTE(review): this class name looks auto-generated and may change when the
    # site is redeployed; an empty result for every song is the symptom — confirm.
    containers = html.find_all("div", class_="Lyrics__Container-sc-1ynbvzw-6 YYrds")

    cleaned_parts = []
    for container in containers:
        text = container.get_text(separator=" ")

        # Remove Genius annotations ("[Chorus] ") and background lyrics (" (ooh)").
        # Raw string: the pattern is unchanged, but "\[" and "\(" are no longer
        # invalid string escapes.
        text = re.sub(r'\[[^\]]+\] | \([^\)]+\)', '', text)
        cleaned_parts.append(text + " ")

    # Concatenate all containers into one string
    return "".join(cleaned_parts)

# Sanity check for scrape_song_lyrics: show the first 50 characters of one known song
lyrics = scrape_song_lyrics("Zach Bryan", "The Good I'll Do")
pp.pprint(f"{lyrics[:50]}...")

"Well in You The good I'll do The good I'll do Oh, ..."


### Collect and Merge Data

In [182]:
# Attach scraped lyrics to every chart entry.
# Mood data is behind the paid API plan, so get_song_mood(song) is not called here.
songs_with_metadata = []
for song in list_of_songs:
    lyrics = scrape_song_lyrics(song['track']['artist_name'], song['track']['track_name'])

    # Store the lyrics on the entry's own track dict (the entry is modified in place)
    song['track']['lyrics'] = lyrics
    songs_with_metadata.append(song)

    # One dash per processed song, just to show progress
    print("-", end=' ')

# Preview: artist, title and first 50 characters of lyrics for the first 5 songs that have lyrics
print("\n")
pp.pprint([
    (entry['track']['artist_name'], entry['track']['track_name'], entry['track']['lyrics'][:50]+"...")
    for entry in songs_with_metadata
    if entry['track']['lyrics']
][:5])

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

### Process Data

In [185]:
from nltk.tokenize import word_tokenize
from collections import defaultdict
import string

# genre -> word -> count; missing genres and missing words both start at 0
word_count_by_genre = defaultdict(lambda: defaultdict(int))

def get_genre(track):
    """Return the name of the first primary genre of ``track``.

    Falls back to ``"Unclassified"`` when the track has no ``primary_genres``
    key, no ``music_genre_list`` inside it, or an empty genre list.
    """
    fallback = "Unclassified"
    if 'primary_genres' not in track:
        return fallback

    primary = track['primary_genres']
    if 'music_genre_list' not in primary:
        return fallback

    genre_list = primary['music_genre_list']
    if len(genre_list) <= 0:
        return fallback

    # Only the first listed genre is used
    return genre_list[0]['music_genre']['music_genre_name']
# Count, per genre, how often each lower-cased word appears in the scraped lyrics
for song in songs_with_metadata:
    if not song['track']['lyrics']:
        # Skip songs we didn't get lyrics for
        continue

    # Tokenize lyrics and remove punctuation
    tokenized_lyrics = word_tokenize(song['track']['lyrics'])
    genre = get_genre(song['track'])
    for word in tokenized_lyrics:
        # A token is punctuation when every one of its characters is punctuation.
        # `word in string.punctuation` is a substring test, so multi-character
        # tokens such as "...", "``" or "''" slipped through and were counted.
        if all(char in string.punctuation for char in word):
            continue

        word_count_by_genre[genre][word.lower()] += 1
        # word_count_by_mood[song['popularity']][word] += 1

# Dictionary where each genre maps to a list of (word, count) tuples sorted by highest count first
sorted_count_by_genre = {k1: sorted(v1.items(), key=lambda x: x[1], reverse=True) for k1, v1 in word_count_by_genre.items()}

# Check top 5 words in each genre
pp.pprint({genre: word_tuples[:5] for genre, word_tuples in sorted_count_by_genre.items()})

{'Adult Alternative': [('love', 22),
                       ('i', 21),
                       ('another', 20),
                       ('my', 16),
                       ('up', 15)],
 'Adult Contemporary': [('you', 177),
                        ('the', 128),
                        ('i', 98),
                        ('and', 76),
                        ('a', 68)],
 'Alternative': [('i', 652),
                 ('you', 541),
                 ('the', 385),
                 ('to', 251),
                 ('and', 249)],
 'Alternative Folk': [('i', 26),
                      ('you', 21),
                      ('the', 20),
                      ('na', 15),
                      ("'re", 14)],
 'Alternative Rap': [('i', 84),
                     ('it', 73),
                     ('you', 62),
                     ('me', 50),
                     ('to', 44)],
 'Ambient': [('like', 24), ('i', 22), ('it', 13), ('that', 12), ('you', 10)],
 'American Trad Rock': [('i', 16),
                        ('of'

### Output Dataset

In [186]:
import json

# Make sure the output directory exists so a fresh checkout does not fail with FileNotFoundError
os.makedirs("data", exist_ok=True)

# Serialize the per-genre (word, count) lists; tuples are written as JSON arrays
json_object = json.dumps(sorted_count_by_genre, indent=4)

# Writing to data/word_count_by_genre.json
with open("data/word_count_by_genre.json", "w", encoding="utf-8") as outfile:
    outfile.write(json_object)

In [184]:
# Store the lyric data set so we don't have to scrape again
# Make sure the output directory exists so a fresh checkout does not fail with FileNotFoundError
os.makedirs("data", exist_ok=True)
json_object_2 = json.dumps(songs_with_metadata, indent=4)
with open("data/tracks_with_lyrics.json", "w", encoding="utf-8") as outfile:
    outfile.write(json_object_2)