# Group 8 Final Project

In this file, we go through all the steps required to replicate our data set: word counts for different genres of popular music. To do this, simply step through the entire Jupyter notebook; the resulting file "word_count_by_genre.json" should resemble our dataset. 

NOTE: We are collecting data on the top songs in the last 7 days. Our dataset was created in December 2022 so any future replication will likely vary based on what the top songs are at that time. 

### Setup Imports and API Authentication

In [1]:
import os
import pprint as pp

import requests

# Sign Up to get API key: https://developer.musixmatch.com/signup
# The key is read from the MUSIXMATCH_API_KEY environment variable when it is set,
# so the secret never has to be saved inside the notebook. If the variable is not
# set, replace the placeholder below with your key.
api_key = os.environ.get("MUSIXMATCH_API_KEY", "<provide your API key>")

### Retrieve Song List

In [2]:
# API Call to get list of songs with Artist Name and Song Name
api_base_url = "https://api.musixmatch.com/ws/1.1/"

def get_songs(chart_name='top', num_songs=10, country='US'):
    """Fetch up to ``num_songs`` tracks from a Musixmatch chart.

    API reference: https://developer.musixmatch.com/documentation/api-reference/track-chart-get

    Args:
        chart_name: Name of the chart to query (this notebook uses 'mxmweekly').
        num_songs: Number of tracks wanted. Integers and numeric strings are
            both accepted; values <= 0 yield an empty list without any request.
        country: Country code forwarded to the API.

    Returns:
        A list with the ``track_list`` entries of every fetched page, at most
        ``num_songs`` long.
    """
    # Maximum page size is 100, so larger requests are split over several pages
    max_page_size = 100

    num_songs = int(num_songs)
    if num_songs <= 0:
        return []

    # The same page size is used for every page and the surplus is trimmed at
    # the end. NOTE(review): presumably the API derives the offset from
    # page * page_size, in which case shrinking the size on the last page would
    # return tracks that were already fetched -- confirm against the API docs.
    page_size = min(num_songs, max_page_size)
    page_count = -(-num_songs // page_size)  # ceiling division

    track_list = []
    for page_num in range(1, page_count + 1):
        # Passing the query through `params` lets requests URL-encode the values
        res = requests.get(
            api_base_url + "chart.tracks.get",
            params={
                "apikey": api_key,
                "chart_name": chart_name,
                "page": page_num,
                "page_size": page_size,
                "country": country,
            },
            timeout=30,
        )
        page_tracks = res.json()['message']['body']['track_list']
        if not page_tracks:
            # Nothing came back for this page, so later pages are not requested
            break
        track_list += page_tracks
    return track_list[:num_songs]

# Fetch the tracks that make up the data set
list_of_songs = get_songs(chart_name='mxmweekly', num_songs=1000, country='US')

# Sanity check: number of tracks received plus the first 10 (artist, song) pairs
print(len(list_of_songs))
pp.pprint(
    [
        (entry['track']['artist_name'], entry['track']['track_name'])
        for entry in list_of_songs
    ][:10]
)

1000
[('Adam Sandler', 'The Thanksgiving Song'),
 ('TheFatRat feat. Laura Brehm', 'Monody (feat. Laura Brehm)'),
 ('MC L da Vinte feat. MC Gury', 'Parado No Bailão'),
 ('Cloverton', 'A Hallelujah Christmas'),
 ('Kristen Bell feat. Agatha Lee Monn & Katie Lopez',
  'Do You Want to Build a Snowman?'),
 ('Leonard Cohen', 'Hallelujah'),
 ('Rihanna', 'California King Bed'),
 ('Rick Astley', 'Never Gonna Give You Up'),
 ('Taylor Swift', 'Anti-Hero'),
 ('Chris Tomlin', 'How Great Is Our God')]


### Scrape Lyric Data

In [3]:
from bs4 import BeautifulSoup
import re


def scrape_song_lyrics(artistname, songname):
    """Scrape the lyrics of one song from genius.com.

    The page URL is guessed from the artist and song name, so songs whose page
    lives under a different URL are simply not found.

    Args:
        artistname: Artist name as returned by the chart API.
        songname: Track name as returned by the chart API.

    Returns:
        The lyrics as one string, each lyrics container followed by a space,
        or "" when the page could not be fetched or held no lyrics container.
    """
    # Format Artist Name for URL
    # Word boundaries keep names that merely contain "feat" (e.g. "Defeater")
    # intact; the "." of "feat." is dropped by the character filter below.
    artistname = re.sub(r'\bfeat\b', 'and', artistname)
    artistname = artistname.replace('&', 'and')
    artistname = artistname.replace(' ', '-')
    artistname = re.sub(r'[^\w\s-]', '', artistname)

    # Format Song Name for URL
    songname = songname.replace('...', '-')
    songname = songname.replace(' ', '-')
    songname = re.sub(r'[^\w\s-]', '', songname)

    request_url = f'https://genius.com/{artistname}-{songname}-lyrics'

    # If we fail to get the song lyrics by the URL, we skip this song. Network
    # errors and timeouts are treated like a missing page so that one bad
    # request does not abort a scrape over the whole song list.
    try:
        page = requests.get(request_url, timeout=30)
    except requests.RequestException:
        return ""
    if not page.ok:
        return ""

    # Parse lyrics from HTML
    html = BeautifulSoup(page.text, 'html.parser')
    # NOTE(review): this class string looks auto-generated and may change when
    # the site is redeployed -- confirm it still matches if no lyrics come back.
    containers = html.find_all("div", class_="Lyrics__Container-sc-1ynbvzw-6 YYrds")

    # Removing Genius annotations ("[...]" followed by a space) and background
    # lyrics ("(...)" preceded by a space)
    annotation_pattern = r'\[[^\]]+\] | \([^\)]+\)'

    # Concatenating lyrics into one string
    return "".join(
        re.sub(annotation_pattern, '', container.get_text(separator=" ")) + " "
        for container in containers
    )

# Smoke test for the scraper: show the first 50 characters of one known song
lyrics = scrape_song_lyrics(artistname="Zach Bryan", songname="The Good I'll Do")
pp.pprint(f"{lyrics[:50]}...")

"Well in You The good I'll do The good I'll do Oh, ..."


### Collect and Merge Data

In [4]:
# Attach scraped lyrics to every track returned by the chart API
songs_with_metadata = []
for song in list_of_songs:
    track_info = song['track']
    lyrics = scrape_song_lyrics(track_info['artist_name'], track_info['track_name'])

    # One dash per processed song serves as a simple progress indicator
    print("-", end = ' ')

    # The track dict is updated in place, so list_of_songs carries the lyrics too
    track_info['lyrics'] = lyrics
    songs_with_metadata.append(song)

# Preview the first 5 songs that have lyrics: artist, title, first 50 characters
print("\n")
pp.pprint(
    [
        (
            track['track']['artist_name'],
            track['track']['track_name'],
            track['track']['lyrics'][:50]+"...",
        )
        for track in songs_with_metadata
        if track['track']['lyrics']
    ][:5]
)

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

### Process Data

In [5]:
from nltk.tokenize import word_tokenize
from collections import defaultdict
import string

# genre -> word -> count; unseen genres and unseen words both start at 0
word_count_by_genre = defaultdict(lambda: defaultdict(int))

def get_genre(track):
    """Return the name of the first primary genre listed for a track.

    Falls back to "Unclassified" when the track has no 'primary_genres' entry,
    no 'music_genre_list' inside it, or an empty genre list.
    """
    if 'primary_genres' not in track:
        return "Unclassified"

    primary_genres = track['primary_genres']
    if 'music_genre_list' not in primary_genres:
        return "Unclassified"

    genre_list = primary_genres['music_genre_list']
    if len(genre_list) == 0:
        return "Unclassified"

    # Only the first listed genre is used
    return genre_list[0]['music_genre']['music_genre_name']
    
# Individual characters that count as punctuation when filtering tokens
punctuation_chars = set(string.punctuation)

for song in songs_with_metadata:
    if not song['track']['lyrics']:
        # Skip songs we didn't get lyrics for
        continue

    # Tokenize lyrics; the genre decides which counter the words go into
    tokenized_lyrics = word_tokenize(song['track']['lyrics'])
    genre = get_genre(song['track'])
    for word in tokenized_lyrics:
        # Remove any remaining punctuation from the count. Every character is
        # checked because `word in string.punctuation` is a substring test that
        # lets multi-character tokens such as "...", "--" or "''" through.
        if all(char in punctuation_chars for char in word):
            continue

        # Counts are case-insensitive
        word_count_by_genre[genre][word.lower()] += 1
        
# Dictionary mapping each genre to its (word, count) tuples, highest count first
sorted_count_by_genre = {
    genre: sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    for genre, counts in word_count_by_genre.items()
}

# Check top 5 words in each genre
pp.pprint({genre: pairs[:5] for genre, pairs in sorted_count_by_genre.items()})

{'Adult Alternative': [('love', 22),
                       ('i', 21),
                       ('another', 20),
                       ('my', 16),
                       ('up', 15)],
 'Adult Contemporary': [('you', 177),
                        ('the', 128),
                        ('i', 98),
                        ('and', 76),
                        ('a', 68)],
 'Alternative': [('i', 611),
                 ('you', 504),
                 ('the', 374),
                 ('to', 243),
                 ('and', 237)],
 'Alternative Folk': [('i', 26),
                      ('you', 21),
                      ('the', 20),
                      ('na', 15),
                      ("'re", 14)],
 'Alternative Rap': [('i', 101),
                     ('it', 91),
                     ('you', 79),
                     ("'s", 55),
                     ('me', 52)],
 'Ambient': [('like', 24), ('i', 22), ('it', 13), ('that', 12), ('you', 10)],
 'American Trad Rock': [('i', 16),
                        ('of

### Remove Stop Words

The above output contains common stop words such as "a", "the" and "an" that may not be what the consumer of this dataset desires. In the next section we filter the data dictionary to remove the stop words listed in /data/stop-words.txt

In [16]:
# Read one stop word per line; the context manager closes the file afterwards
with open("data/stop-words.txt", "r") as stop_words_file:
    stop_words = [word.strip() for word in stop_words_file.readlines()]
sorted_count_by_genre_stop = {}

def filter_stop_words(genre_word_count, stop_words):
    """Return a copy of the per-genre word counts without stop words.

    Args:
        genre_word_count: Dict mapping a genre to a sequence of
            (word, count) pairs.
        stop_words: Iterable of words to drop.

    Returns:
        A new dict with the same genres; each value is a list of the pairs
        whose word is not a stop word, in their original order. The input is
        not modified.
    """
    # Built once: membership tests become constant time and a one-shot
    # iterable of stop words is not exhausted after the first lookup
    stop_word_set = set(stop_words)

    return {
        genre: [pair for pair in word_counts if pair[0] not in stop_word_set]
        for genre, word_counts in genre_word_count.items()
    }


# Apply the stop-word filter, then check the top 5 remaining words per genre
sorted_count_by_genre_stop = filter_stop_words(
    genre_word_count=sorted_count_by_genre,
    stop_words=stop_words,
)
pp.pprint({genre: pairs[:5] for genre, pairs in sorted_count_by_genre_stop.items()})
    

{'Adult Alternative': [('love', 22),
                       ('another', 20),
                       ('tears', 12),
                       ('used', 12),
                       ('wan', 10)],
 'Adult Contemporary': [('na', 45),
                        ('never', 44),
                        ('gon', 43),
                        ('someone', 34),
                        ('saved', 30)],
 'Alternative': [('know', 90),
                 ('oh', 77),
                 ('like', 57),
                 ('never', 43),
                 ('tell', 41)],
 'Alternative Folk': [('na', 15),
                      ('got', 8),
                      ('gon', 8),
                      ('wan', 7),
                      ('dark', 6)],
 'Alternative Rap': [('like', 28),
                     ('hey', 27),
                     ('ya', 27),
                     ('right', 19),
                     ('baby', 15)],
 'Ambient': [('like', 24), ('could', 6), ('lie', 6), ('say', 6), ('let', 5)],
 'American Trad Rock': [('tired', 7),
 

### Output Dataset

In [17]:
import json

# Serialize the full per-genre word counts (stop words included)
json_object = json.dumps(sorted_count_by_genre, indent=4)

# Save the result as data/word_count_by_genre.json
with open("data/word_count_by_genre.json", mode="w") as outfile:
    outfile.write(json_object)

### Output Dataset without stop words

In [18]:
import json

# Serialize the per-genre word counts with the stop words filtered out
json_object_stop = json.dumps(sorted_count_by_genre_stop, indent=4)

# Save the result as data/word_count_by_genre_stop.json
with open("data/word_count_by_genre_stop.json", mode="w") as outfile:
    outfile.write(json_object_stop)

In [184]:
# Store the scraped lyric data set so processing can be re-run without scraping again
# NOTE: This file is not included with the data set, but it can be recreated using the above code
json_object_2 = json.dumps(songs_with_metadata, indent=4)

with open("data/tracks_with_lyrics.json", mode="w") as outfile:
    outfile.write(json_object_2)