In [1]:
import musicbrainzngs
from musicbrainzngs import WebServiceError

# Identify this client (application name, version, contact address) to the
# musicbrainzngs library before any search request is issued.
musicbrainzngs.set_useragent(
    "Nouserentered Song Analysis Project",
    "0.1.0",
    contact="matthew.shu@yale.edu",
)

In [2]:
from difflib import SequenceMatcher

# Modified from: https://stackoverflow.com/a/17388505
def match(orig_name, found_name, threshold=.7):
    """Decide whether two artist names are similar enough to be the same artist.

    The names are compared case-insensitively, so differences in
    capitalisation alone (e.g. "A-Ha" vs "a-ha") never prevent a match.

    Args:
        orig_name: First name to compare (a string).
        found_name: Second name to compare (a string).
        threshold: Similarity ratio, between 0 and 1, that must be strictly
            exceeded for the names to count as a match. Defaults to 0.7.

    Returns:
        True if the SequenceMatcher similarity ratio of the two casefolded
        names is greater than ``threshold``, otherwise False.
    """
    # casefold() is the Unicode-aware form of lower(), intended for caseless comparison.
    left = orig_name.casefold()
    right = found_name.casefold()
    return SequenceMatcher(None, left, right).ratio() > threshold

In [3]:
def search_artist(artist_id: int, name: str):
    """Search MusicBrainz for ``name`` and return data for the best hit.

    Args:
        artist_id: Local identifier stored in the result under "artist_id".
        name: Artist name used as the search query.

    Returns:
        * The first search hit (a dict), with "artist_id" added, when its
          "name" passes ``match`` against ``name``.
        * ``{"name": name, "artist_id": artist_id}`` when there are no hits or
          the first hit does not pass ``match``.
        * ``None`` when the request raises ``WebServiceError``; the error is
          printed rather than re-raised.
    """
    try:
        response = musicbrainzngs.search_artists(name)
    except WebServiceError as exc:
        print("Something went wrong with the request: %s" % exc)
        return None

    candidates = response["artist-list"]
    # Only the first hit is ever considered, which isn't always great if
    # several artists share a similar name.
    if candidates and match(candidates[0]["name"], name):
        best_hit = candidates[0]
        best_hit["artist_id"] = artist_id
        return best_hit

    # Nothing usable came back: fall back to the bare minimum we already know.
    return {"name": name, "artist_id": artist_id}
def search_artist_series(artist_id: int, name: str):
    """Run ``search_artist`` and return its result as a flat pandas Series.

    Args:
        artist_id: Local identifier forwarded to ``search_artist``.
        name: Artist name forwarded to ``search_artist``.

    Returns:
        The first row of ``pd.json_normalize`` applied to the lookup result
        (nested keys flattened into columns), or ``None`` when
        ``search_artist`` returned nothing.
    """
    artist_data = search_artist(artist_id, name)
    if not artist_data:
        return None
    flattened = pd.json_normalize(artist_data)
    return flattened.iloc[0]

# Load Data

In [4]:
# Load the dotenv IPython extension and invoke its %dotenv magic.
# NOTE(review): presumably this reads a local .env file so that PLAYBACK_FILE
# (read with os.getenv further down) is available — confirm.
%load_ext dotenv
%dotenv

import os
import sqlite3
import pandas as pd

In [5]:
from tqdm.auto import tqdm
# Register tqdm with pandas; this provides the progress_apply method that the
# artist lookup cell calls on df_unique_artists.
tqdm.pandas()

In [6]:
# Path of the SQLite database, taken from the PLAYBACK_FILE environment variable.
SQL_FILEPATH = os.getenv("PLAYBACK_FILE")
if not SQL_FILEPATH:
    # Fail fast with a clear message: sqlite3.connect(None) raises an opaque
    # TypeError, and sqlite3.connect("") silently opens a temporary database.
    raise RuntimeError(
        "PLAYBACK_FILE is not set; define it in the environment or a .env file"
    )
lyrics_db = sqlite3.connect(SQL_FILEPATH)

# Read the whole tracks table into memory and preview the first rows.
df = pd.read_sql_query("SELECT * FROM tracks", lyrics_db)
print(df.head())

   index  track_id  year           artist                        track  rank  \
0      0         0  1985   USA For Africa             We Are the World     1   
1      1         1  1985             A-Ha                   Take On Me     2   
2      2         2  1985        Foreigner  I Want to Know What Love Is     3   
3      3         3  1985  Tears For Fears                        Shout     4   
4      4         4  1985    Lionel Richie              Say You, Say Me     5   

                                                link  \
0  /charts/top-100-songs/video/1985/USA-For-Afric...   
1    /charts/top-100-songs/video/1985/AHa-Take-On-Me   
2  /charts/top-100-songs/video/1985/Foreigner-I-W...   
3  /charts/top-100-songs/video/1985/Tears-For-Fea...   
4  /charts/top-100-songs/video/1985/Lionel-Richie...   

                                              lyrics  
0  We Are the World Lyrics\nThere comes a time wh...  
1                                               None  
2  I Want to Know

# Search MusicBrainz

In [7]:
# Keep the first few rows of the tracks table as a small sample; the bare name
# on the last line renders it as the cell output.
small_df = df.head()
small_df

Unnamed: 0,index,track_id,year,artist,track,rank,link,lyrics
0,0,0,1985,USA For Africa,We Are the World,1,/charts/top-100-songs/video/1985/USA-For-Afric...,We Are the World Lyrics\nThere comes a time wh...
1,1,1,1985,A-Ha,Take On Me,2,/charts/top-100-songs/video/1985/AHa-Take-On-Me,
2,2,2,1985,Foreigner,I Want to Know What Love Is,3,/charts/top-100-songs/video/1985/Foreigner-I-W...,I Want to Know What Love Is Lyrics\nI've gotta...
3,3,3,1985,Tears For Fears,Shout,4,/charts/top-100-songs/video/1985/Tears-For-Fea...,"Shout Lyrics\nShout, shout, let it all out\nTh..."
4,4,4,1985,Lionel Richie,"Say You, Say Me",5,/charts/top-100-songs/video/1985/Lionel-Richie...,"Say You, Say Me Lyrics\nSay you, say me\nSay i..."


In [8]:
# https://datascience.stackexchange.com/a/89267
# Assign an integer id to every distinct artist string. pd.factorize numbers the
# distinct values from 0 in order of first appearance, so adding 1 makes ids start at 1.
df['artist_id'] = pd.factorize(df['artist'])[0] + 1 # just don't want to start at 0

In [9]:
# Array of the distinct artist names found in df.
artists = df.artist.unique()
# Alias of the full array; no truncation is applied here.
artists_head = artists
# Report how many distinct artists there are.
print(artists_head.size)

1811


In [10]:
# Collapse df to one row per artist. For every other column, first() keeps the
# first non-null value within that artist's group (this includes artist_id).
df_unique_artists = df.groupby('artist', as_index=False).first()

In [11]:
# Look up every unique artist through search_artist_series, one call per row,
# showing a tqdm progress bar. Each call issues a MusicBrainz search request via
# search_artist, so this cell needs network access (1811 rows in the recorded run).
artists_df = df_unique_artists.progress_apply(lambda row: search_artist_series(row["artist_id"], row["artist"]), axis=1)

  0%|          | 0/1811 [00:00<?, ?it/s]

In [12]:
# Save the lookup results to a pickle file in the working directory.
# NOTE(review): presumably kept so the results can be reloaded without repeating
# the MusicBrainz queries — no cell shown here reads the file back; confirm.
artists_df.to_pickle("22-04-27-artists-pickle")

In [13]:
# Replace every hyphen in the column names with an underscore.
# NOTE(review): presumably to get SQL-friendly column names before to_sql — confirm.
artists_df.columns = [i.replace('-', '_') for i in artists_df.columns]

In [14]:
# For each track row, store how many rows in df share its artist_id.
df['artist_appearances'] = df.groupby('artist_id')['artist_id'].transform('count')

In [15]:
# Rename the "id" column to "mb_id".
# NOTE(review): presumably to mark it as the MusicBrainz identifier and keep it
# distinct from the locally assigned artist_id — confirm.
artists_df = artists_df.rename(columns={"id": "mb_id"})

In [16]:
# Convert every cell to its string form, then (re)create the "artists" table.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases in
# favour of DataFrame.map; kept here so the cell still runs on older versions.
artists_df.applymap(str).to_sql("artists", lyrics_db, if_exists="replace")
# index=False: df was loaded from this same table and already carries an "index"
# column from an earlier write. Writing the DataFrame index again would add an
# extra "level_0" column, and the next full run of the notebook would then fail
# with a duplicate-name error.
df.to_sql("tracks", lyrics_db, if_exists="replace", index=False)

3595