# Using Genius API to obtain full-text lyrics and language information for our song dataset

In [31]:
import pandas as pd
import numpy as np
from lyricsgenius import Genius
from dotenv import load_dotenv
import os

In [32]:
data_dir = "data"

In [33]:
no_lyric_data = pd.read_csv(os.path.join(data_dir, "msd_mxm+tagtraum.csv"))
no_lyric_data

Unnamed: 0,msd_tid,mxm_tid,title,artist,genre,is_test
0,TRAAAED128E0783FAB,2516445,It's About Time,Jamie Cullum,Jazz,0
1,TRAAAEF128F4273421,3759847,Something Girls,Adam Ant,Rock,0
2,TRAAAGF12903CEC202,5493388,Små ord,Halvdan Sivertsen,Pop,0
3,TRAAAHZ128E0799171,1619153,The One and Only,Snoop Dogg,Rap,0
4,TRAAARJ128F9320760,1422131,Pink World,Planet P Project,Rock,0
...,...,...,...,...,...,...
83187,TRZZSXX128F93262C9,2736386,My Mother Was a Chinese Trapeze Artist,Tarkio,Rock,1
83188,TRZZUTD12903CADD68,8852681,Solo Dolo (Nightmare),Kid Cudi,Rap,1
83189,TRZZWEM128F428BD9A,1441760,Operator's Manual,Buzzcocks,Punk,1
83190,TRZZXOQ128F932A083,4292070,After,Riverside,Rock,1


In [34]:
load_dotenv()
genius = Genius()
genius.verbose = False # Turn off status messages
genius.remove_section_headers = True # Remove section headers (e.g. [Chorus]) from lyrics when searching

## Defining function for fetching lyrics

The following function will be used in conjuction with the `apply()` function of the dataframe to fetch lyrics for every row of the dataset:

In [35]:
def get_lyrics(row):
  result = genius.search_song(row.title, row.artist, get_full_info=True)
  if result is not None:
    data = result.to_dict()
    return pd.Series([row.msd_tid, data["language"], data["lyrics"]], index=['msd_tid', 'language', 'lyrics'])
  else:
    return pd.Series([row.msd_tid, None, None], index=['msd_tid', 'language', 'lyrics'])

a quick demo:

In [45]:
lyrics = no_lyric_data[:3].apply(get_lyrics, axis=1, result_type="expand")

In [46]:
lyrics

Unnamed: 0,msd_tid,language,lyrics
0,TRAAAED128E0783FAB,en,It’s About Time Lyrics\nWalking down to the wa...
1,TRAAAEF128F4273421,en,Something Girls LyricsEvery girl is a somethin...
2,TRAAAGF12903CEC202,no,"Små ord LyricsSmå ord, mjuke ord\nEt øre mot e..."


In [38]:
np.array_split(no_lyric_data, 200)[0]

Unnamed: 0,msd_tid,mxm_tid,title,artist,genre,is_test
0,TRAAAED128E0783FAB,2516445,It's About Time,Jamie Cullum,Jazz,0
1,TRAAAEF128F4273421,3759847,Something Girls,Adam Ant,Rock,0
2,TRAAAGF12903CEC202,5493388,Små ord,Halvdan Sivertsen,Pop,0
3,TRAAAHZ128E0799171,1619153,The One and Only,Snoop Dogg,Rap,0
4,TRAAARJ128F9320760,1422131,Pink World,Planet P Project,Rock,0
...,...,...,...,...,...,...
411,TRADCJT12903CB102B,3837070,"Work, Work, Work (Pub, Club, Sleep)",The Rakes,Rock,0
412,TRADCNP12903CFDBFA,8115972,Little Heartwrecker,Dierks Bentley,Country,0
413,TRADDAQ12903CBBEA4,9977714,Invisible,The Letter Black,Rock,0
414,TRADDBB128F92DFBE5,7726015,Fall,The Saturdays,Pop,0


## Split dataframe into chunks

Let's split the dataframe into chunks of size 100.

In [39]:
len(no_lyric_data)

83192

In [40]:
first_83100 = no_lyric_data[:83100]
rest = no_lyric_data[83100:]

chunks = np.array_split(first_83100, 830)
chunks.append(rest)
chunks

[                msd_tid  mxm_tid                   title               artist  \
 0    TRAAAED128E0783FAB  2516445         It's About Time         Jamie Cullum   
 1    TRAAAEF128F4273421  3759847         Something Girls             Adam Ant   
 2    TRAAAGF12903CEC202  5493388                 Små ord    Halvdan Sivertsen   
 3    TRAAAHZ128E0799171  1619153        The One and Only           Snoop Dogg   
 4    TRAAARJ128F9320760  1422131              Pink World     Planet P Project   
 ..                  ...      ...                     ...                  ...   
 96   TRAARXB128F92FA0BF  2505393  Blackmail the Universe             Megadeth   
 97   TRAARXD12903D0CF24  1599155           Born to Booze  Black Label Society   
 98   TRAASHC128F427ED9F  5662996               Up to You           David Kitt   
 99   TRAASTP128F4294FD9  1653532             Lonely Mile       Rory Gallagher   
 100  TRAATIP12903CB1B56  9465406             Clear Skies                Keane   
 
      genre  i

## Fetch lyrics for each chunk, write to file

We'll use `swifter` to (hopefully) make the processing of the chunks faster with parallelization and other optimisations.

In [41]:
pip install swifter

Note: you may need to restart the kernel to use updated packages.


Create a folder where we write the chunks to:

In [42]:
lyric_chunks_folder_path = os.path.join(data_dir, "lyric_chunks")
if not os.path.exists(lyric_chunks_folder_path):
  os.mkdir(lyric_chunks_folder_path)

In [44]:
import swifter

for i, chunk in enumerate(chunks):
  print(f"processing chunk {i} (indices {chunk.index[0]}-{chunk.index[-1]})")
  lyrics = chunk.apply(get_lyrics, axis=1, result_type="expand")
  lyrics.to_json(os.path.join(lyric_chunks_folder_path, f"chunk_{i}_({chunk.index[0]}-{chunk.index[-1]}).json"), orient="index")

processing chunk 0 (indices 0-100)


Pandas Apply:   0%|          | 0/101 [00:00<?, ?it/s]

processing chunk 1 (indices 101-201)


Pandas Apply:   0%|          | 0/101 [00:00<?, ?it/s]

processing chunk 2 (indices 202-302)


Pandas Apply:   0%|          | 0/101 [00:00<?, ?it/s]

Timeout: Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)