# Fetching Music Data with Spotipy and Lyricsgenius

## Import libraries

In [10]:
import spotipy
import pandas as pd
import lyricsgenius as lg
import re
from spotipy.oauth2 import SpotifyClientCredentials

## Getting Song Information from the Spotify API

In [11]:
# Build the client-credentials auth manager from your developer-account keys,
# then hand it to the Spotify client used by the following cells.
credentials_manager = SpotifyClientCredentials(
    client_id="CLIENT ID",
    client_secret="CLIENT SECRET",
)
sp = spotipy.Spotify(auth_manager=credentials_manager)

In [12]:
# Search for Radiohead's artist information
artist_name = "Radiohead"
results = sp.search(q=artist_name, type='artist')
artist_matches = results['artists']['items']
if not artist_matches:
    # Fail with a clear message instead of an IndexError on an empty result list
    raise ValueError(f"No Spotify artist found for {artist_name!r}")
artist = artist_matches[0]  # Assuming Radiohead is the first result
artist_id = artist['id']

# Keys read from each audio-features record, in the order of the DataFrame columns
AUDIO_FEATURE_KEYS = ['valence', 'energy', 'danceability', 'acousticness',
                      'instrumentalness', 'speechiness', 'liveness', 'tempo',
                      'key', 'mode', 'time_signature']

# The audio-features endpoint accepts a limited number of ids per request
AUDIO_FEATURES_BATCH_SIZE = 100


def _all_items(page):
    """Return the items of every page of a paginated Spotify response.

    The first page alone may not hold every album or track, so the 'next'
    links are followed until there are none left.
    """
    items = []
    while page:
        items.extend(page['items'])
        page = sp.next(page) if page.get('next') else None
    return items


# Create an empty list to store the data for each track
data = []

# Retrieve the list of albums by Radiohead (first page; the rest is paged in below)
albums = sp.artist_albums(artist_id, album_type='album')

# Loop through the albums and retrieve audio features for each track
for album in _all_items(albums):
    album_name = album['name']
    album_id = album['id']
    # The release year is the same for every track of the album, so compute it once
    release_year = album['release_date'].split('-')[0]
    album_tracks = _all_items(sp.album_tracks(album_id))

    # Request the audio features in batches rather than one call per track
    for start in range(0, len(album_tracks), AUDIO_FEATURES_BATCH_SIZE):
        batch = album_tracks[start:start + AUDIO_FEATURES_BATCH_SIZE]
        batch_features = sp.audio_features([track['id'] for track in batch]) or []

        for track, audio_feature_data in zip(batch, batch_features):
            # A track without audio features comes back as None; skip it
            # instead of failing on the key lookups below
            if not audio_feature_data:
                continue

            # Append the data for each track to the list
            data.append(
                [artist_name, album_name, track['name'], track['track_number'],
                 track['duration_ms'], release_year]
                + [audio_feature_data[key] for key in AUDIO_FEATURE_KEYS]
            )

# Create a DataFrame from the collected data
df = pd.DataFrame(data, columns=['Artist', 'Album', 'Track', 'TrackNumber_On_Album', 'Duration_ms', 'ReleaseYear', 'Valence', 'Energy', 'Danceability', 'Acousticness', 'Instrumentalness', 'Speechiness', 'Liveness', 'Tempo', 'Key', 'Mode', 'TimeSignature'])




In [13]:
# Filter out the studio albums: keep only rows whose 'Album' value exactly
# matches one of the names listed below.

radiohead_albums = [
    "Pablo Honey",
    "The Bends",
    "OK Computer",
    "Kid A",
    "Amnesiac",
    "Hail To the Thief",
    "In Rainbows",
    "The King Of Limbs",
    "A Moon Shaped Pool"
]

# Take an explicit copy of the filtered rows: columns are assigned to this
# frame later on, and assigning to a boolean-mask slice of another frame
# triggers pandas' SettingWithCopyWarning.
df = df[df['Album'].isin(radiohead_albums)].copy()

## Adding the lyrics from Genius Lyrics

In [14]:
# Use your API key from your developer setup
api_key = 'YOUR KEY'

# Set up the formatting of the lyrics: options passed to the Genius client
genius = lg.Genius(
    api_key,
    response_format='plain',
    remove_section_headers=True,
    excluded_terms=["(Remix)", "(Live)"],
)

In [15]:
#Function to fetch lyrics

def fetch_lyrics(title, artist):
    """Look up a song through the module-level Genius client.

    Returns the ``lyrics`` attribute of the search result, or ``None`` when
    the search yields nothing.
    """
    match = genius.search_song(title, artist)
    return match.lyrics if match else None

In [17]:
# Fetching lyrics for every song in the data frame


def _lyrics_for_row(row):
    """Fetch the lyrics for one row from its 'Track' and 'Artist' values."""
    return fetch_lyrics(row['Track'], row['Artist'])


df['Lyrics'] = df.apply(_lyrics_for_row, axis=1)


Searching for "Burn the Witch" by Radiohead...
Done.
Searching for "Daydreaming" by Radiohead...
Done.
Searching for "Decks Dark" by Radiohead...
Done.
Searching for "Desert Island Disk" by Radiohead...
Done.
Searching for "Ful Stop" by Radiohead...
Done.
Searching for "Glass Eyes" by Radiohead...
Done.
Searching for "Identikit" by Radiohead...
Done.
Searching for "The Numbers" by Radiohead...
Done.
Searching for "Present Tense" by Radiohead...
Done.
Searching for "Tinker Tailor Soldier Sailor Rich Man Poor Man Beggar Man Thief" by Radiohead...
Done.
Searching for "True Love Waits" by Radiohead...
Done.
Searching for "Bloom" by Radiohead...
Done.
Searching for "Morning Mr Magpie" by Radiohead...
Done.
Searching for "Little By Little" by Radiohead...
Done.
Searching for "Feral" by Radiohead...
Done.
Searching for "Lotus Flower" by Radiohead...
Done.
Searching for "Codex" by Radiohead...
Done.
Searching for "Give Up The Ghost" by Radiohead...
Done.
Searching for "Separator" by Radiohead.

In [19]:
# Checking for missing lyrics: compare the Non-Null Count that info() reports
# for the 'Lyrics' column with the total number of entries.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 101 entries, 57 to 192
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Artist                101 non-null    object 
 1   Album                 101 non-null    object 
 2   Track                 101 non-null    object 
 3   TrackNumber_On_Album  101 non-null    int64  
 4   Duration_ms           101 non-null    int64  
 5   ReleaseYear           101 non-null    object 
 6   Valence               101 non-null    float64
 7   Energy                101 non-null    float64
 8   Danceability          101 non-null    float64
 9   Acousticness          101 non-null    float64
 10  Instrumentalness      101 non-null    float64
 11  Speechiness           101 non-null    float64
 12  Liveness              101 non-null    float64
 13  Tempo                 101 non-null    float64
 14  Key                   101 non-null    int64  
 15  Mode                  101 n

In [20]:
# Show the rows with no lyrics, to confirm that these are indeed purely
# instrumental tracks
lyrics_missing = df['Lyrics'].isna()
df.loc[lyrics_missing]

Unnamed: 0,Artist,Album,Track,TrackNumber_On_Album,Duration_ms,ReleaseYear,Valence,Energy,Danceability,Acousticness,Instrumentalness,Speechiness,Liveness,Tempo,Key,Mode,TimeSignature,Lyrics
143,Radiohead,Amnesiac,Hunting Bears,9,121200,2001,0.0736,0.264,0.295,0.853,0.926,0.0432,0.0962,143.191,7,1,3,
150,Radiohead,Kid A,Treefingers,5,222600,2000,0.0577,0.146,0.165,0.827,0.887,0.0362,0.109,134.508,6,1,3,
156,Radiohead,Kid A,Untitled,11,52694,2000,0.0769,0.225,0.369,0.992,0.813,0.0654,0.106,64.655,7,1,3,


In [21]:
#Function to clean lyrics

# Header that precedes the lyric text in the fetched string
_LYRICS_MARKER = 'Lyrics\n'

# Trailing '<digits>embed' residue left at the end of the fetched text
_EMBED_SUFFIX_RE = re.compile(r'(\d{1,3}embed\b)')

# Everything that is not a word character, whitespace or a single quote
_PUNCTUATION_RE = re.compile(r'[^\w\s\']')

# Map typographic apostrophes to the plain single quote so contractions
# survive the punctuation stripping; drop zero-width spaces.
_CHAR_FIXES = str.maketrans({
    '\u2019': "'",
    '\u2018': "'",
    '\u00b4': "'",
    '\u200b': None,
})


def clean_lyrics(lyrics):
    """Normalise raw lyrics text into lowercase, punctuation-free words.

    Steps: drop everything up to and including the 'Lyrics\\n' header (when
    present), lowercase, turn newlines into spaces, remove the
    '<digits>embed' residue, zero-width spaces and the 'you might also like'
    insert, then strip all punctuation except the single quote.

    Args:
        lyrics: Raw lyrics string, or a missing value (None / NaN).

    Returns:
        The cleaned string, or None when ``lyrics`` is not a string.
    """
    # Missing values may arrive as None or as a float NaN; neither can be cleaned
    if not isinstance(lyrics, str):
        return None

    # Clean the text whether or not the header is present, so every row of
    # the dataset ends up in the same format
    header_pos = lyrics.find(_LYRICS_MARKER)
    body = lyrics if header_pos == -1 else lyrics[header_pos + len(_LYRICS_MARKER):]

    text = body.lower().replace('\n', ' ')
    text = _EMBED_SUFFIX_RE.sub('', text)
    text = text.translate(_CHAR_FIXES).replace('you might also like', '')

    # Apostrophes were normalised to "'" above, which this pattern keeps
    return _PUNCTUATION_RE.sub('', text)

In [22]:
# Run the cleaning function over every entry of the 'Lyrics' column
raw_lyrics = df['Lyrics']
df['Lyrics'] = raw_lyrics.apply(clean_lyrics)


In [23]:
# Write the data frame to a CSV file, leaving out the index column
output_path = 'radiohead_spotify_lyricsgenius.csv'
df.to_csv(output_path, index=False)