<a href="https://colab.research.google.com/github/Tarek-1/Music_Recommender_System/blob/main/Spotify_API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Project Purpose

The purpose of this notebook is to enhance two Spotify Audio Features datasets obtained from Kaggle (April 2019 and November 2018).  
I am working on adding lyrics to the tracks, followed by additional columns using the Spotify API.  
This enriched dataset will be used later to build a music recommendation system.


In [None]:
!pip install spotipy
!pip install python-dotenv
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import time
from urllib.parse import quote

import pandas as pd
import requests
import spotipy
from dotenv import load_dotenv
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
from tqdm import tqdm

# Reading Datasets

In [None]:
# Load the two Kaggle "Spotify Audio Features" snapshots from the mounted Drive.
# pandas is already imported as `pd` in the imports cell, so it is not
# re-imported here.
DATA_DIR = "/content/drive/MyDrive"  # folder that holds both CSV snapshots

spotify_april_2019 = pd.read_csv(f"{DATA_DIR}/SpotifyAudioFeaturesApril2019.csv")
spotify_nov_2018 = pd.read_csv(f"{DATA_DIR}/SpotifyAudioFeaturesNov2018.csv")

# Report (rows, columns) for each snapshot.
print("April 2019 shape:", spotify_april_2019.shape)
print("Nov 2018 shape:", spotify_nov_2018.shape)

April 2019 shape: (130663, 17)
Nov 2018 shape: (116372, 17)


In [None]:
# Column names that occur in both snapshots; the displayed count shows how
# many columns the two frames share.
common_cols = set(spotify_april_2019.columns).intersection(spotify_nov_2018.columns)
len(common_cols)

17

# Both datasets have the same columns, so we will concatenate them into one big dataset

In [None]:
# Stack the two snapshots row-wise (April 2019 first, then November 2018)
# and renumber the index from 0.
df_big = pd.concat(
    objs=[spotify_april_2019, spotify_nov_2018],
    ignore_index=True,
)

In [None]:
# Preview the first rows of the combined frame.
df_big.head()

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
0,YG,2RM4jf1Xa9zPgMGRDiht8O,"Big Bank feat. 2 Chainz, Big Sean, Nicki Minaj",0.00582,0.743,238373,0.339,0.0,1,0.0812,-7.678,1,0.409,203.927,4,0.118,15
1,YG,1tHDG53xJNGsItRA3vfVgs,BAND DRUM (feat. A$AP Rocky),0.0244,0.846,214800,0.557,0.0,8,0.286,-7.259,1,0.457,159.009,4,0.371,0
2,R3HAB,6Wosx2euFPMT14UXiWudMy,Radio Silence,0.025,0.603,138913,0.723,0.0,9,0.0824,-5.89,0,0.0454,114.966,4,0.382,56
3,Chris Cooq,3J2Jpw61sO7l6Hc7qdYV91,Lactose,0.0294,0.8,125381,0.579,0.912,5,0.0994,-12.118,0,0.0701,123.003,4,0.641,0
4,Chris Cooq,2jbYvQCyPgX3CdmAzeVeuS,Same - Original mix,3.5e-05,0.783,124016,0.792,0.878,7,0.0332,-10.277,1,0.0661,120.047,4,0.928,0


In [None]:
# (rows, columns) of the combined frame.
df_big.shape

(247035, 17)

# Checking for duplicates

I looked for duplicates based only on the `track_id` column. This allowed me to identify tracks that appear multiple times even if other columns slightly differ.

As an example, the track ID `'2kFO8PgGgB308QNqsLcov1'` appears four times. All instances have the same audio-feature values; they differ only in the capitalization of `artist_name` ("I Do" vs. "I DO") and in minor differences in the `popularity` column.

Next, I will handle such cases by retaining a single row per `track_id` — the one with the highest `popularity` value — and later I will try to update the `popularity` column using the Spotify API if possible.

In [None]:
# Number of rows whose track_id already appeared in an earlier row.
df_big['track_id'].duplicated().sum()

np.int64(116046)

In [None]:
# Preview rows that repeat a track_id seen earlier in the frame.
df_big.loc[df_big['track_id'].duplicated()].head()

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
1151,I DO,2kFO8PgGgB308QNqsLcov1,Woke Up In Kingston,0.119,0.69,175136,0.694,0.0,4,0.075,-4.288,0,0.318,131.666,5,0.318,38
1574,Courtney Barnett,4U8JFGd4EZ6w42A6CU1fDj,Crippling Self Doubt And A General Lack Of Sel...,3.1e-05,0.552,168360,0.726,3e-06,9,0.0642,-5.056,1,0.027,127.084,3,0.735,41
1936,Nick Grant,4uefRUWMKRD1utYD97uYyN,Black Woman (feat. Stacy Barthe),0.75,0.483,256153,0.75,0.0,1,0.144,-6.492,1,0.439,79.792,4,0.34,32
2683,Jazz Cartier,6HKmRDdB3jMYRrd8TfFsKM,GODFLOWER,0.0269,0.646,180227,0.823,2.5e-05,10,0.111,-4.845,0,0.0445,74.993,4,0.325,42
3344,Kygo,0zhBS3T33cnJUwfoOEmlCL,Kids in Love - The Him Remix,0.104,0.532,195920,0.723,9e-06,0,0.114,-4.691,1,0.0449,125.071,4,0.181,48


In [None]:
# Show every row recorded for one example track_id.
df_big.loc[df_big['track_id'].eq('2kFO8PgGgB308QNqsLcov1')]

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
463,I Do,2kFO8PgGgB308QNqsLcov1,Woke Up In Kingston,0.119,0.69,175136,0.694,0.0,4,0.075,-4.288,0,0.318,131.666,5,0.318,38
1151,I DO,2kFO8PgGgB308QNqsLcov1,Woke Up In Kingston,0.119,0.69,175136,0.694,0.0,4,0.075,-4.288,0,0.318,131.666,5,0.318,38
131137,I Do,2kFO8PgGgB308QNqsLcov1,Woke Up In Kingston,0.119,0.69,175136,0.694,0.0,4,0.075,-4.288,0,0.318,131.666,5,0.318,37
131858,I DO,2kFO8PgGgB308QNqsLcov1,Woke Up In Kingston,0.119,0.69,175136,0.694,0.0,4,0.075,-4.288,0,0.318,131.666,5,0.318,37


In [None]:
# Keep exactly one row per track_id: the one with the highest popularity.
# kind='stable' makes the choice reproducible when duplicates share the same
# popularity: tied rows keep their df_big order, so the earliest row is kept.
# The default sort algorithm gives no such ordering guarantee.
df_big_deduped = (
    df_big
    .sort_values('popularity', ascending=False, kind='stable')
    .drop_duplicates(subset='track_id')
    .reset_index(drop=True)
)

# Sanity check: number of track_ids still duplicated after deduplication.
df_big_deduped.duplicated(subset='track_id').sum()

np.int64(0)

# Checking null values

In [None]:
# Count missing values per column.
df_big_deduped.isna().sum()

Unnamed: 0,0
artist_name,0
track_id,0
track_name,1
acousticness,0
danceability,0
duration_ms,0
energy,0
instrumentalness,0
key,0
liveness,0


Only one record has a missing value (in `track_name`), so we can safely drop it.

In [None]:
# Keep only rows that have a track_name, then renumber the index from 0.
has_track_name = df_big_deduped['track_name'].notna()
df_big_deduped = df_big_deduped.loc[has_track_name].reset_index(drop=True)

# Re-check the per-column missing-value counts.
df_big_deduped.isna().sum()

Unnamed: 0,0
artist_name,0
track_id,0
track_name,0
acousticness,0
danceability,0
duration_ms,0
energy,0
instrumentalness,0
key,0
liveness,0


# Get lyrics for all the songs

In [None]:
output_path = "/content/drive/MyDrive/all_song_lyrics_progress.csv"

# Resume support: start from an empty progress table when no progress file
# exists yet; otherwise load it and remember which (artist, track) pairs are
# already stored so they can be skipped.
if not os.path.exists(output_path):
    done_df = pd.DataFrame(columns=["artist", "track", "lyrics"])
    done_set = set()
else:
    done_df = pd.read_csv(output_path)
    done_set = {
        (saved_artist, saved_track)
        for saved_artist, saved_track in zip(done_df["artist"], done_df["track"])
    }

def get_lyrics(artist, title, timeout=10):
    """Fetch the lyrics of one song from the lyrics.ovh API.

    Args:
        artist: Artist name; converted to ``str`` and percent-encoded.
        title: Track title; converted to ``str`` and percent-encoded.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A string in every case (nothing is raised to the caller):
        - the value of the ``"lyrics"`` field on an HTTP 200 response, or
          ``"Lyrics not found."`` if that field is absent;
        - ``"Error <status_code>"`` for any other HTTP status;
        - the exception message if the request or JSON decoding fails.
    """
    # Encode each name as a single path segment. safe="" also escapes "/",
    # so characters such as "/", "?" or "#" inside a name (e.g. "AC/DC")
    # cannot change the structure of the URL.
    artist_segment = quote(str(artist), safe="")
    title_segment = quote(str(title), safe="")
    url = f"https://api.lyrics.ovh/v1/{artist_segment}/{title_segment}"
    try:
        # Without a timeout a single stalled connection would block the
        # whole fetch loop indefinitely.
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.json().get("lyrics", "Lyrics not found.")
        else:
            return f"Error {response.status_code}"
    except Exception as e:
        # Best-effort by design: report the failure as text so the caller
        # can record it and move on to the next song.
        return str(e)

# Fetch lyrics for every (artist, track) pair that is not stored yet and
# checkpoint each result to Drive, so an interrupted run can resume later.
csv_columns = list(done_df.columns)             # keep appended rows aligned with the file's columns
write_header = not os.path.exists(output_path)  # write the header only when creating the file
new_rows = []                                   # rows fetched during this run

try:
    for i, row in df_big_deduped.iterrows():
        artist = row["artist_name"]
        track = row["track_name"]
        key = (artist, track)

        if key in done_set:
            continue  # Skip already processed

        lyrics = get_lyrics(artist, track)
        record = {"artist": artist, "track": track, "lyrics": lyrics}

        # Save progress after each song by appending just the new row;
        # rewriting the whole file every iteration makes the cost grow
        # with the number of songs already saved.
        pd.DataFrame([record]).reindex(columns=csv_columns).to_csv(
            output_path, mode="a", header=write_header, index=False
        )
        write_header = False
        new_rows.append(record)

        # Remember the pair so a later row with the same artist and title
        # (different track_id) is not fetched again within this run; a
        # resumed run would skip it anyway.
        done_set.add(key)

        print(f"[{i+1}] Saved: {artist} - {track}")
        time.sleep(1)
finally:
    # Runs on normal completion and on interruption, so done_df matches what
    # has been written to disk.
    if new_rows:
        done_df = pd.concat([done_df, pd.DataFrame(new_rows)], ignore_index=True)

# Utilizing Spotify API


In [None]:
# load_dotenv("/content/drive/MyDrive/.env.txt")

# CLIENT_ID = os.getenv("SPOTIPY_CLIENT_ID")
# CLIENT_SECRET = os.getenv("SPOTIPY_CLIENT_SECRET")
# REDIRECT_URI = os.getenv("SPOTIPY_REDIRECT_URI")

In [None]:
# sp = spotipy.Spotify(auth_manager=SpotifyOAuth(
#     client_id=CLIENT_ID,
#     client_secret=CLIENT_SECRET,
#     redirect_uri=REDIRECT_URI,
# ))

In [None]:
# progress_path = "/content/drive/MyDrive/spotify_metadata_progress.csv"

# # Get track IDs
# track_ids = df_big_deduped['track_id'].dropna().tolist()

# # Resume from Drive if exists
# if os.path.exists(progress_path):
#     df_meta = pd.read_csv(progress_path)
#     processed_ids = set(df_meta['track_id'].tolist())
#     print(f" Resuming: {len(processed_ids)} tracks already processed.")
# else:
#     df_meta = pd.DataFrame()
#     processed_ids = set()
#     print(" Starting fresh.")

# metadata = []
# batch_size = 100

# for i, track_id in enumerate(tqdm(track_ids), 1):
#     if track_id in processed_ids:
#         continue

#     try:
#         track = sp.track(track_id)
#         album = sp.album(track['album']['id'])
#         artist = sp.artist(track['artists'][0]['id'])

#         meta = {
#             'track_id': track_id,
#             'track_name': track.get('name'),
#             'track_popularity': track.get('popularity'),
#             'track_number': track.get('track_number'),
#             'artist_ids': '; '.join([a['id'] for a in track['artists']]),
#             'artist_names': '; '.join([a['name'] for a in track['artists']]),
#             'artist_popularity': artist.get('popularity'),
#             'artist_genres': '; '.join(artist.get('genres', [])),

#             'album_id': album['id'],
#             'album_name': album.get('name'),
#             'album_release_date': album.get('release_date'),
#             'album_type': album.get('album_type'),

#             'isrc': track.get('external_ids', {}).get('isrc'),
#             'track_spotify_url': track['external_urls'].get('spotify')
#         }

#         metadata.append(meta)
#         time.sleep(0.1)  # API rate protection

#     except Exception as e:
#         print(f" Error on track {track_id}: {e}")
#         continue  # Move to the next track in case of an error

#     # Save every batch
#     if len(metadata) >= batch_size:
#         df_new = pd.DataFrame(metadata)
#         df_meta = pd.concat([df_meta, df_new], ignore_index=True).drop_duplicates(subset='track_id')
#         df_meta.to_csv(progress_path, index=False)
#         print(f" Saved {len(df_meta)} total tracks to Drive.")
#         metadata = []

# # Final save
# if metadata:
#     df_new = pd.DataFrame(metadata)
#     df_meta = pd.concat([df_meta, df_new], ignore_index=True).drop_duplicates(subset='track_id')
#     df_meta.to_csv(progress_path, index=False)
#     print(f" Final save complete: {len(df_meta)} tracks saved.")