In this notebook, the songs that were not covered in the first notebook are acquired by searching on the ISRC, a non-proprietary song identifier. This second strategy is much preferred; however, once we had acquired the data, we stopped refining the acquisition process. :)


In [None]:
import pandas as pd
import pprint

In [None]:
# Mount Google Drive at /content/drive; every path used below lives under
# /content/drive/MyDrive. This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Working track table; its 'title'/'artist' columns are the merge keys below and
# its 'Unnamed: 0' column is renamed to 'spotify_track_id' before the merge.
working_tracks_df = pd.read_csv('/content/drive/MyDrive/2024_11_14_working_tracks_df.csv')

In [None]:
# Source list of songs ('Track', 'Artist', 'ISRC', ...). The file is read as
# ISO-8859-1 (Latin-1) rather than pandas' default UTF-8.
original_tracks_list_df =  pd.read_csv('/content/drive/MyDrive/Most Streamed Spotify Songs 2024.csv',encoding='ISO-8859-1')

In [None]:
# The 'Unnamed: 0' column of the working table is treated as the Spotify track ID.
working_tracks_df.rename(columns={'Unnamed: 0': 'spotify_track_id'}, inplace=True)

# Left join: keep every song from the original list and attach the working-table
# columns where title and artist both match. Songs without a match end up with a
# null 'spotify_track_id'.
df_spotify = original_tracks_list_df.merge(
    working_tracks_df,
    how='left',
    left_on=['Track', 'Artist'],
    right_on=['title', 'artist'],
)

In [None]:
# Sanity check: how many merged rows share each ISRC (a count above 1 means the
# same ISRC appears on more than one row of df_spotify).
df_spotify['ISRC'].value_counts()

Unnamed: 0_level_0,count
ISRC,Unnamed: 1_level_1
USWL11700269,2
TCAGJ2289254,2
QM24S2402528,1
USRN11900133,1
USSD12300585,1
...,...
TCJPT2289709,1
USWL12302932,1
USUG11903492,1
GBKPL2163189,1


In [None]:
import requests
import base64
import time

# Read client secret and ID from files (kept on Drive so they never appear in the notebook)
with open('/content/drive/MyDrive/rs_client_secret.txt', 'r') as file:
    client_secret = file.read().strip()

with open('/content/drive/MyDrive/rs_client_id.txt', 'r') as file:
    client_id = file.read().strip()

# Encode authorization string: base64("<client_id>:<client_secret>") for HTTP Basic auth
auth_str = f"{client_id}:{client_secret}"
b64_auth_str = base64.b64encode(auth_str.encode()).decode()

# Get access token via the client-credentials grant
auth_headers = {
    "Authorization": f"Basic {b64_auth_str}"
}
auth_data = {
    "grant_type": "client_credentials"
}
auth_response = requests.post(
    "https://accounts.spotify.com/api/token",
    headers=auth_headers,
    data=auth_data,
    timeout=30,  # do not hang the notebook forever on a stalled connection
)
# Surface bad credentials / server errors as an HTTPError here instead of an
# opaque KeyError on 'access_token' on the next line.
auth_response.raise_for_status()
access_token = auth_response.json()['access_token']

# Set headers for subsequent requests
headers = {
    'Authorization': f'Bearer {access_token}'
}

# Define a function to search Spotify by ISRC codes
def search_tracks_by_isrc(isrc_list, pause_seconds=1, max_attempts=3, timeout=30):
    """Look up one Spotify track per ISRC through the Search endpoint.

    Parameters
    ----------
    isrc_list : iterable
        ISRC codes to search for. Entries that are not non-empty strings
        (NaN from a pandas column, None, '') are skipped without a request.
    pause_seconds : float, default 1
        Pause after every request, to stay under the API rate limit.
    max_attempts : int, default 3
        Maximum number of requests for one ISRC when the API answers 429.
    timeout : float, default 30
        Per-request timeout in seconds passed to ``requests.get``.

    Returns
    -------
    dict
        Maps each ISRC that produced at least one hit to the first track
        object in the response. ISRCs with no hit, an error response, or a
        failed request are absent from the dict.

    Notes
    -----
    Reads the notebook-level ``headers`` dict for the bearer token.
    """
    isrc_codes = list(isrc_list)  # materialise once so any iterable works with len()
    total = len(isrc_codes)
    attempts = max(1, max_attempts)
    track_results_by_isrc = {}

    for n, isrc in enumerate(isrc_codes, start=1):
        # A missing ISRC would otherwise be sent to the API as the text "nan"/"None".
        if not isinstance(isrc, str) or not isrc.strip():
            print(f"Skipping invalid ISRC {isrc!r} ({n}/{total})")
            continue

        print(f"Processing ISRC {isrc} ({n}/{total})")

        response = None
        for attempt in range(1, attempts + 1):
            try:
                response = requests.get(
                    'https://api.spotify.com/v1/search',
                    headers=headers,
                    # Let requests URL-encode the query instead of pasting it into the URL.
                    params={'q': f'isrc:{isrc}', 'type': 'track'},
                    timeout=timeout,
                )
            except requests.RequestException as exc:
                # One network hiccup must not discard the results collected so far.
                print(f"Request failed for ISRC {isrc}: {exc}")
                response = None
                break

            if response.status_code != 429 or attempt == attempts:
                break

            # Rate limited: wait as long as the server asks, then try again.
            retry_after = response.headers.get('Retry-After', '')
            wait_seconds = int(retry_after) if retry_after.isdigit() else pause_seconds
            print(f"Rate limited on ISRC {isrc}; retrying in {wait_seconds}s")
            time.sleep(wait_seconds)

        if response is not None:
            if response.status_code == 200:
                results = response.json().get('tracks', {}).get('items', [])
                if results:
                    track_results_by_isrc[isrc] = results[0]  # Assume the first result is the most relevant
                    print(f"Retrieved track for ISRC {isrc}")
                else:
                    print(f"No track found for ISRC {isrc}")
            else:
                # Use the raw text: error bodies are not guaranteed to be JSON.
                print(f"Error: {response.status_code} - {response.text}")

        # Sleep to avoid hitting rate limits
        time.sleep(pause_seconds)

    return track_results_by_isrc

# Search for every ISRC whose row got no Spotify track ID from the title/artist merge.
# .dropna() keeps missing ISRCs out of the search, and .unique() de-duplicates in
# first-seen order so the request order is the same on every run (a set is not).
missing_isrc_list = (
    df_spotify.loc[df_spotify['spotify_track_id'].isnull(), 'ISRC']
    .dropna()
    .unique()
    .tolist()
)
tracks_by_isrc = search_tracks_by_isrc(missing_isrc_list)


Processing ISRC USWL12301086 (1/383)
Retrieved track for ISRC USWL12301086
Processing ISRC USUM72201829 (2/383)
Retrieved track for ISRC USUM72201829
Processing ISRC USUYG1156898 (3/383)
Retrieved track for ISRC USUYG1156898
Processing ISRC USUM72402906 (4/383)
Retrieved track for ISRC USUM72402906
Processing ISRC USAT21102141 (5/383)
Retrieved track for ISRC USAT21102141
Processing ISRC USWB12301125 (6/383)
Retrieved track for ISRC USWB12301125
Processing ISRC THSOH2301617 (7/383)
Retrieved track for ISRC THSOH2301617
Processing ISRC ARIXB2303741 (8/383)
No track found for ISRC ARIXB2303741
Processing ISRC DEE861701287 (9/383)
No track found for ISRC DEE861701287
Processing ISRC BRWMB2400226 (10/383)
Retrieved track for ISRC BRWMB2400226
Processing ISRC QZ5AB2358733 (11/383)
Retrieved track for ISRC QZ5AB2358733
Processing ISRC USSD12300400 (12/383)
Retrieved track for ISRC USSD12300400
Processing ISRC SGB502208098 (13/383)
No track found for ISRC SGB502208098
Processing ISRC CYA11200

In [None]:
# Keep a raw snapshot of the search results on Drive.
# NOTE: this is the Python repr of the dict (not JSON).
serialized_tracks = str(tracks_by_isrc)
with open('/content/drive/MyDrive/tracks_by_isrc.txt', 'w') as snapshot_file:
    snapshot_file.write(serialized_tracks)

In [None]:
# Pair each ISRC with the Spotify ID of the track that was found for it.
new_track_ids = [
    (isrc_code, track['id'])
    for isrc_code, track in tracks_by_isrc.items()
]

In [None]:
# Two-column lookup table (ISRC -> Spotify track ID), saved without the index
# so the CSV has exactly these two columns.
new_tracks_df = pd.DataFrame(
    data=new_track_ids,
    columns=['ISRC', 'spotify_track_id'],
)
new_tracks_df.to_csv(
    '/content/drive/MyDrive/new_track_ids.csv',
    index=False,
)

In [None]:
# Fetch audio features for the track IDs that were found through the ISRC search.
headers = {
    'Authorization': f'Bearer {access_token}'
}

# Plain list of IDs; missing values are dropped because ','.join needs strings.
remaining_ids = new_tracks_df['spotify_track_id'].dropna().tolist()
audio_features_by_track={}
failed_track_ids = []  # IDs from batches that never came back with status 200
max_attempts = 3       # tries per batch when the API answers 429
# Chunk the list of IDs into batches of 50
chunk_size = 50
remaining_ids_chunks = [remaining_ids[i:i + chunk_size] for i in range(0, len(remaining_ids), chunk_size)]

for chunk in remaining_ids_chunks:
    # Join the chunk of IDs into a comma-separated string
    ids = ','.join(chunk)

    # Make the request to the Audio Features endpoint with the batch of IDs
    url = f'https://api.spotify.com/v1/audio-features?ids={ids}'
    response = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException as exc:
            # A network failure should cost one batch, not the whole run.
            print(f"Request failed: {exc}")
            response = None
            break

        if response.status_code != 429 or attempt == max_attempts:
            break

        # Rate limited: wait as long as the server asks, then retry this batch.
        retry_after = response.headers.get('Retry-After', '')
        wait_seconds = int(retry_after) if retry_after.isdigit() else 1
        print(f"Rate limited; retrying in {wait_seconds}s")
        time.sleep(wait_seconds)

    # Parse the JSON response
    if response is not None and response.status_code == 200:
        audio_features_list = response.json().get('audio_features', [])
        for audio_features in audio_features_list:
            if audio_features:  # Some entries might be None if the ID is invalid
                track_id = audio_features['id']
                audio_features_by_track[track_id] = audio_features
                print(f"Retrieved audio features for Track ID {track_id}")
    else:
        # Remember what was lost so the gap is visible and can be re-fetched.
        failed_track_ids.extend(chunk)
        if response is not None:
            # Raw text: error bodies are not guaranteed to be JSON.
            print(f"Error: {response.status_code} - {response.text}")

    # Sleep to avoid hitting rate limits
    time.sleep(1)  # Adjust as needed

if failed_track_ids:
    print(f"{len(failed_track_ids)} track IDs were not fetched; see failed_track_ids")

Retrieved audio features for Track ID 3oNnzH6hmqIGIhJ1NcHlrh
Retrieved audio features for Track ID 6DoL1yYIwEW7VZMRaJhoJI
Retrieved audio features for Track ID 7AQim7LbvFVZJE3O8TYgf2
Retrieved audio features for Track ID 72t57kCoFddPNDRMKTIuNL
Retrieved audio features for Track ID 6lanRgr6wXibZr8KgzXxBl
Retrieved audio features for Track ID 3vcLw8QA3yCOkrj9oLSZNs
Retrieved audio features for Track ID 2Cat5w7hj4K9I8Y00jkQ0U
Retrieved audio features for Track ID 6xW61JP2f3hcsbOwn7dzi5
Retrieved audio features for Track ID 2Ul4zL7sooJUmWrA8n6vlE
Retrieved audio features for Track ID 4bw8mcDUSRWfQo63ZTYRnU
Retrieved audio features for Track ID 3j11iDncb7ZeDMw7lFucqM
Retrieved audio features for Track ID 36t6frENUtCYKuZus6aYDO
Retrieved audio features for Track ID 3CeCwYWvdfXbZLXFhBrbnf
Retrieved audio features for Track ID 35FFqjqaPv1Fr9B1GtJdZO
Retrieved audio features for Track ID 3Gpffv3gaD1UxQPeElIjCp
Retrieved audio features for Track ID 5v4GgrXPMghOnBBLmveLac
Retrieved audio features

In [None]:
# The dict is keyed by track ID, so the raw frame has one column per track;
# transpose it to get one row per track, then save it (index = track ID).
afdf = pd.DataFrame(audio_features_by_track).transpose()

afdf.to_csv(
    '/content/drive/MyDrive/audio_features_by_track_missing_ids.csv',
)

In [None]:
# Rather than patching in only the missing tracks, re-fetch the audio features from scratch for the full, combined list of track IDs.

In [None]:
# Re-read the working table from disk; this discards the in-memory column rename
# done earlier, so the ID column is 'Unnamed: 0' again.
working_tracks_df = pd.read_csv('/content/drive/MyDrive/2024_11_14_working_tracks_df.csv')

In [None]:
# ISRC -> Spotify track ID table written earlier in this notebook (new_track_ids.csv).
additional_tracks_df = pd.read_csv('/content/drive/MyDrive/new_track_ids.csv')

In [None]:
# Original song list, re-read only to compare its size with the number of track IDs.
original_tracks_list_df =  pd.read_csv('/content/drive/MyDrive/Most Streamed Spotify Songs 2024.csv',encoding='ISO-8859-1')

In [None]:
# Combine the IDs found through the ISRC search with the IDs in the working table.
# dropna() removes missing IDs (they cannot be joined into a request string), and
# drop_duplicates() keeps first-seen order, so batches are identical on every run
# (a set's iteration order is not).
all_track_ids = (
    pd.concat(
        [additional_tracks_df['spotify_track_id'], working_tracks_df['Unnamed: 0']],
        ignore_index=True,
    )
    .dropna()
    .drop_duplicates()
    .tolist()
)
# Compare the number of unique track IDs with the size of the original song list.
len(all_track_ids),original_tracks_list_df.shape

(4415, (4600, 29))

In [None]:
# Fetch audio features for the full, combined list of track IDs.
headers = {
    'Authorization': f'Bearer {access_token}'
}

# Only string IDs can be joined into the request; anything else (e.g. NaN) is dropped.
remaining_ids = [track_id for track_id in all_track_ids if isinstance(track_id, str)]
audio_features_by_track={}
failed_track_ids = []  # IDs from batches that never came back with status 200
max_attempts = 3       # tries per batch when the API answers 429
# Chunk the list of IDs into batches of 50
chunk_size = 50
remaining_ids_chunks = [remaining_ids[i:i + chunk_size] for i in range(0, len(remaining_ids), chunk_size)]

for chunk in remaining_ids_chunks:
    # Join the chunk of IDs into a comma-separated string
    ids = ','.join(chunk)

    # Make the request to the Audio Features endpoint with the batch of IDs
    url = f'https://api.spotify.com/v1/audio-features?ids={ids}'
    response = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException as exc:
            # A network failure should cost one batch, not the whole run.
            print(f"Request failed: {exc}")
            response = None
            break

        if response.status_code != 429 or attempt == max_attempts:
            break

        # Rate limited: wait as long as the server asks, then retry this batch.
        retry_after = response.headers.get('Retry-After', '')
        wait_seconds = int(retry_after) if retry_after.isdigit() else 1
        print(f"Rate limited; retrying in {wait_seconds}s")
        time.sleep(wait_seconds)

    # Parse the JSON response
    if response is not None and response.status_code == 200:
        audio_features_list = response.json().get('audio_features', [])
        for audio_features in audio_features_list:
            if audio_features:  # Some entries might be None if the ID is invalid
                track_id = audio_features['id']
                audio_features_by_track[track_id] = audio_features
                print(f"Retrieved audio features for Track ID {track_id}")
    else:
        # Remember what was lost so the gap is visible and can be re-fetched.
        failed_track_ids.extend(chunk)
        if response is not None:
            # Raw text: error bodies are not guaranteed to be JSON.
            print(f"Error: {response.status_code} - {response.text}")

    # Sleep to avoid hitting rate limits
    time.sleep(1)  # Adjust as needed

if failed_track_ids:
    print(f"{len(failed_track_ids)} track IDs were not fetched; see failed_track_ids")

Retrieved audio features for Track ID 1fidCEsYlaVE3pHwKCvpFZ
Retrieved audio features for Track ID 61JrU3c22ur5C2TREkHkm3
Retrieved audio features for Track ID 1SC5rEoYDGUK4NfG82494W
Retrieved audio features for Track ID 1XDPh9pRlOsMf5iLMakSqT
Retrieved audio features for Track ID 44gRhRi2OhEf7moAUj6MD1
Retrieved audio features for Track ID 3elpkpwYlmUbuvmVps08lJ
Retrieved audio features for Track ID 0kTMK4gNFfLXaTb62w1UaJ
Retrieved audio features for Track ID 3dGLEQM2mDkHwtA37F4bPx
Retrieved audio features for Track ID 68Dni7IE4VyPkTOH9mRWHr
Retrieved audio features for Track ID 4RAR8g8fZNB106ezUurnE0
Retrieved audio features for Track ID 2Qzt61URbXBb9N2MSVDoDv
Retrieved audio features for Track ID 3KCNiDi9Pza6ZD8FggNoaw
Retrieved audio features for Track ID 5sfq1jt5iX3Xxp2S97SzdK
Retrieved audio features for Track ID 02wk5BttM0QL38ERjLPQJB
Retrieved audio features for Track ID 1p0rEzrK7YtdRZVtiyV7RN
Retrieved audio features for Track ID 5W4kiM2cUYBJXKRudNyxjW
Retrieved audio features

In [None]:
# Keep a raw snapshot of the fetched audio features on Drive.
# NOTE: this is the Python repr of the dict (not JSON).
serialized_features = str(audio_features_by_track)
with open('/content/drive/MyDrive/audio_features_by_track_all_ids.txt', 'w') as snapshot_file:
    snapshot_file.write(serialized_features)

In [None]:
# The dict is keyed by track ID (one column per track); transpose to one row per track.
audio_features_df = pd.DataFrame(audio_features_by_track).transpose()

In [None]:
# Save the audio-features table; the index (track ID) is written as the first, unnamed column.
audio_features_df.to_csv('/content/drive/MyDrive/audio_features_df_2024_11_26.csv')