In [1]:
# Important file
# My dataset had songs but no album covers for them.
# This script fills that gap: it repeatedly calls the iTunes Search API to get song covers.
import pandas as pd
import requests
import time
from tqdm import tqdm
import ast
import os

# CSV read when no output file exists yet (fresh start).
INPUT_FILE = "../dataset/chunks/testing.csv"
# CSV that results are written to; if it already exists the run resumes from it.
OUTPUT_FILE = "songs_fetched_part_1-.csv"
# Number of rows processed between intermediate saves of OUTPUT_FILE.
CHECKPOINT_INTERVAL = 500
# Seconds to sleep after each fetched row before moving to the next one.
SLEEP_BETWEEN_REQUESTS = 1.0 

def parse_artist(artists_str):
    """Return the first artist name from an ``artists`` cell.

    The cell is expected to hold either a Python-list literal such as
    ``"['Queen', 'David Bowie']"`` or a plain artist name.

    Args:
        artists_str: Raw cell value (string, list, ``None`` or NaN).

    Returns:
        The first list element as a string, the value itself (stringified)
        when it is not a list literal, or ``""`` when the value is missing
        or is an empty list.
    """
    # An already-parsed list: pd.isna() on a list yields an array whose
    # truthiness is ambiguous, so handle it before the missing-value check.
    if isinstance(artists_str, list):
        return str(artists_str[0]) if artists_str else ""

    if not artists_str or pd.isna(artists_str):
        return ""

    try:
        parsed = ast.literal_eval(artists_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal (e.g. a plain name like "AC/DC"): use it as-is.
        return str(artists_str)

    if isinstance(parsed, list):
        # An empty list means "no artist"; do not leak the text "[]" into
        # the search term.
        return str(parsed[0]) if parsed else ""
    return str(artists_str)

def _retry_after_seconds(header_value, default):
    """Return a non-negative wait (seconds) from a ``Retry-After`` header value.

    Falls back to *default* when the header is absent or is not a plain
    integer number of seconds (for example the HTTP-date form).
    """
    try:
        return max(0, int(header_value))
    except (TypeError, ValueError):
        return default


def fetch_itunes_cover_persistent(session, song_name, artist_name):
    """Look up a song on the iTunes Search API and return its artwork URL.

    Persistent fetcher: it only returns after an HTTP 200 response and keeps
    retrying on every other outcome (429/403 with growing backoff, other
    status codes after 10s, exceptions after 30s).

    Args:
        session: Object with a ``get(url, params=..., timeout=...)`` method,
            e.g. a ``requests.Session``.
        song_name: Song title used in the search term.
        artist_name: Artist name appended to the search term.

    Returns:
        The ``artworkUrl100`` value of the first result, or the marker
        string ``"NOT_FOUND"`` when the search returned no results or the
        first result carries no artwork URL.
    """
    backoff_time = 30
    # The query does not change between retries, so build it once.
    params = {"term": f"{song_name} {artist_name}", "media": "music", "entity": "song", "limit": 1}

    while True:
        try:
            r = session.get("https://itunes.apple.com/search", params=params, timeout=15)

            if r.status_code == 200:
                results = r.json().get("results", [])
                artwork = results[0].get("artworkUrl100") if results else None
                # A missing/empty URL must also be marked as searched;
                # returning None would leave the row looking unprocessed.
                return artwork or "NOT_FOUND"

            elif r.status_code in [429, 403]:
                # Honour the server's wait time when it is given in seconds.
                wait = _retry_after_seconds(r.headers.get("Retry-After"), backoff_time)
                print(f"\n[Rate Limited] Status {r.status_code}. Waiting {wait}s...")
                time.sleep(wait)
                # Increase backoff for next time, max 10 minutes
                backoff_time = min(backoff_time * 2, 600)

            else:
                print(f"\n[Error] Status {r.status_code}. Retrying in 10s...")
                time.sleep(10)

        except Exception as e:
            # Deliberately broad: any failure (network, bad JSON body, ...)
            # is retried rather than aborting a long-running batch.
            print(f"\n[Connection Error] {e}. Retrying in 30s...")
            time.sleep(30)

# --- EXECUTION START ---

# Load data: resume from the output file if it exists, otherwise start fresh
# from the input file.
if os.path.exists(OUTPUT_FILE):
    df = pd.read_csv(OUTPUT_FILE)
    print(f"Resuming: Found {len(df)} total rows in {OUTPUT_FILE}.")
else:
    df = pd.read_csv(INPUT_FILE)
    print(f"Starting fresh: {len(df)} rows loaded.")

# Make sure the result column exists whichever file was loaded.
if "cover_url" not in df.columns:
    df["cover_url"] = None
# A column read back from CSV with only empty cells is float64; force object
# dtype so URL strings can be stored in it.
df["cover_url"] = df["cover_url"].astype(object)

# Identify remaining work: rows whose cover_url is still NaN.
# Rows marked "NOT_FOUND" were already searched and are skipped.
mask = df["cover_url"].isna()
indices_to_process = df[mask].index
print(f"Remaining rows to process: {len(indices_to_process)}")

try:
    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        for count, idx in enumerate(tqdm(indices_to_process, desc="Fetching covers")):
            row = df.loc[idx]
            raw_name = row.get("name", "")
            # A missing name would otherwise be stringified to "nan" and searched.
            song = "" if pd.isna(raw_name) else str(raw_name)
            artist = parse_artist(row.get("artists", ""))

            searched = bool(song.strip())
            if searched:
                # Blocks until the API gives a 200 response.
                cover = fetch_itunes_cover_persistent(session, song, artist)
            else:
                # No song title to search for: mark the row as done
                # without calling the API.
                cover = "NOT_FOUND"

            # Record the result before sleeping so an interrupt cannot lose it.
            df.at[idx, "cover_url"] = cover

            if searched:
                # Constant delay between successful hits
                time.sleep(SLEEP_BETWEEN_REQUESTS)

            # Periodic checkpoint so progress survives a crash.
            if (count + 1) % CHECKPOINT_INTERVAL == 0:
                df.to_csv(OUTPUT_FILE, index=False)
                print(f" - Checkpoint: Saved {count + 1} new rows to {OUTPUT_FILE}")

finally:
    # Final save when the loop finishes or upon hitting Ctrl+C
    df.to_csv(OUTPUT_FILE, index=False)
    print(f"\nSession finished. Total rows with data: {df['cover_url'].notna().sum()}")

Starting fresh: 1 rows loaded.
Remaining rows to process: 1


Fetching covers: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.49s/it]


Session finished. Total rows with data: 1



