In [None]:
import pandas as pd
import requests
import time
from tqdm import tqdm
import ast
import os

# CSV that is loaded when no previous output file exists.
INPUT_FILE = "../dataset/chunks/testing.csv"
# CSV that receives the results; if it already exists it is reloaded to resume.
OUTPUT_FILE = "songs_fetched_part_1.csv"
# Number of processed rows between intermediate saves of OUTPUT_FILE.
CHECKPOINT_INTERVAL = 100
# Seconds passed to time.sleep() after every processed row.
SLEEP_BETWEEN_REQUESTS = 1.0  # Base delay

def parse_artist(artists_str):
    """Return the first artist named in an ``artists`` cell.

    The cell may hold the text of a Python list literal such as
    ``"['Artist A', 'Artist B']"``, a plain name, or a missing value.

    Args:
        artists_str: Raw cell value. Strings are parsed with
            ``ast.literal_eval``; an already-parsed list/tuple is accepted too.

    Returns:
        The first element as ``str`` when the value is (or parses to) a
        non-empty list/tuple; ``""`` when the value is missing or the
        sequence is empty; otherwise ``str(artists_str)``.
    """
    # Already-parsed sequences: handle them before pd.isna(), which would
    # return an array (not a bool) for list input.
    if isinstance(artists_str, (list, tuple)):
        return str(artists_str[0]) if artists_str else ""

    # pd.isna() goes first because ``not pd.NA`` raises TypeError.
    if pd.isna(artists_str) or not artists_str:
        return ""

    try:
        parsed = ast.literal_eval(artists_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal (e.g. a plain artist name): use it verbatim.
        return str(artists_str)

    if isinstance(parsed, (list, tuple)):
        # An empty list means "no artist"; do not leak the text "[]" into
        # the search term.
        return str(parsed[0]) if parsed else ""
    return str(artists_str)

def fetch_itunes_cover_persistent(session, song_name, artist_name):
    """Query the iTunes search API for a cover, looping until it answers 200.

    Args:
        session: Object whose ``get(url, params=..., timeout=...)`` performs
            the HTTP request.
        song_name: Track title, placed first in the search term.
        artist_name: Artist name, appended to the search term.

    Returns:
        The ``artworkUrl100`` entry of the first result (``None`` if that
        entry is absent), or ``None`` when the 200 response lists no results.

    Status 429/403 sleeps for a delay that starts at 30s and doubles up to
    600s; any other non-200 status sleeps 10s; any exception sleeps 30s.
    The request is then repeated.
    """
    rate_limit_delay = 30  # seconds; grows on consecutive 429/403 answers

    while True:
        try:
            query = {
                "term": f"{song_name} {artist_name}",
                "media": "music",
                "entity": "song",
                "limit": 1,
            }
            response = session.get(
                "https://itunes.apple.com/search", params=query, timeout=15
            )
            status = response.status_code

            if status == 200:
                hits = response.json().get("results", [])
                # No hits at all means the search genuinely came back empty.
                return hits[0].get("artworkUrl100") if hits else None

            if status in (429, 403):
                print(f"\n[Rate Limited] Status {status}. Waiting {rate_limit_delay}s...")
                time.sleep(rate_limit_delay)
                # Double the wait for the next rate-limit hit, capped at 10 minutes.
                rate_limit_delay = min(rate_limit_delay * 2, 600)
                continue

            print(f"\n[Error] Status {status} for {song_name}. Retrying in 10s...")
            time.sleep(10)

        except Exception as err:
            print(f"\n[Connection Error] {err}. Retrying in 30s...")
            time.sleep(30)

def _save_atomically(frame, path):
    """Write ``frame`` to ``path`` through a temp file and ``os.replace``.

    An interrupted save then leaves the previous file in place instead of a
    half-written CSV.
    """
    tmp_path = f"{path}.tmp"
    frame.to_csv(tmp_path, index=False)
    os.replace(tmp_path, path)


# Load progress: resume from the previous output when it exists, otherwise
# start from the input file with an empty cover_url column.
if os.path.exists(OUTPUT_FILE):
    df = pd.read_csv(OUTPUT_FILE)
    print(f"Resuming from existing output file with {len(df)} rows.")
    if "cover_url" not in df.columns:
        # An output file without the column would otherwise raise KeyError below.
        df["cover_url"] = None
else:
    df = pd.read_csv(INPUT_FILE)
    df["cover_url"] = None

# A column that is entirely NaN is read back as float64; force object dtype so
# that URL strings can be assigned without a dtype conflict.
df["cover_url"] = df["cover_url"].astype(object)

# Process only rows where cover_url is still None/NaN
mask = df["cover_url"].isna()
indices_to_process = df[mask].index

print(f"Total rows to process: {len(indices_to_process)}")

# The context manager closes the session's pooled connections on exit.
with requests.Session() as session:
    try:
        for count, idx in enumerate(tqdm(indices_to_process, desc="Fetching covers")):
            row = df.loc[idx]
            raw_name = row.get("name")
            # A missing title would otherwise be searched as the text "nan".
            song = "" if pd.isna(raw_name) else str(raw_name)
            artist = parse_artist(row.get("artists", ""))

            cover = fetch_itunes_cover_persistent(session, song, artist)
            # "NOT_FOUND" marks the row as searched so a resume skips it.
            df.at[idx, "cover_url"] = cover if cover else "NOT_FOUND"

            # Throttling
            time.sleep(SLEEP_BETWEEN_REQUESTS)

            # Periodic save
            if (count + 1) % CHECKPOINT_INTERVAL == 0:
                _save_atomically(df, OUTPUT_FILE)

    finally:
        # Final save on completion or manual stop
        _save_atomically(df, OUTPUT_FILE)
        print(f"\nProcess finished. Saved to {OUTPUT_FILE}")

In [3]:
import pandas as pd
import requests
import time
from tqdm import tqdm
import ast
import os

# CSV that is loaded when no previous output file exists.
INPUT_FILE = "../dataset/chunks/testing.csv"
# CSV that receives the results; if it already exists it is reloaded to resume.
OUTPUT_FILE = "songs_fetched_part_1.csv"
# Number of processed rows between intermediate saves of OUTPUT_FILE.
CHECKPOINT_INTERVAL = 500  # Updated to 500
# Seconds passed to time.sleep() after every processed row.
SLEEP_BETWEEN_REQUESTS = 1.0

def parse_artist(artists_str):
    """Extract the first artist from an ``artists`` cell value.

    Handles list strings like ``"['Artist Name']"`` as well as plain names
    and missing values.

    Args:
        artists_str: Raw cell value. Strings are parsed with
            ``ast.literal_eval``; an already-parsed list/tuple is accepted too.

    Returns:
        ``str`` of the first element when the value is (or parses to) a
        non-empty list/tuple; ``""`` for a missing value or an empty
        sequence; otherwise ``str(artists_str)``.
    """
    # Sequences are handled up front: pd.isna() returns an array for a list,
    # which has no single truth value.
    if isinstance(artists_str, (list, tuple)):
        return str(artists_str[0]) if artists_str else ""

    # Check pd.isna() before truthiness: ``not pd.NA`` raises TypeError.
    if pd.isna(artists_str) or not artists_str:
        return ""

    try:
        parsed = ast.literal_eval(artists_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # The text is not a Python literal, so treat it as a plain name.
        return str(artists_str)

    if isinstance(parsed, (list, tuple)):
        # "[]" means no artist; returning "" keeps "[]" out of the search term.
        return str(parsed[0]) if parsed else ""
    return str(artists_str)

def _retry_after_seconds(header_value, fallback):
    """Turn a ``Retry-After`` header value into a non-negative wait in seconds.

    The header may carry a number of seconds or an HTTP-date; ``fallback`` is
    returned when the header is absent or cannot be interpreted.
    """
    # Function-scope imports: the surrounding file does not import these.
    import math
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    if header_value is None:
        return fallback

    text = str(header_value).strip()
    try:
        seconds = float(text)
    except ValueError:
        try:
            retry_at = parsedate_to_datetime(text)
        except (TypeError, ValueError, IndexError):
            return fallback
        if retry_at.tzinfo is None:
            # A date without zone information is interpreted as UTC here.
            retry_at = retry_at.replace(tzinfo=timezone.utc)
        seconds = (retry_at - datetime.now(timezone.utc)).total_seconds()

    if not math.isfinite(seconds):
        return fallback
    # A date in the past yields a negative delta; never sleep a negative time.
    return max(0.0, seconds)


def fetch_itunes_cover_persistent(session, song_name, artist_name):
    """Fetch an iTunes artwork URL, retrying until the API answers with 200.

    Args:
        session: Object whose ``get(url, params=..., timeout=...)`` performs
            the HTTP request (the script passes a ``requests.Session``).
        song_name: Track title, placed first in the search term.
        artist_name: Artist name, appended to the search term.

    Returns:
        The ``artworkUrl100`` value of the first search result, or the string
        ``"NOT_FOUND"`` when the 200 response has no results or the first
        result carries no artwork URL.

    Retry policy: 429/403 waits for the ``Retry-After`` hint, or for a backoff
    that starts at 30s and doubles up to 600s when no usable hint is sent.
    Any other non-200 status waits 10s and any exception waits 30s. The loop
    has no attempt limit.
    """
    backoff_time = 30

    while True:
        try:
            params = {"term": f"{song_name} {artist_name}", "media": "music", "entity": "song", "limit": 1}
            r = session.get("https://itunes.apple.com/search", params=params, timeout=15)

            if r.status_code == 200:
                results = r.json().get("results", [])
                if results:
                    # A hit without artwork is also recorded as searched;
                    # returning None would leave the row NaN and re-fetch it
                    # on every resume.
                    return results[0].get("artworkUrl100") or "NOT_FOUND"
                return "NOT_FOUND"  # Mark as searched but empty

            if r.status_code in (429, 403):
                # Prefer the server's hint; int() on an HTTP-date value used
                # to raise and was misreported as a connection error.
                wait = _retry_after_seconds(r.headers.get("Retry-After"), backoff_time)
                print(f"\n[Rate Limited] Status {r.status_code}. Waiting {wait:g}s...")
                time.sleep(wait)
                # Increase backoff for next time, max 10 minutes
                backoff_time = min(backoff_time * 2, 600)
                continue

            # NOTE(review): a permanent error status is retried forever here.
            print(f"\n[Error] Status {r.status_code}. Retrying in 10s...")
            time.sleep(10)

        except Exception as e:
            print(f"\n[Connection Error] {e}. Retrying in 30s...")
            time.sleep(30)

# --- EXECUTION START ---

def _save_atomically(frame, path):
    """Write ``frame`` to ``path`` through a temp file and ``os.replace``.

    An interrupted save then leaves the previous file in place instead of a
    half-written CSV.
    """
    tmp_path = f"{path}.tmp"
    frame.to_csv(tmp_path, index=False)
    os.replace(tmp_path, path)


# 1. Load Data: Resume if output exists, otherwise start fresh
if os.path.exists(OUTPUT_FILE):
    df = pd.read_csv(OUTPUT_FILE)
    print(f"Resuming: Found {len(df)} total rows in {OUTPUT_FILE}.")
else:
    df = pd.read_csv(INPUT_FILE)
    print(f"Starting fresh: {len(df)} rows loaded.")

# Applies to both branches: a resumed file without the column would otherwise
# raise KeyError when the mask is built.
if "cover_url" not in df.columns:
    df["cover_url"] = None
# A column that is entirely NaN is read back as float64; force object dtype so
# that URL strings can be assigned without a dtype conflict.
df["cover_url"] = df["cover_url"].astype(object)

# 2. Identify remaining work
# We process rows where cover_url is NaN (not "NOT_FOUND")
mask = df["cover_url"].isna()
indices_to_process = df[mask].index
print(f"Remaining rows to process: {len(indices_to_process)}")

# The context manager closes the session's pooled connections on exit.
with requests.Session() as session:
    try:
        for count, idx in enumerate(tqdm(indices_to_process, desc="Fetching covers")):
            row = df.loc[idx]
            raw_name = row.get("name", "")
            # A missing title would otherwise be searched as the text "nan".
            song = "" if pd.isna(raw_name) else str(raw_name)
            artist = parse_artist(row.get("artists", ""))

            # Fetch the cover (this stays here until it gets a valid response)
            cover = fetch_itunes_cover_persistent(session, song, artist)

            # Update DataFrame
            df.at[idx, "cover_url"] = cover

            # Constant delay between successful hits
            time.sleep(SLEEP_BETWEEN_REQUESTS)

            # 3. Save every CHECKPOINT_INTERVAL songs
            if (count + 1) % CHECKPOINT_INTERVAL == 0:
                _save_atomically(df, OUTPUT_FILE)
                print(f" - Checkpoint: Saved {count + 1} new rows to {OUTPUT_FILE}")

    finally:
        # Final save when loop finishes or if you hit Ctrl+C
        _save_atomically(df, OUTPUT_FILE)
        print(f"\nSession finished. Total rows with data: {df['cover_url'].notna().sum()}")

Starting fresh: 700 rows loaded.
Remaining rows to process: 700


Fetching covers:   6%|███████▊                                                                                                                                 | 40/700 [00:54<14:58,  1.36s/it]


[Rate Limited] Status 429. Waiting 17s...


Fetching covers:  10%|██████████████▎                                                                                                                          | 73/700 [01:57<14:13,  1.36s/it]


[Rate Limited] Status 429. Waiting 13s...


Fetching covers:  16%|█████████████████████▏                                                                                                                  | 109/700 [03:00<13:27,  1.37s/it]


[Rate Limited] Status 429. Waiting 11s...


Fetching covers:  26%|███████████████████████████████████▋                                                                                                    | 184/700 [04:54<11:41,  1.36s/it]


[Rate Limited] Status 429. Waiting 17s...


Fetching covers:  31%|██████████████████████████████████████████▌                                                                                             | 219/700 [05:59<11:03,  1.38s/it]


[Rate Limited] Status 429. Waiting 11s...


Fetching covers:  36%|████████████████████████████████████████████████▍                                                                                       | 249/700 [06:52<10:24,  1.38s/it]


[Rate Limited] Status 429. Waiting 19s...


Fetching covers:  40%|██████████████████████████████████████████████████████▊                                                                                 | 282/700 [07:56<09:41,  1.39s/it]


[Rate Limited] Status 429. Waiting 14s...


Fetching covers:  44%|████████████████████████████████████████████████████████████▏                                                                           | 310/700 [08:49<08:48,  1.35s/it]


[Rate Limited] Status 429. Waiting 21s...


Fetching covers:  50%|███████████████████████████████████████████████████████████████████▍                                                                    | 347/700 [10:01<08:04,  1.37s/it]


[Rate Limited] Status 429. Waiting 10s...


Fetching covers:  55%|██████████████████████████████████████████████████████████████████████████▍                                                             | 383/700 [11:00<07:06,  1.35s/it]


[Rate Limited] Status 429. Waiting 10s...


Fetching covers:  60%|█████████████████████████████████████████████████████████████████████████████████▊                                                      | 421/700 [12:03<06:21,  1.37s/it]


[Rate Limited] Status 429. Waiting 7s...


Fetching covers:  66%|█████████████████████████████████████████████████████████████████████████████████████████▌                                              | 461/700 [13:05<05:26,  1.36s/it]


[Rate Limited] Status 429. Waiting 5s...


Fetching covers:  71%|████████████████████████████████████████████████████████████████████████████████████████████████▌                                       | 497/700 [14:00<04:39,  1.38s/it]


[Rate Limited] Status 429. Waiting 10s...


Fetching covers:  71%|█████████████████████████████████████████████████████████████████████████████████████████████████▏                                      | 500/700 [14:14<09:36,  2.88s/it]

 - Checkpoint: Saved 500 new rows to songs_fetched_part_1.csv


Fetching covers:  77%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                               | 537/700 [15:05<03:42,  1.37s/it]


[Rate Limited] Status 429. Waiting 6s...


Fetching covers:  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 578/700 [16:07<02:45,  1.36s/it]


[Rate Limited] Status 429. Waiting 3s...


Fetching covers:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 612/700 [16:57<01:59,  1.35s/it]


[Rate Limited] Status 429. Waiting 14s...


Fetching covers:  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊         | 653/700 [18:07<01:05,  1.39s/it]


[Rate Limited] Status 429. Waiting 3s...


Fetching covers:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████    | 680/700 [18:48<00:27,  1.38s/it]


[Rate Limited] Status 429. Waiting 23s...


Fetching covers: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 700/700 [19:39<00:00,  1.68s/it]


Session finished. Total rows with data: 700



