
## This code collects track data from the Deezer API, filters it by release year (2000–2025), and saves it to a CSV file. It handles resuming from the last processed track and ensures no duplicates by checking existing track IDs.

- **Setup:** constants like the target number of tracks and file paths are defined
- **Load Existing Data:** If a dataset already exists it is loaded; otherwise a new, empty DataFrame is created
- **Track Data Fetching:** The code fetches track details from the Deezer API and filters by year
- **Data Saving:** Every 1000 valid tracks, the data is saved to the CSV file. The ID of the last processed track is saved as well, so a later run can resume from that point

In [None]:
import requests
import pandas as pd
import time
import os


# Locate the directory this code runs from: the script's own folder when run
# as a .py file, or the current working directory when __file__ is undefined
# (e.g. in an interactive session).
try:
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    SCRIPT_DIR = os.getcwd()

# Raw data lives two levels above the script directory; create it if needed.
BASE_DIR = os.path.abspath(os.path.join(SCRIPT_DIR, "../../Data/Raw"))
os.makedirs(BASE_DIR, exist_ok=True)

# --- Crawl configuration ---
START_ID = 1_305_200            # first track ID probed when no checkpoint exists
VALID_TRACKS_TARGET = 600_000   # stop once the dataset holds this many tracks
TRACKS_TO_SAVE_EVERY = 1000     # flush to CSV after this many new valid tracks
SLEEP_TIME = 0.20               # seconds to pause between consecutive requests
YEAR_MIN = 2000
YEAR_MAX = 2025

# --- Files kept inside BASE_DIR ---
LAST_PROCESSED_ID_FILE = os.path.join(BASE_DIR, "last_processed_id.txt")
csv_file = os.path.join(BASE_DIR, "deezer_tracks_super_dataset1.csv")

# --- Load any previously collected dataset ---
# Start from the "nothing collected yet" state and overwrite it only when a
# CSV from an earlier run is found.
existing_df = pd.DataFrame()
existing_ids = set()
collected_so_far = 0

if os.path.exists(csv_file):
    existing_df = pd.read_csv(csv_file, low_memory=False)
    collected_so_far = len(existing_df)
    print(f"Loaded existing dataset with {collected_so_far} tracks.")

    if "id" not in existing_df.columns:
        print("Warning: 'id' column not found. Deduplication will use title/artist/album.")
    else:
        # Known IDs let the crawl skip tracks that are already stored.
        existing_ids = set(existing_df["id"])

#Fetch Track from Deezer 
def fetch_track(track_id, retries=3, quota_wait=5.0):
    """Fetch one track's metadata from the Deezer API.

    Args:
        track_id: Deezer track ID to look up.
        retries: Maximum number of requests to make when the API answers
            with a quota error.
        quota_wait: Seconds to pause before asking again after a quota error.

    Returns:
        A dict with the keys ``id``, ``title``, ``artist``, ``album``,
        ``duration``, ``rank``, ``explicit_lyrics``, ``release_date``,
        ``isrc`` and ``deezer_link``; or ``None`` when the API reports an
        error for the track, the HTTP status is not 200, the request fails,
        or the quota is still exceeded after ``retries`` attempts.
    """
    url = f"https://api.deezer.com/track/{track_id}"
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=5)
            if response.status_code != 200:
                return None

            data = response.json()
            if "error" in data:
                error = data["error"]
                message = error.get("message", "") if isinstance(error, dict) else ""
                # A quota error says nothing about the track itself, so pause
                # and ask again instead of reporting the ID as missing.
                if "Quota limit exceeded" in message and attempt + 1 < retries:
                    time.sleep(quota_wait)
                    continue
                return None

            return {
                "id": data.get("id"),
                "title": data.get("title"),
                "artist": data["artist"]["name"] if "artist" in data else None,
                "album": data["album"]["title"] if "album" in data else None,
                "duration": data.get("duration"),
                "rank": data.get("rank"),
                "explicit_lyrics": data.get("explicit_lyrics"),
                "release_date": data.get("release_date"),
                "isrc": data.get("isrc"),
                "deezer_link": f"https://www.deezer.com/track/{data['id']}"
            }
        except (requests.RequestException, ValueError, KeyError, TypeError, AttributeError) as e:
            # Network failures, undecodable JSON and unexpected payload shapes
            # are reported and treated as "no data" rather than aborting.
            print(f"Error fetching track {track_id}: {e}")
            return None
    return None

#Resume from Last Track
def _merge_new_tracks(current_df, tracks):
    """Return current_df plus the fetched tracks, de-duplicated by track ID."""
    new_df = pd.DataFrame(tracks)
    if current_df.empty:
        return new_df.drop_duplicates(subset=["id"])
    merged = pd.concat([current_df, new_df], ignore_index=True)
    return merged.drop_duplicates(subset=["id"])


# Resume from the checkpointed track ID when one exists; otherwise start at
# START_ID.
track_id = START_ID
if os.path.exists(LAST_PROCESSED_ID_FILE):
    with open(LAST_PROCESSED_ID_FILE, 'r') as file:
        checkpoint_text = file.read().strip()
    try:
        track_id = int(checkpoint_text)
    except ValueError:
        # An empty or corrupted checkpoint should not abort the whole run.
        print(f"Could not read a track ID from {LAST_PROCESSED_ID_FILE}; starting from {START_ID}.")
    else:
        print(f"Resuming from track ID: {track_id}")

valid_tracks = []

# Probe consecutive track IDs until the dataset holds VALID_TRACKS_TARGET rows.
while collected_so_far + len(valid_tracks) < VALID_TRACKS_TARGET:
    # IDs already in the dataset need no request (and therefore no pause).
    if track_id in existing_ids:
        track_id += 1
        continue

    track_data = fetch_track(track_id)
    if track_data:
        release_year = None
        if track_data.get("release_date"):
            try:
                release_year = int(str(track_data["release_date"])[:4])
            except ValueError:
                # Leave the year as None so the track is reported as skipped.
                pass

        if release_year and YEAR_MIN <= release_year <= YEAR_MAX:
            valid_tracks.append(track_data)
        else:
            print(f"Skipped track {track_id} (year {release_year}) — not in {YEAR_MIN}–{YEAR_MAX}")
    else:
        print(f"Invalid or missing track ID: {track_id}")

    # Flush a full batch to disk.
    if len(valid_tracks) >= TRACKS_TO_SAVE_EVERY:
        combined_df = _merge_new_tracks(existing_df, valid_tracks)
        combined_df.to_csv(csv_file, index=False)
        print(f"Saved {len(combined_df)} total unique track.")
        existing_df = combined_df
        existing_ids = set(existing_df["id"])
        # The flushed batch now counts as collected. Without this update the
        # running total drops back each time valid_tracks is emptied and the
        # loop condition can never reach the target.
        collected_so_far = len(existing_df)
        valid_tracks = []

        # Save current ID
        with open(LAST_PROCESSED_ID_FILE, 'w') as file:
            file.write(str(track_id))

    track_id += 1
    time.sleep(SLEEP_TIME)

# Save whatever is left over from the last, partial batch.
if valid_tracks:
    combined_df = _merge_new_tracks(existing_df, valid_tracks)
    combined_df.to_csv(csv_file, index=False)
    print(f"Final save: {len(combined_df)} total unique tracks.")

    # track_id was already advanced past the last probed ID, so step back one
    # to record the ID that was actually processed.
    with open(LAST_PROCESSED_ID_FILE, 'w') as file:
        file.write(str(track_id - 1))
else:
    print("No new tracks collected")


## This code cleans and saves Deezer track data while handling missing data and API limits. 
- **Data Cleaning**: Handles missing metadata by re-fetching and skips invalid tracks
- **Concurrency**: Uses parallel fetching for faster processing.
- **Rate Limiting**: Implements exponential backoff on quota hits


In [None]:
import os
import time
import requests
import pandas as pd
import concurrent.futures
from tqdm import tqdm

# Constants and directories
# --- Directories, all resolved relative to the current working directory ---
SCRAPER_DIR = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(SCRAPER_DIR, "../../Data/Raw"))
PROCESSED_DIR = os.path.abspath(os.path.join(SCRAPER_DIR, "../../Data/Processed"))

# --- Crawl configuration ---
START_ID = 1_305_200
VALID_TRACKS_TARGET = 600_000
TRACKS_TO_SAVE_EVERY = 1000   # checkpoint interval, counted in cleaned tracks
SLEEP_TIME = 0.15
YEAR_MIN = 2000
YEAR_MAX = 2025

# --- Files kept inside BASE_DIR ---
CSV_FILE = os.path.join(BASE_DIR, "deezer_tracks_super_dataset1.csv")
LAST_PROCESSED_ID_FILE = os.path.join(BASE_DIR, "last_processed_id.txt")
CLEANING_DONE_FILE = os.path.join(BASE_DIR, "cleaning_done.txt")

# The marker file records that a cleaning pass has already completed.
SKIP_CLEANING = os.path.exists(CLEANING_DONE_FILE)

# --- Load the existing dataset ---
# Start from the empty state and replace it only when the CSV is present.
existing_df = pd.DataFrame()
existing_ids = set()
collected_so_far = 0

if os.path.exists(CSV_FILE):
    existing_df = pd.read_csv(CSV_FILE, low_memory=False)
    collected_so_far = len(existing_df)
    print(f"Found existing dataset with {collected_so_far} tracks.")

    if "id" not in existing_df.columns:
        print("Warning: 'id' column not found. Deduplication will fallback.")
    else:
        existing_ids = set(existing_df["id"])

def extract_year(date_str, year_min=None, year_max=None):
    """Return the year at the start of a date value if it lies within bounds.

    Args:
        date_str: Date-like value whose first four characters are the year
            (e.g. ``"2014-06-30"``). It is converted with ``str`` first, so
            non-string values are accepted.
        year_min: Lowest accepted year (inclusive). Defaults to ``YEAR_MIN``.
        year_max: Highest accepted year (inclusive). Defaults to ``YEAR_MAX``.

    Returns:
        The year as an ``int``, or ``None`` when the leading characters are
        not a number or the year falls outside the accepted range.
    """
    # Resolve the defaults at call time so the module constants stay the
    # single source of truth for callers that pass no bounds.
    if year_min is None:
        year_min = YEAR_MIN
    if year_max is None:
        year_max = YEAR_MAX

    try:
        year = int(str(date_str)[:4])
    except ValueError:
        # Only a failed number parse means "no year"; anything else (such as
        # KeyboardInterrupt) must propagate.
        return None

    return year if year_min <= year <= year_max else None

#  Fetch track with retry for quota-hit
def fetch_track(track_id, retries=3):
    """Fetch one track's metadata from the Deezer API, retrying on failures.

    Quota errors and request failures are retried up to ``retries`` times.
    After a quota error the function waits for the number of seconds given in
    the ``Retry-After`` header when it holds one, otherwise for
    ``60 * 2 ** attempt`` seconds. No wait is performed after the final
    attempt because nothing would follow it.

    Args:
        track_id: Deezer track ID. Missing values (``NaN``/``None``) are
            rejected; whole-number floats are converted to ``int``.
        retries: Maximum number of requests to make for this track.

    Returns:
        * a dict with the keys ``id``, ``title``, ``artist``, ``album``,
          ``duration``, ``rank``, ``explicit_lyrics``, ``release_date``,
          ``isrc`` and ``deezer_link`` on success;
        * the string ``"quota_hit"`` when the API answers HTTP 403 or still
          reports a quota error on the last attempt;
        * ``None`` for a missing ID, any other API error, any other HTTP
          status, or when every attempt raised an exception.
    """
    if pd.isna(track_id):
        print(f"Skipping invalid track ID: {track_id}")
        return None

    # IDs read from a DataFrame can arrive as floats (e.g. 1305200.0); format
    # them as integers so the URL does not end in ".0".
    if isinstance(track_id, float) and track_id.is_integer():
        track_id = int(track_id)

    url = f"https://api.deezer.com/track/{track_id}"
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=5)

            if response.status_code == 403:
                return "quota_hit"

            if response.status_code != 200:
                print(f"Error fetching track {track_id}: HTTP {response.status_code}")
                return None

            data = response.json()
            if "error" in data:
                if "Quota limit exceeded" in data["error"].get("message", ""):
                    if attempt == retries - 1:
                        # Out of attempts: let the caller schedule a later retry.
                        return "quota_hit"

                    retry_after = response.headers.get("Retry-After")
                    try:
                        # Use the retry after value in seconds
                        wait_time = max(0, int(retry_after))
                        print(f"Quota hit, waiting {wait_time} seconds to retry")
                    except (TypeError, ValueError):
                        # Header absent or not a plain number of seconds:
                        # fall back to exponential backoff.
                        wait_time = 60 * (2 ** attempt)
                        print(f"Quota hit, waiting {wait_time} seconds to retry (attempt {attempt+1})")
                    time.sleep(wait_time)
                    continue

                print(f"Track {track_id} skipped due to API error.")
                return None

            return {
                "id": data.get("id"),
                "title": data.get("title"),
                "artist": data["artist"]["name"] if "artist" in data else "Unknown",
                "album": data["album"]["title"] if "album" in data else "Unknown",
                "duration": data.get("duration", 0),
                "rank": data.get("rank", 0),
                "explicit_lyrics": data.get("explicit_lyrics", False),
                "release_date": data.get("release_date", "Unknown"),
                "isrc": data.get("isrc", "Unknown"),
                "deezer_link": f"https://www.deezer.com/track/{data['id']}"
            }

        except (requests.RequestException, ValueError, KeyError, TypeError, AttributeError) as e:
            # Network failures, undecodable JSON and unexpected payload shapes
            # are reported and retried after a short, growing pause.
            print(f"Error fetching track {track_id}: {e}")
            time.sleep(2 ** attempt)

    return None

#  Cleaning logic with retry 
def clean_existing_rows(df):
    """Re-fetch rows with missing metadata and return the repaired dataset.

    Rows lacking ``title``, ``release_date`` or ``isrc`` are looked up again
    through ``fetch_track`` on a pool of 10 worker threads. Only the missing
    cells of a row are filled from the response. Rows whose lookup returns
    nothing are left out of the result and listed in
    ``skipped_tracks_log.txt``. Rows that hit the API quota are retried once
    after a 60-minute pause; if they hit the quota again they are kept
    unchanged. Finally, rows still missing ``id``, ``title`` or
    ``release_date`` are dropped.

    Every ``TRACKS_TO_SAVE_EVERY`` cleaned rows a checkpoint is written to
    ``CSV_FILE``. A checkpoint holds the complete rows, all rows cleaned so
    far and the rows not yet resolved, so an interrupted run loses nothing.

    Args:
        df: Dataset to clean; it must contain the columns ``id``, ``title``,
            ``release_date`` and ``isrc``.

    Returns:
        A new DataFrame with a fresh integer index; ``df`` is not modified.
    """
    required_fields = ['title', 'release_date', 'isrc']
    missing_rows = df[df[required_fields].isnull().any(axis=1)]
    complete_rows = df[~df.index.isin(missing_rows.index)]
    print(f"Found {len(missing_rows)} tracks with missing values to clean.")
    for _, row in missing_rows.iterrows():
        missing_fields = [field for field in required_fields if pd.isna(row[field])]
        print(f"Track ID {row['id']} is missing: {', '.join(missing_fields)}")

    cleaned_rows = []        # every repaired row; never emptied between checkpoints
    resolved_index = set()   # index labels of rows that were cleaned or skipped
    quota_retry_list = []    # rows whose lookup hit the quota in the current pass
    skipped_tracks_log = []
    cleaned_count = 0
    skipped_count = 0

    def build_dataset(extra_rows):
        """Combine complete rows, cleaned rows and extra_rows into one frame."""
        frames = [complete_rows]
        if cleaned_rows:
            frames.append(pd.DataFrame(cleaned_rows))
        if len(extra_rows) > 0:
            frames.append(extra_rows)
        return pd.concat(frames, ignore_index=True)

    def process_rows(rows_to_clean):
        """Fetch every row of rows_to_clean concurrently and record the outcome."""
        nonlocal cleaned_count, skipped_count

        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            future_to_row = {executor.submit(fetch_track, row['id']): row for _, row in rows_to_clean.iterrows()}
            for future in tqdm(concurrent.futures.as_completed(future_to_row), total=len(future_to_row), desc="Cleaning tracks", unit="track"):
                row = future_to_row[future]
                result = future.result()

                if result == "quota_hit":
                    quota_retry_list.append(row)
                    continue

                # row.name is the row's index label in the original frame.
                resolved_index.add(row.name)

                if not result:
                    skipped_count += 1
                    skipped_tracks_log.append(f"Track {row['id']} skipped")
                    continue

                # Fill only the cells that are missing; existing values win.
                updated_row = row.copy()
                for key, value in result.items():
                    if pd.isna(updated_row.get(key)) and value is not None:
                        updated_row[key] = value
                cleaned_rows.append(updated_row)
                cleaned_count += 1

                if cleaned_count % TRACKS_TO_SAVE_EVERY == 0:
                    # Keep unresolved rows in the checkpoint so the file on
                    # disk never holds less data than the input.
                    pending_rows = missing_rows[~missing_rows.index.isin(resolved_index)]
                    checkpoint_df = build_dataset(pending_rows)
                    checkpoint_df.drop_duplicates(subset=["id"], inplace=True)
                    checkpoint_df.to_csv(CSV_FILE, index=False)
                    print(f"Saved cleaned dataset with {len(checkpoint_df)} total unique tracks (saved {cleaned_count} tracks so far)")

    process_rows(missing_rows)

    # Retry if quota was hit
    if quota_retry_list:
        print(f"Quota hit, waiting 60 minutes to retry {len(quota_retry_list)} tracks")
        time.sleep(3600)
        quota_df = pd.DataFrame(quota_retry_list)
        quota_retry_list.clear()
        process_rows(quota_df)

    # Rows that hit the quota a second time were never verified, so they are
    # kept as they were rather than discarded.
    unverified_rows = pd.DataFrame(quota_retry_list)
    if len(unverified_rows) > 0:
        print(f"Kept {len(unverified_rows)} tracks unchanged after repeated quota hits")
    combined_df = build_dataset(unverified_rows)

    required_columns = ['id', 'title', 'release_date']
    before_drop = len(combined_df)
    combined_df.dropna(subset=required_columns, inplace=True)
    dropped_count = before_drop - len(combined_df)

    print(f"Cleaning summary:")
    print(f"Cleaned and updated: {cleaned_count}")
    print(f"Skipped (not found or invalid): {skipped_count}")
    print(f"Dropped rows with missing critical data: {dropped_count}")

    if skipped_tracks_log:
        with open("skipped_tracks_log.txt", "w") as log_file:
            for log in skipped_tracks_log:
                log_file.write(log + "\n")
        print("Skipped track details saved to 'skipped_tracks_log.txt'")

    return combined_df

# Run the cleaning pass once: only when data was loaded and no marker file
# from an earlier completed pass exists.
if collected_so_far <= 0:
    print("No existing data to clean.")
elif os.path.exists(CLEANING_DONE_FILE):
    print("Skipping cleaning simce already done.")
else:
    print("Cleaning tracks")
    cleaned_df = clean_existing_rows(existing_df)

    if cleaned_df.empty:
        print("No cleaned data to save.")
    else:
        # Adopt the cleaned frame as the working dataset and persist it.
        existing_df = cleaned_df
        existing_df.drop_duplicates(subset=["id"], inplace=True)
        existing_df.to_csv(CSV_FILE, index=False)
        print(f"Saved cleaned dataset with {len(existing_df)} total unique tracks")

        # Leave a marker so later runs skip the cleaning pass.
        with open(CLEANING_DONE_FILE, "w") as done_file:
            done_file.write("done")


In [None]:
# Optional: Reset cleaning 
# Delete the marker file, if there is one, and report the deletion.
marker_present = os.path.exists(CLEANING_DONE_FILE)
if marker_present:
    os.remove(CLEANING_DONE_FILE)
    print("cleaning_done.txt deleted.")


## Final polish for the cleaned dataset. The original deezer_tracks_super_dataset1 file was kept; a copy of it was made and renamed to deezer_tracks_2000_2025.csv


In [None]:
import pandas as pd
import os

# Constants for directory
# Processed data directory, resolved relative to the current working directory.
PROCESSED_DIR = os.path.abspath(os.path.join(os.getcwd(), "../../Data/Processed"))

# Input and output are the same file, so the polished data overwrites its input.
csv_file = os.path.join(PROCESSED_DIR, "deezer_tracks_2000_2025.csv")
final_csv = os.path.join(PROCESSED_DIR, "deezer_tracks_2000_2025.csv")

df = pd.read_csv(csv_file, low_memory=False)

# Discard the 'source' column when the file has one.
df = df.drop(columns=["source"], errors="ignore")

# Nullable integer dtype keeps IDs whole even when some are missing.
df["id"] = df["id"].astype("Int64")


def _track_link(track_id):
    """Return the Deezer URL for track_id, or an empty string if it is missing."""
    if pd.notna(track_id):
        return f"https://www.deezer.com/track/{track_id}"
    return ""


# Rebuild every link from its ID.
df["deezer_link"] = df["id"].apply(_track_link)

# Preferred column order; columns absent from the file are left out.
desired_order = [
    "title", "artist", "album", "duration", "deezer_link", "rank",
    "explicit_lyrics", "release_date", "id", "isrc"
]
final_columns = [col for col in desired_order if col in df.columns]
df = df.loc[:, final_columns]

# Save to file
df.to_csv(final_csv, index=False)
print(f"Polished dataset saved as {final_csv} with {len(df)} tracks.")


## Filtering by year again to be sure

In [None]:
import pandas as pd
import os

# Constants for directory
# Processed data directory, resolved relative to the current working directory.
PROCESSED_DIR = os.path.abspath(os.path.join(os.getcwd(), "../../Data/Processed"))

# Input and output are the same file: the filtered data replaces it in place.
csv_file = os.path.join(PROCESSED_DIR, "deezer_tracks_2000_2025.csv")
final_csv = os.path.join(PROCESSED_DIR, "deezer_tracks_2000_2025.csv")

df = pd.read_csv(csv_file, low_memory=False)

# Unparseable dates become NaT, which yields a missing year below.
df["release_date"] = pd.to_datetime(df["release_date"], errors="coerce")
df["release_year"] = df["release_date"].dt.year

# Keep years 2000 through 2025, both ends included; missing years fail the
# test and are filtered out.
in_range = df["release_year"].between(2000, 2025)
df_filtered = df.loc[in_range].drop(columns=["release_year"])

# Save filtered dataset
df_filtered.to_csv(final_csv, index=False)

print(f"dataset saved as {final_csv} with {len(df_filtered)} tracks from 2000 to 2025.")
