In [8]:
import pandas as pd

In [9]:
# Load the track dataset from a CSV file; the relative path is resolved against
# the kernel's current working directory.
df = pd.read_csv("music_dataset.csv")

In [10]:
# Metadata: print the frame's index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode              114000 non-null  int64  
 13  speechiness       114000 non-null  float64
 14  acousticness      114000 non-null  float64
 15  instrumentalness  114000 non-null  float64
 16  liveness          11

In [11]:
# Preview the first five rows. Leaving the frame as the cell's last expression
# lets the notebook render it with its rich table display, rather than the
# plain text that print() produces.
df.head()

   Unnamed: 0                track_id                 artists  \
0           0  5SuOikwiRyPMVoIQDJUgSV             Gen Hoshino   
1           1  4qPNDBW1i3p13qLCt0Ki3A            Ben Woodward   
2           2  1iJBSr7s7jYXzM8EGcbK5b  Ingrid Michaelson;ZAYN   
3           3  6lfxq3CG4xtTiEg7opyCyx            Kina Grannis   
4           4  5vjLSffimiIP26QG5WcN2K        Chord Overstreet   

                                          album_name  \
0                                             Comedy   
1                                   Ghost (Acoustic)   
2                                     To Begin Again   
3  Crazy Rich Asians (Original Motion Picture Sou...   
4                                            Hold On   

                   track_name  popularity  duration_ms  explicit  \
0                      Comedy          73       230666     False   
1            Ghost - Acoustic          55       149610     False   
2              To Begin Again          57       210826     False   


In [5]:
# Keep only the rows whose popularity is not 0 (rebinds `df` to the filtered frame).
df = df.loc[df['popularity'].ne(0)]

In [None]:
import requests
from bs4 import BeautifulSoup
import time

# Scrape each track's release date from its public Spotify page and append the
# result to a text file, one "<track_id> - <result>" line per track. The cell is
# resumable: IDs already recorded in the file are skipped, and each run handles
# at most BATCH_SIZE new IDs.

BATCH_SIZE = 100         # maximum number of new track IDs handled per run
REQUEST_TIMEOUT_S = 10   # timeout (seconds) passed to requests.get
REQUEST_DELAY_S = 0.25   # pause between requests; be respectful to Spotify's servers
REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0'}

# Load filtered track IDs (assumes `df` is already defined and loaded)
df_filtered = df.iloc[0:]  # full-range slice: currently selects every row
all_ids = df_filtered['track_id'].dropna().unique()

# Output file path
output_path = 'spotify_release_dates_merged.txt'
processed_ids = set()

# Load already processed IDs
try:
    with open(output_path, 'r') as f:
        for line in f:
            tid, _sep, result = line.strip().partition(' - ')
            # "ERROR" lines (written by earlier versions of this cell) record a
            # failed attempt, not a result, so those tracks stay eligible for retry.
            if tid and result != 'ERROR':
                processed_ids.add(tid)
except FileNotFoundError:
    pass  # If file doesn't exist, start fresh

# Filter unprocessed track IDs and limit the batch
to_process = [tid for tid in all_ids if tid not in processed_ids][:BATCH_SIZE]
print(f"Found {len(to_process)} new track IDs to process...")

# Scrape release dates and append to file
with open(output_path, 'a') as f:
    for i, track_id in enumerate(to_process):
        try:
            url = f'https://open.spotify.com/track/{track_id}'
            res = requests.get(url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT_S)

            # A 404 is a definitive "no such track" and falls through to the
            # NOT FOUND branch below. Any other 4xx/5xx (rate limiting, server
            # errors) raises here, so a transient failure is never recorded as a
            # final result and the track is retried on the next run.
            if res.status_code != 404:
                res.raise_for_status()

            soup = BeautifulSoup(res.text, 'html.parser')
            meta = soup.find('meta', {'name': 'music:release_date'})

            if meta and meta.get('content'):
                release_date = meta['content']
                f.write(f"{track_id} - {release_date}\n")
                print(f"{i+1}/{len(to_process)}. {track_id} → {release_date}")
            else:
                f.write(f"{track_id} - NOT FOUND\n")
                print(f"{i+1}/{len(to_process)}. {track_id} → No release date found")

            # Push the line to disk right away so progress survives a kernel crash.
            f.flush()
        except Exception as e:
            # Best effort: report the failure and move on. Nothing is written to
            # the file, which keeps this track unprocessed for a later retry.
            print(f"{i+1}/{len(to_process)}. {track_id} → GENERAL ERROR: {e}")

        # Throttle between requests; no need to wait after the last one.
        if i < len(to_process) - 1:
            time.sleep(REQUEST_DELAY_S)

print(f"\nDone with this batch! Results saved to: {output_path}")

Found 100 new track IDs to process...
1/100. 5SuOikwiRyPMVoIQDJUgSV → 2022-04-08
2/100. 4qPNDBW1i3p13qLCt0Ki3A → 2021-04-30
3/100. 1iJBSr7s7jYXzM8EGcbK5b → 2021-03-17
4/100. 6lfxq3CG4xtTiEg7opyCyx → 2018-08-10
5/100. 5vjLSffimiIP26QG5WcN2K → 2017-02-03
6/100. 01MVOl9KtVTNfFiBU9I7dc → 2018-04-20
7/100. 6Vc5wAMmXdKIAM7WUoEb7N → 2014-01-20
8/100. 1EzrEOXmMH3G43AXT1y7pA → 2008-05-12
9/100. 0IktbUcnAGrvD03AWnz3Q8 → 2008-05-12
10/100. 7k9GuJYLp2AzqokyEdwEw2 → 2015-04-21
11/100. 4mzP5mHkRvGxdhdGdAH7EJ → 2021-10-15
12/100. 5ivF4eQBqJiVL5IAE9jRyl → 2012-04-13
13/100. 4ptDJbJl35d7gQfeNteBwp → 2018-06-15
14/100. 0X9MxHR1rTkEHDjp95F2OO → 2018-12-14
15/100. 4LbWtBkN82ZRhz9jqzgrb3 → 2017-05-19
16/100. 1KHdq8NK9QxnGjdXb55NiG → 2009-09-24
17/100. 6xKeQgzfjixSUld14qUezm → 2020-07-09
18/100. 4Yo0igmcoNyat1secaH0OD → 2020-12-31
19/100. 2qLMf6TuEC3ruGJg4SMMN6 → 2008-05-01
20/100. 6CgNoAbFJ4Q4Id4EjtbXlC → 2017-12-19
21/100. 3S0OXQeoh0w6AY8WQVckRW → 2008-05-01
22/100. 210JCw2LbYD4YIs8GiZ9iP → 2015-04-28
23/

In [None]:
def count_release_lines(path):
    """Tally the result lines in a scraper output file.

    Each non-blank line is expected to look like "<track_id> - <result>", where
    <result> is a release date, "NOT FOUND", or "ERROR".

    Args:
        path: Location of the output file.

    Returns:
        A ``(dates, not_found, errors)`` tuple of line counts. Lines that have
        no result part are counted as errors. A missing file yields ``(0, 0, 0)``.
    """
    dates = not_found = errors = 0
    try:
        with open(path, 'r') as f:
            for line in f:
                record = line.strip()
                if not record:
                    continue  # blank lines carry no result
                _tid, _sep, result = record.partition(' - ')
                if result == 'NOT FOUND':
                    not_found += 1
                elif result in ('', 'ERROR'):
                    errors += 1
                else:
                    dates += 1
    except FileNotFoundError:
        pass  # nothing has been scraped yet
    return dates, not_found, errors


# Path to your output file
output_path = 'spotify_release_dates_merged.txt'

# Count the recorded lines by outcome, so that failed lookups are not reported
# as collected release dates.
num_dates, num_not_found, num_errors = count_release_lines(output_path)
# Total number of non-blank lines in the file, kept under its original name.
num_lines = num_dates + num_not_found + num_errors

print(f"Progress: {num_dates} release dates collected so far "
      f"({num_not_found} not found, {num_errors} errors, {num_lines} lines in total).")

Progress: 100 release dates collected so far.
