In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
from datetime import datetime
import logging
import json
from config import SPOTIFY_CONFIG

In [2]:
def extract_show_id(url):
    """Extract the show ID from a Spotify show URL or URI.

    Accepted inputs:
      * a web URL containing ``show/<id>``, optionally followed by a query
        string (``?si=...``), a fragment (``#...``) or a trailing slash;
      * a Spotify URI of the form ``spotify:show:<id>``;
      * a bare ID, which is returned unchanged.

    Args:
        url: The URL, URI, or bare ID as a string.

    Returns:
        The show ID with surrounding whitespace, query string, fragment and
        trailing slashes removed.
    """
    text = url.strip()

    uri_prefix = 'spotify:show:'
    if text.startswith(uri_prefix):
        # URI form has no "show/" path segment, so peel the prefix instead.
        text = text[len(uri_prefix):]
    else:
        # Everything after the last "show/"; a bare ID passes through as-is.
        text = text.split('show/')[-1]

    # Drop the query string first, then any fragment that may follow the ID.
    text = text.split('?')[0].split('#')[0]

    # A URL copied with a trailing slash would otherwise yield "<id>/".
    return text.rstrip('/')

In [4]:
# Show IDs checked when the caller does not supply its own list.
DEFAULT_SHOW_IDS = (
    "6E1u3kxII5CbbFR4VObax4",
    "1VXcH8QHkjRcTCEd88U3ti",
    "4fsW5D9rKYycsP2hgKtvCk",
    "5RdShpOtxKO3ZWohR2M6Sv",
    "0ofXAdFIQQRsCYj9754UFx",
    "4rOoJ6Egrf8K2IrywzwOMk",
    "3gaGfrqgnVqUBNDdtv5p3S",
    "7wkYuqWC8z51nfetiZCTbT",
    "2HGcJRYrjGnpce6bRp8UXm",
    "5VzFvh1JlEhBMS6ZHZ8CNO",
)


def verify_show_ids(show_ids=None, sp=None):
    """Look up each show ID on Spotify and report which ones resolve.

    For every ID the function calls ``sp.show(show_id)``. On success it prints
    the show's name, publisher and total episode count and records the ID as
    valid; on any exception it prints the ID together with the error message
    and moves on to the next ID.

    Args:
        show_ids: Iterable of show ID strings to check. When ``None`` the
            built-in ``DEFAULT_SHOW_IDS`` are used. An explicitly empty
            iterable is honoured and checks nothing.
        sp: Client object exposing a ``show(show_id)`` method. When ``None`` a
            ``spotipy.Spotify`` client is created with client-credentials
            auth from ``SPOTIFY_CONFIG['client_id']`` and
            ``SPOTIFY_CONFIG['client_secret']``.

    Returns:
        list: The IDs for which the lookup succeeded, in input order.
    """
    if sp is None:
        # Initialize Spotify client only when the caller did not inject one.
        auth_manager = SpotifyClientCredentials(
            client_id=SPOTIFY_CONFIG['client_id'],
            client_secret=SPOTIFY_CONFIG['client_secret']
        )
        sp = spotipy.Spotify(auth_manager=auth_manager)

    if show_ids is None:
        show_ids = DEFAULT_SHOW_IDS

    valid_shows = []
    print("\nVerifying Show IDs:")
    print("-" * 50)

    for show_id in show_ids:
        try:
            show = sp.show(show_id)
            print(f"✅ Valid: {show['name']}")
            print(f"   ID: {show_id}")
            print(f"   Publisher: {show['publisher']}")
            print(f"   Episodes: {show['total_episodes']}")
            print("-" * 50)
            valid_shows.append(show_id)
        except Exception as e:
            # Deliberately best-effort: one failing lookup must not abort the
            # remaining checks. Any failure is reported as "Invalid", so read
            # the printed error to tell a bad ID from another kind of error.
            print(f"❌ Invalid: {show_id}")
            print(f"   Error: {str(e)}")
            print("-" * 50)

    print(f"\nTotal valid shows: {len(valid_shows)}")
    print("\nValid Show IDs to use in pipeline:")
    print(valid_shows)
    return valid_shows

# Run the check. As the last expression in the cell, the returned list of
# valid show IDs is also shown as the cell's output.
verify_show_ids()


Verifying Show IDs:
--------------------------------------------------
✅ Valid: The LOL Podcast
   ID: 6E1u3kxII5CbbFR4VObax4
   Publisher: Cash, Maverick, Kate, Harper, Kenzie
   Episodes: 147
--------------------------------------------------
✅ Valid: TED Talks Daily
   ID: 1VXcH8QHkjRcTCEd88U3ti
   Publisher: TED
   Episodes: 2261
--------------------------------------------------
✅ Valid: Murder: True Crime Stories
   ID: 4fsW5D9rKYycsP2hgKtvCk
   Publisher: Crime House
   Episodes: 41
--------------------------------------------------
✅ Valid: Conspiracy Theories
   ID: 5RdShpOtxKO3ZWohR2M6Sv
   Publisher: Spotify Studios
   Episodes: 500
--------------------------------------------------
✅ Valid: Stuff You Should Know
   ID: 0ofXAdFIQQRsCYj9754UFx
   Publisher: iHeartPodcasts
   Episodes: 2472
--------------------------------------------------
✅ Valid: The Joe Rogan Experience
   ID: 4rOoJ6Egrf8K2IrywzwOMk
   Publisher: Joe Rogan
   Episodes: 2428
-------------------------------

['6E1u3kxII5CbbFR4VObax4',
 '1VXcH8QHkjRcTCEd88U3ti',
 '4fsW5D9rKYycsP2hgKtvCk',
 '5RdShpOtxKO3ZWohR2M6Sv',
 '0ofXAdFIQQRsCYj9754UFx',
 '4rOoJ6Egrf8K2IrywzwOMk',
 '3gaGfrqgnVqUBNDdtv5p3S',
 '7wkYuqWC8z51nfetiZCTbT',
 '2HGcJRYrjGnpce6bRp8UXm',
 '5VzFvh1JlEhBMS6ZHZ8CNO']

In [13]:
import data  # NOTE(review): not referenced in the visible cells — confirm it is needed
import pandas as pd

# Load the episode-level export.
df = pd.read_csv('data/podcast_data_20250110.csv')

# The export can carry the writer's row index as a nameless first column,
# which pandas surfaces as "Unnamed: 0". Drop it when present so it is not
# carried into later steps; errors='ignore' keeps this a no-op for files
# that do not have the column.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Preview the first ten episodes.
df.head(10)


Unnamed: 0,episode_id,name,duration_ms,release_date,description,explicit,language
0,6HHnn1Qu1fjnCAk3HIU4tX,Calling Out The Sturniolo Triplets!,3405077,2025-01-08,We called out the Sturniolo triplets!,False,en
1,0kxqrgg3mzE5Y80rFaaTgs,Kate Got A Tattoo!,3382848,2025-01-04,Kate got a secret tatto,False,en
2,5ldiur073e35gGpRLBZHzP,Harper Dyed Her Hair!,3192960,2025-01-01,Harper dyed her hair brown!,False,en
3,2dbxwiT6nzJX81xJRs2b7n,Kate’s First Heartbreak!,3614208,2024-12-28,We talked about our first heartbreaks!,False,en
4,1G4fqOfyUTbIdF2MepUEnS,Revealing Our Baby Names!,4705173,2024-12-25,"Everyone reveals their baby name list, then Ca...",False,en
5,3cWoEO4VcJK3erNEUT1Iop,Cash And Kenzie Kissed!,4387903,2024-12-21,We swapped wife's for a day!,False,en
6,2ZzyjmaH3W78u8GXfiGs2f,Cash Raged And Broke Harper’s Phone!,3671103,2024-12-18,We pranked Harper on the pod!,False,en
7,5e5Nsg2PWGI4A3R9xTT2wS,Giving Kate Sabrina Carpenter Tickets!,3641471,2024-12-14,We gave Kate fake Sabrina Carpenter tickets!,False,en
8,1hT1Ssfqw2LlIxShVaP4Pq,Harper Got Sick On Stage!,4154901,2024-12-11,We talk about our first ever live show!,False,en
9,4dImz7YY4Al4FHCBGzKmwg,My Most Embarrassing Moment!,3622165,2024-12-07,We talk about Harper’s most embarrassing momen...,False,en


In [9]:
# Quick structural overview of the loaded frame: (rows, columns) first.
print(df.shape)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None".
df.info()

# Summary statistics for the numeric columns.
print(df.describe())


(8911, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8911 entries, 0 to 8910
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   episode_id    8911 non-null   object
 1   name          8911 non-null   object
 2   duration_ms   8911 non-null   int64 
 3   release_date  8911 non-null   object
 4   description   8911 non-null   object
 5   explicit      8911 non-null   bool  
 6   language      8911 non-null   object
dtypes: bool(1), int64(1), object(5)
memory usage: 426.5+ KB
None
        duration_ms
count  8.911000e+03
mean   3.955916e+06
std    3.652868e+06
min    4.708200e+04
25%    9.819815e+05
50%    2.589953e+06
75%    6.286566e+06
max    1.989385e+07


In [11]:
# Convert the release_date column from strings to pandas datetimes.
# NOTE(review): assumes every value is a full date that pd.to_datetime can
# parse with one inferred format — confirm against the raw export.
df['release_date'] = pd.to_datetime(df['release_date'])

# Confirm the new dtype. info() prints its own report and returns None, so it
# is not wrapped in print() (that only added a trailing "None").
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8911 entries, 0 to 8910
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   episode_id    8911 non-null   object        
 1   name          8911 non-null   object        
 2   duration_ms   8911 non-null   int64         
 3   release_date  8911 non-null   datetime64[ns]
 4   description   8911 non-null   object        
 5   explicit      8911 non-null   bool          
 6   language      8911 non-null   object        
dtypes: bool(1), datetime64[ns](1), int64(1), object(4)
memory usage: 426.5+ KB
None


In [14]:
# Persist the frame to the cleaned CSV; index=False leaves the row index out
# of the written file.
df.to_csv(
    path_or_buf='data/podcast_data_cleaned.csv',
    index=False,
)