In [None]:
import pandas as pd
import numpy as np
import feedparser
from datetime import datetime
from dateutil import parser
import time

In [None]:
# Load the raw episode metadata; the source file is tab-separated.
metadata = pd.read_csv(
    "metadata.tsv",
    sep="\t",
)
metadata.head()

In [None]:
# Add the columns that will be filled in from each show's RSS feed
metadata['category'] = None
metadata['pubdate'] = None

# Retry policy for downloading a feed
max_attempts = 5
retry_delay = 20  # seconds between attempts

# Per-feed lookup cache keyed by rss_link. All episodes of a show share one
# feed, so each feed is downloaded once instead of once per episode. Only the
# fields needed below are kept (not the whole parsed feed) to bound memory.
# Value: (category, {entry title: published string}), or None when the feed
# could not be fetched after all attempts.
feed_cache = {}

# Loop through each row in the metadata DataFrame
for index, row in metadata.iterrows():
    # Extract the episode_name and rss_link from the current row
    episode_name = row['episode_name']
    rss_link = row['rss_link']

    if rss_link not in feed_cache:
        # Recorded up front so a feed that fails is not retried for every episode
        feed_cache[rss_link] = None

        # Try to fetch the RSS feed from rss_link, up to max_attempts times in total
        for attempt in range(max_attempts):
            try:
                feed = feedparser.parse(rss_link)
                # feedparser reports most download failures through
                # `bozo_exception` instead of raising. Re-raise connection-level
                # errors (OSError covers URLError and socket errors) so the
                # retry logic actually applies to them.
                fetch_error = feed.get('bozo_exception')
                if isinstance(fetch_error, OSError) and not feed.entries:
                    raise fetch_error
                break
            except Exception as e:
                print(f"Error fetching RSS feed for {episode_name}: {e}")
                # No point in waiting after the final attempt
                if attempt < max_attempts - 1:
                    print(f"Retrying in {retry_delay} seconds... (Attempt {attempt+1}/{max_attempts})")
                    time.sleep(retry_delay)

        # If the maximum number of retries is reached, skip to the next row
        else:
            print(f"Max number of retries reached for {episode_name}. Skipping...")
            continue

        # Extract the category from the RSS feed's <channel> element.
        # `tags` may be missing or empty, in which case there is no category.
        tags = feed.feed.get('tags') or []
        category = tags[0].get('term') if tags else None

        # Map entry title -> published string. Entries without a title are
        # ignored; when titles repeat, the last entry in the feed wins.
        published_by_title = {}
        for item in feed.entries:
            title = item.get('title')
            if title is not None:
                published_by_title[title] = item.get('published')

        feed_cache[rss_link] = (category, published_by_title)

    cached = feed_cache[rss_link]
    if cached is None:
        # Feed could not be fetched; leave this row's new columns empty
        continue
    category, published_by_title = cached

    # Only rows whose episode_name exactly matches an entry title are updated
    if episode_name not in published_by_title:
        continue

    # Convert the entry's publication timestamp to a date (YYYY-MM-DD)
    published = published_by_title[episode_name]
    pubdate = None
    if published:
        try:
            pubdate = parser.parse(published).date()
        except (ValueError, OverflowError) as e:
            # A malformed date should not abort the whole run
            print(f"Could not parse publication date for {episode_name}: {e}")

    # Update the current row with the fetched data
    metadata.at[index, 'category'] = category
    if pubdate is not None:
        metadata.at[index, 'pubdate'] = pubdate


metadata.head(5)


In [None]:
# Persist the enriched metadata as a gzip-compressed CSV, leaving out the row index
metadata.to_csv(
    'metadata.csv.gz',
    index=False,
    compression='gzip',
)

In [48]:
# Reload the enriched metadata from the gzip-compressed CSV written earlier
metadata = pd.read_csv(
    'metadata.csv.gz',
    compression='gzip',
)
metadata.head()

Unnamed: 0,show_uri,show_name,show_description,publisher,language,rss_link,episode_uri,episode_name,episode_description,duration,show_filename_prefix,episode_filename_prefix,category,pubdate
0,spotify:show:2NYtxEZyYelR6RMKmjfPLB,Kream in your Koffee,A 20-something blunt female takes on the world...,Katie Houle,['en'],https://anchor.fm/s/11b84b68/podcast/rss,spotify:episode:000A9sRBYdVh66csG2qEdj,1: It’s Christmas Time!,On the first ever episode of Kream in your Kof...,12.700133,show_2NYtxEZyYelR6RMKmjfPLB,000A9sRBYdVh66csG2qEdj,Leisure,2019-12-18
1,spotify:show:15iWCbU7QoO23EndPEO6aN,Morning Cup Of Murder,Ever wonder what murder took place on today in...,Morning Cup Of Murder,['en'],https://anchor.fm/s/b07181c/podcast/rss,spotify:episode:000HP8n3hNIfglT2wSI2cA,The Goleta Postal Facility shootings- January ...,"See something, say something. It’s a mantra ma...",6.019383,show_15iWCbU7QoO23EndPEO6aN,000HP8n3hNIfglT2wSI2cA,,
2,spotify:show:6vZRgUFTYwbAA79UNCADr4,Inside The 18 : A Podcast for Goalkeepers by G...,Inside the 18 is your source for all things Go...,Inside the 18 GK Media,['en'],https://anchor.fm/s/81a072c/podcast/rss,spotify:episode:001UfOruzkA3Bn1SPjcdfa,Ep.36 - Incorporating a Singular Goalkeeping C...,Today’s episode is a sit down Michael and Omar...,43.616333,show_6vZRgUFTYwbAA79UNCADr4,001UfOruzkA3Bn1SPjcdfa,Sports,2019-01-18
3,spotify:show:5BvKEjaMSuvUsGROGi2S7s,Arrowhead Live!,Your favorite podcast for everything @Chiefs! ...,Arrowhead Live!,['en-US'],https://anchor.fm/s/917dba4/podcast/rss,spotify:episode:001i89SvIQgDuuyC53hfBm,Episode 1: Arrowhead Live! Debut,Join us as we take a look at all current Chief...,58.1892,show_5BvKEjaMSuvUsGROGi2S7s,001i89SvIQgDuuyC53hfBm,Sports,2019-03-01
4,spotify:show:7w3h3umpH74veEJcbE6xf4,FBoL,"The comedy podcast about toxic characters, wri...",Emily Edwards,['en'],https://www.fuckboisoflit.com/episodes?format=rss,spotify:episode:0025RWNwe2lnp6HcnfzwzG,"The Lion, The Witch, And The Wardrobe - Ashley...",The modern morality tail of how to stay good f...,51.78205,show_7w3h3umpH74veEJcbE6xf4,0025RWNwe2lnp6HcnfzwzG,,


In [None]:
# Episodes that still have no category. Take an explicit copy: the rows of
# this frame are written to later with `.at[...]`, and assigning into a frame
# derived by filtering another one triggers pandas' SettingWithCopyWarning.
metadata_err = metadata[metadata.category.isna()].copy()

In [None]:
# (rows, columns) of the subset of episodes whose category is missing
metadata_err.shape

In [None]:
# Second pass over the episodes in metadata_err (rows with no category).
# Note: the writes below update metadata_err only; they are not propagated
# back to `metadata`.

# Retry policy for downloading a feed
max_attempts = 5
retry_delay = 20  # seconds between attempts

# Fresh per-feed lookup cache keyed by rss_link, so every feed is downloaded
# again for this pass but only once per show rather than once per episode.
# Only the fields needed below are kept to bound memory.
# Value: (category, {entry title: published string}), or None when the feed
# could not be fetched after all attempts.
feed_cache = {}

# Loop through each row in the metadata_err DataFrame
for index, row in metadata_err.iterrows():
    # Extract the episode_name and rss_link from the current row
    episode_name = row['episode_name']
    rss_link = row['rss_link']

    if rss_link not in feed_cache:
        # Recorded up front so a feed that fails is not retried for every episode
        feed_cache[rss_link] = None

        # Try to fetch the RSS feed from rss_link, up to max_attempts times in total
        for attempt in range(max_attempts):
            try:
                feed = feedparser.parse(rss_link)
                # feedparser reports most download failures through
                # `bozo_exception` instead of raising. Re-raise connection-level
                # errors (OSError covers URLError and socket errors) so the
                # retry logic actually applies to them.
                fetch_error = feed.get('bozo_exception')
                if isinstance(fetch_error, OSError) and not feed.entries:
                    raise fetch_error
                break
            except Exception as e:
                print(f"Error fetching RSS feed for {episode_name}: {e}")
                # No point in waiting after the final attempt
                if attempt < max_attempts - 1:
                    print(f"Retrying in {retry_delay} seconds... (Attempt {attempt+1}/{max_attempts})")
                    time.sleep(retry_delay)

        # If the maximum number of retries is reached, skip to the next row
        else:
            print(f"Max number of retries reached for {episode_name}. Skipping...")
            continue

        # Extract the category from the RSS feed's <channel> element.
        # `tags` may be missing or empty, in which case there is no category.
        tags = feed.feed.get('tags') or []
        category = tags[0].get('term') if tags else None

        # Map entry title -> published string. Entries without a title are
        # ignored; when titles repeat, the last entry in the feed wins.
        published_by_title = {}
        for item in feed.entries:
            title = item.get('title')
            if title is not None:
                published_by_title[title] = item.get('published')

        feed_cache[rss_link] = (category, published_by_title)

    cached = feed_cache[rss_link]
    if cached is None:
        # Feed could not be fetched; leave this row unchanged
        continue
    category, published_by_title = cached

    # Only rows whose episode_name exactly matches an entry title are updated
    if episode_name not in published_by_title:
        continue

    # Convert the entry's publication timestamp to a date (YYYY-MM-DD)
    published = published_by_title[episode_name]
    pubdate = None
    if published:
        try:
            pubdate = parser.parse(published).date()
        except (ValueError, OverflowError) as e:
            # A malformed date should not abort the whole run
            print(f"Could not parse publication date for {episode_name}: {e}")

    # Update the current row with the fetched data
    metadata_err.at[index, 'category'] = category
    if pubdate is not None:
        metadata_err.at[index, 'pubdate'] = pubdate


metadata_err.head(20)

In [None]:
# Per-column count of values that are still missing in metadata_err
metadata_err.isna().sum()