# Access RSS feeds and retrive more features

In [1]:
import pandas as pd
import numpy as np
import feedparser
from datetime import datetime
from dateutil import parser
import time

In [5]:
metadata = pd.read_csv('/Users/ramon.habtezghi/Desktop/THESIS/REPO/Thesis/metadata.csv.gz', compression='gzip')
metadata.isna().sum()

show_uri                       0
show_name                      0
show_description               2
publisher                      0
language                       0
rss_link                       0
episode_uri                    0
episode_name                   0
episode_description          205
duration                       0
show_filename_prefix           0
episode_filename_prefix        0
category                   30276
pubdate                    30266
dtype: int64

In [11]:
metadata_err = metadata[metadata.category.isna()]
metadata_err.shape

(30276, 14)

In [12]:
# Loop through each row in the metadata DataFrame
for index, row in metadata_err.iterrows():
    # Extract the episode_name and episode_uri from the current row
    episode_name = row['episode_name'].strip()
    rss_link = row['rss_link']

    # Try to fetch the RSS feed from the episode_uri, and retry up to 3 times if there's an error
    for attempt in range(5):
        try:
            feed = feedparser.parse(rss_link)
            break
        except Exception as e:
            print(f"Error fetching RSS feed for {episode_name}: {e}")
            print(f"Retrying in 20 seconds... (Attempt {attempt+1}/5)")
            time.sleep(20)

    # If the maximum number of retries is reached, skip to the next row
    else:
        print(f"Max number of retries reached for {episode_name}. Skipping...")
        continue

    # Extract the category from the RSS feed's <channel> element
    try:
        channel = feed.feed
        category = channel.tags[0].get('term')
        # Update the current row with the fetched data
        metadata_err.at[index, 'category'] = category
        # print(category)
    except AttributeError:
        category = None
        continue

    # Loop through each item in the RSS feed
    for item in feed.entries:
        # Check if the current item's title matches the episode_name
        if item.title == episode_name:
            # Extract pubdate from the current item and convert to YYYY-MM-DD format
            pubdate = parser.parse(item.published).date()
            # print(pubdate)

            # Update the current row with the fetched data
            metadata_err.at[index, 'pubdate'] = pubdate            

metadata_err.isna().sum()

show_uri                       0
show_name                      0
show_description               0
publisher                      0
language                       0
rss_link                       0
episode_uri                    0
episode_name                   0
episode_description           65
duration                       0
show_filename_prefix           0
episode_filename_prefix        0
category                    6000
pubdate                    20030
dtype: int64

In [14]:
print(metadata_err.shape)
print(metadata.shape)

(30276, 14)
(105360, 14)


In [18]:
list(metadata_err.columns)

['show_uri',
 'show_name',
 'show_description',
 'publisher',
 'language',
 'rss_link',
 'episode_uri',
 'episode_name',
 'episode_description',
 'duration',
 'show_filename_prefix',
 'episode_filename_prefix',
 'category',
 'pubdate']

In [19]:
# merge the two dataframes based on their common column (id)
metadata_merged = pd.merge(metadata, metadata_err, on=
                           ['show_uri','show_name','show_description','publisher', 'language','rss_link','episode_uri','episode_name',
                            'episode_description','duration','show_filename_prefix','episode_filename_prefix'], how='left')

print(metadata_merged.columns)
metadata_merged.isna().sum()

Index(['show_uri', 'show_name', 'show_description', 'publisher', 'language',
       'rss_link', 'episode_uri', 'episode_name', 'episode_description',
       'duration', 'show_filename_prefix', 'episode_filename_prefix',
       'category_x', 'pubdate_x', 'category_y', 'pubdate_y'],
      dtype='object')


show_uri                       0
show_name                      0
show_description               2
publisher                      0
language                       0
rss_link                       0
episode_uri                    0
episode_name                   0
episode_description          205
duration                       0
show_filename_prefix           0
episode_filename_prefix        0
category_x                 30276
pubdate_x                  30266
category_y                 81084
pubdate_y                  95114
dtype: int64

In [20]:
# fill the missing values in the merged dataframe with the values from the metadata_err dataframe
metadata_merged['category_x'].fillna(metadata_merged['category_y'], inplace=True)
metadata_merged['pubdate_x'].fillna(metadata_merged['pubdate_y'], inplace=True)

# drop the duplicate columns
metadata_merged.drop(['category_y', 'pubdate_y'], axis=1, inplace=True)
metadata_merged.rename(columns={'category_x': 'category', 'pubdate_x': 'pubdate'}, inplace=True)

# the merged dataframe now contains the missing values from the metadata_err dataframe
metadata_merged.isna().sum()

show_uri                       0
show_name                      0
show_description               2
publisher                      0
language                       0
rss_link                       0
episode_uri                    0
episode_name                   0
episode_description          205
duration                       0
show_filename_prefix           0
episode_filename_prefix        0
category                    6000
pubdate                    20030
dtype: int64

In [21]:
metadata_merged.shape

(105360, 14)

In [23]:
# Save new metadata to file
metadata_merged.to_csv('metadata.csv.gz', compression='gzip', index=False)