In [1]:
import requests
from bs4 import BeautifulSoup
from pydub import AudioSegment
import os
import string
import pandas as pd 
import re
pd.set_option('display.max_colwidth', None)
pd.options.display.max_rows = 500


### **Joe Rogan**

In [2]:
import requests
from xml.etree import ElementTree

def rss_to_dataframe(rss_url):
    response = requests.get(rss_url)
    root = ElementTree.fromstring(response.content)
    channel = root.find('channel')
    items = channel.findall('item')

    data = []
    for item in items:
        title = item.find('title').text
        episode_number = title.split('-')[0].strip('#').strip()
        # Attempt to extract the guest name from the title
        guest_name = ' '.join(title.split('-')[1:]).strip() if '-' in title else 'Unknown'
        download_url = item.find('enclosure').get('url')
        pub_date = item.find('pubDate').text
        # description = item.find('description').text

        data.append({
            'episode_number': episode_number,
            'guest_name': guest_name,
            'title': title,
            'download_url': download_url,
            'publication_date': pub_date,
            # 'description': description
        })

    df = pd.DataFrame(data)
    return df


In [3]:
rss_url = 'https://feeds.megaphone.fm/GLT1412515089'
df = rss_to_dataframe(rss_url)
df.head(5)

Unnamed: 0,episode_number,guest_name,title,download_url,publication_date
0,2116,Kevin James,#2116 - Kevin James,https://traffic.megaphone.fm/GLT9792092908.mp3?updated=1709916107,"Fri, 08 Mar 2024 18:00:00 -0000"
1,2115,Riley Gaines,#2115 - Riley Gaines,https://traffic.megaphone.fm/GLT2576028300.mp3?updated=1709826288,"Thu, 07 Mar 2024 18:00:00 -0000"
2,2114,Zack Snyder,#2114 - Zack Snyder,https://traffic.megaphone.fm/GLT5800487718.mp3?updated=1709745613,"Wed, 06 Mar 2024 20:53:00 -0000"
3,2113,Christopher Rufo,#2113 - Christopher Rufo,https://traffic.megaphone.fm/GLT5991726151.mp3?updated=1709675927,"Tue, 05 Mar 2024 18:00:00 -0000"
4,2112,Dan Soder,#2112 - Dan Soder,https://traffic.megaphone.fm/GLT4851895679.mp3?updated=1709313869,"Fri, 01 Mar 2024 18:00:00 -0000"


In [4]:
df_original = rss_to_dataframe(rss_url)
df_original

Unnamed: 0,episode_number,guest_name,title,download_url,publication_date
0,2116,Kevin James,#2116 - Kevin James,https://traffic.megaphone.fm/GLT9792092908.mp3?updated=1709916107,"Fri, 08 Mar 2024 18:00:00 -0000"
1,2115,Riley Gaines,#2115 - Riley Gaines,https://traffic.megaphone.fm/GLT2576028300.mp3?updated=1709826288,"Thu, 07 Mar 2024 18:00:00 -0000"
2,2114,Zack Snyder,#2114 - Zack Snyder,https://traffic.megaphone.fm/GLT5800487718.mp3?updated=1709745613,"Wed, 06 Mar 2024 20:53:00 -0000"
3,2113,Christopher Rufo,#2113 - Christopher Rufo,https://traffic.megaphone.fm/GLT5991726151.mp3?updated=1709675927,"Tue, 05 Mar 2024 18:00:00 -0000"
4,2112,Dan Soder,#2112 - Dan Soder,https://traffic.megaphone.fm/GLT4851895679.mp3?updated=1709313869,"Fri, 01 Mar 2024 18:00:00 -0000"
...,...,...,...,...,...
2270,5,"John Heffron, Ari Shaffir (Part 1)","#5 - John Heffron, Ari Shaffir (Part 1)",https://traffic.megaphone.fm/GLT6509617473.mp3?updated=1707793295,"Thu, 21 Jan 2010 00:00:00 -0000"
2271,5,"John Heffron, Ari Shaffir (Part 2)","#5 - John Heffron, Ari Shaffir (Part 2)",https://traffic.megaphone.fm/GLT1638709117.mp3?updated=1707792863,"Thu, 21 Jan 2010 00:00:00 -0000"
2272,3,Ari Shaffir,#3 - Ari Shaffir,https://traffic.megaphone.fm/GLT3348401207.mp3?updated=1707791361,"Wed, 06 Jan 2010 00:00:00 -0000"
2273,2,Brian Redban,#2 - Brian Redban,https://traffic.megaphone.fm/GLT3748439860.mp3?updated=1707792837,"Tue, 29 Dec 2009 00:00:00 -0000"


In [5]:
# Temporary column to check for numeric values
df['is_numeric'] = pd.to_numeric(df['episode_number'], errors='coerce').notna()

df['is_numeric'].value_counts()

is_numeric
True     2040
False     235
Name: count, dtype: int64

In [6]:
df = df[df['is_numeric'] == True]

df['is_numeric'].value_counts()

is_numeric
True    2040
Name: count, dtype: int64

In [7]:
df.drop('is_numeric', inplace=True, axis = 1)

In [8]:
df['episode_number'] = df['episode_number'].astype(int)

df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 2040 entries, 0 to 2274
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   episode_number    2040 non-null   int64 
 1   guest_name        2040 non-null   object
 2   title             2040 non-null   object
 3   download_url      2040 non-null   object
 4   publication_date  2040 non-null   object
dtypes: int64(1), object(4)
memory usage: 95.6+ KB


In [9]:
print(df['publication_date'].head())

0    Fri, 08 Mar 2024 18:00:00 -0000
1    Thu, 07 Mar 2024 18:00:00 -0000
2    Wed, 06 Mar 2024 20:53:00 -0000
3    Tue, 05 Mar 2024 18:00:00 -0000
4    Fri, 01 Mar 2024 18:00:00 -0000
Name: publication_date, dtype: object


In [10]:
df['publication_date'] = pd.to_datetime(df['publication_date'], format='%a, %d %b %Y %H:%M:%S %z')

In [11]:
df['episode_number'].value_counts()

episode_number
136     5
94      4
140     4
134     3
211     2
       ..
1452    1
1453    1
1454    1
1455    1
1       1
Name: count, Length: 2002, dtype: int64

In [12]:
df.drop('guest_name', inplace=True, axis=1)

In [13]:
def load_transcripts(df):
    """
    Loads the transcripts from text files specified in a DataFrame column into a list of lists.

    Each sublist in the returned list contains the transcription text of a single podcast episode.
    If a transcription file is missing or the episode does not have an associated transcript,
    an empty string is added instead.

    Parameters:
        df (pd.DataFrame): The DataFrame containing a 'transcription_file' column with paths to the transcription text files.

    Returns:
        list of lists: A list where each element is a list containing the transcription text of an episode.
                    Episodes without a transcription are represented by an empty string in their sublist.

    Example:
        >>> transcript_texts = load_transcripts(df_tal)
        This returns a list of lists where each sublist contains the transcription text of an episode.
    """

    transcripts = []  # This will hold all transcriptions as a list of lists
    for transcript_file in df['transcription_file']:
        if transcript_file != 'No transcript available':
            try:
                with open(transcript_file, 'r', encoding='utf-8') as file:
                    transcription_text = file.read()
            except FileNotFoundError:
                print(f"File not found: {transcript_file}")
                transcription_text = ""  # Handle missing files by adding an empty string
        else:
            transcription_text = ""  # No transcript available for this row
        
        # Append the transcription text as a new list (to keep it as a list of lists)
        transcripts.append([transcription_text])
    return transcripts

In [14]:
import os

# Define the path to the folder containing the transcriptions
transcriptions_folder = 'JOE'

# Create a new column for the transcriptions, initialize with None or any suitable placeholder

df['transcription_file'] = 'No transcript available'

def find_transcript_file_joe(episode_number):
    """
    Look for a transcript file matching the episode number.
    Returns the filename if found, otherwise returns a placeholder string.
    """
    for filename in os.listdir(transcriptions_folder):
        if filename.startswith(f"#{episode_number} -") and filename.endswith("-transcript.txt"):
            return os.path.join(transcriptions_folder, filename)
    return 'No transcript available'

for index, row in df.iterrows():
    # Get the episode number and find matching transcript file
    transcript_file = find_transcript_file_joe(row['episode_number'])
    
    # Add the file path or name to the DataFrame
    df.at[index, 'transcription_file'] = transcript_file

In [None]:
df_joe_final = df.loc[df['transcription_file'] != 'No transcript available']
df_joe_final.reset_index(drop=True, inplace=True)

In [None]:
# df_joe_final.to_csv('joe_rogan_podcast_and_transcripts.csv',index=True)

In [16]:
# Saving non filtered dataset without transcripts, if they are needed later, remove the .iloc condition
df.reset_index(drop=True, inplace=True)
df.to_csv('joe_rogan_podcast_dataset.csv', index=True)

### **JOE ROGAN TRANSCRIPTS LOAD**

In [None]:
joe_transcript_texts = load_transcripts(df_joe_final)

In [None]:
len(joe_transcript_texts) == len(df_joe_final)

### **Ben Shapiro**

In [30]:
def rss_to_dataframe_ben(rss_url):
    response = requests.get(rss_url)
    root = ElementTree.fromstring(response.content)
    channel = root.find('channel')
    items = channel.findall('item')

    data = []
    for item in items:
        title = item.find('title').text
        # Use regular expression to extract the episode number more reliably
        episode_number_match = re.search(r'Ep\.?\s*(\d+)', title, re.IGNORECASE)
        episode_number = episode_number_match.group(1) if episode_number_match else 'Unknown'

        # Extract guest name more reliably
        guest_name = title.split('-')[1].strip() if '-' in title else 'Unknown'
        download_url = item.find('enclosure').get('url')
        pub_date = item.find('pubDate').text

        data.append({
            'episode_number': episode_number,
            'guest_name': guest_name,
            'title': title,
            'download_url': download_url,
            'publication_date': pub_date,
        })

    df = pd.DataFrame(data)
    return df

In [31]:
rss_url_ben = "https://feeds.simplecast.com/C0fPpQ64"
df_ben = rss_to_dataframe_ben(rss_url_ben)
df_ben

Unnamed: 0,episode_number,guest_name,title,download_url,publication_date
0,Unknown,Unknown,"""What We Saw: An Empire of Terror"" w/ Bill Whittle",https://claritaspod.com/measure/arttrk.com/p/24FDE/verifi.podscribe.com/rss/p/pfx.vpixl.com/2jSe3/prfx.byspotify.com/e/dts.podtrac.com/redirect.mp3/mgln.ai/e/121/injector.simplecastaudio.com/01514e65-f508-4e0c-99d9-aad07cea61ff/episodes/9144770f-f24a-403a-b125-2c77c42943bc/audio/128/default.mp3?aid=rss_feed&awCollectionId=01514e65-f508-4e0c-99d9-aad07cea61ff&awEpisodeId=9144770f-f24a-403a-b125-2c77c42943bc&feed=C0fPpQ64,"Sun, 10 Mar 2024 12:00:00 +0000"
1,10,Unknown,Facts Ep. 10: Five Things You Don't Know About Russia,https://claritaspod.com/measure/arttrk.com/p/24FDE/verifi.podscribe.com/rss/p/pfx.vpixl.com/2jSe3/prfx.byspotify.com/e/dts.podtrac.com/redirect.mp3/mgln.ai/e/121/injector.simplecastaudio.com/01514e65-f508-4e0c-99d9-aad07cea61ff/episodes/58d205a4-b7fe-4157-9c93-2bb9f9e4af2b/audio/128/default.mp3?aid=rss_feed&awCollectionId=01514e65-f508-4e0c-99d9-aad07cea61ff&awEpisodeId=58d205a4-b7fe-4157-9c93-2bb9f9e4af2b&feed=C0fPpQ64,"Sat, 9 Mar 2024 15:00:00 +0000"
2,1921,STATE OF THE UNION: Old Man Screams At Moon,Ep. 1921 - STATE OF THE UNION: Old Man Screams At Moon,https://claritaspod.com/measure/arttrk.com/p/24FDE/verifi.podscribe.com/rss/p/pfx.vpixl.com/2jSe3/prfx.byspotify.com/e/dts.podtrac.com/redirect.mp3/mgln.ai/e/121/injector.simplecastaudio.com/01514e65-f508-4e0c-99d9-aad07cea61ff/episodes/80453bb7-e6b6-42b7-a66a-36a80b4b4bc2/audio/128/default.mp3?aid=rss_feed&awCollectionId=01514e65-f508-4e0c-99d9-aad07cea61ff&awEpisodeId=80453bb7-e6b6-42b7-a66a-36a80b4b4bc2&feed=C0fPpQ64,"Fri, 8 Mar 2024 16:40:16 +0000"
3,Unknown,Unknown,Daily Wire Backstage State of the Union 2024 Coverage,https://claritaspod.com/measure/arttrk.com/p/24FDE/verifi.podscribe.com/rss/p/pfx.vpixl.com/2jSe3/prfx.byspotify.com/e/dts.podtrac.com/redirect.mp3/mgln.ai/e/121/injector.simplecastaudio.com/01514e65-f508-4e0c-99d9-aad07cea61ff/episodes/80b31a23-5284-44df-a318-21749d08a556/audio/128/default.mp3?aid=rss_feed&awCollectionId=01514e65-f508-4e0c-99d9-aad07cea61ff&awEpisodeId=80b31a23-5284-44df-a318-21749d08a556&feed=C0fPpQ64,"Fri, 8 Mar 2024 06:02:41 +0000"
4,1920,"IT’S OFFICIALLY ON: Trump vs. Biden II, Electric Boogaloo","Ep. 1920 - IT’S OFFICIALLY ON: Trump vs. Biden II, Electric Boogaloo",https://claritaspod.com/measure/arttrk.com/p/24FDE/verifi.podscribe.com/rss/p/pfx.vpixl.com/2jSe3/prfx.byspotify.com/e/dts.podtrac.com/redirect.mp3/mgln.ai/e/121/injector.simplecastaudio.com/01514e65-f508-4e0c-99d9-aad07cea61ff/episodes/77d7a02c-b1e5-4dd7-b46f-39748c35fc94/audio/128/default.mp3?aid=rss_feed&awCollectionId=01514e65-f508-4e0c-99d9-aad07cea61ff&awEpisodeId=77d7a02c-b1e5-4dd7-b46f-39748c35fc94&feed=C0fPpQ64,"Thu, 7 Mar 2024 15:50:04 +0000"
...,...,...,...,...,...
2288,5,Rubios PC Problem,Ep. 5 - Rubios PC Problem,https://claritaspod.com/measure/arttrk.com/p/24FDE/verifi.podscribe.com/rss/p/pfx.vpixl.com/2jSe3/prfx.byspotify.com/e/dts.podtrac.com/redirect.mp3/mgln.ai/e/121/injector.simplecastaudio.com/01514e65-f508-4e0c-99d9-aad07cea61ff/episodes/c884125c-5c34-4477-993c-0733d363983f/audio/128/default.mp3?aid=rss_feed&awCollectionId=01514e65-f508-4e0c-99d9-aad07cea61ff&awEpisodeId=c884125c-5c34-4477-993c-0733d363983f&feed=C0fPpQ64,"Wed, 7 Oct 2015 15:00:00 +0000"
2289,4,Russia in Syria,Ep. 4 - Russia in Syria,https://claritaspod.com/measure/arttrk.com/p/24FDE/verifi.podscribe.com/rss/p/pfx.vpixl.com/2jSe3/prfx.byspotify.com/e/dts.podtrac.com/redirect.mp3/mgln.ai/e/121/injector.simplecastaudio.com/01514e65-f508-4e0c-99d9-aad07cea61ff/episodes/e2b81619-3581-4d27-bcc7-d0874744e54c/audio/128/default.mp3?aid=rss_feed&awCollectionId=01514e65-f508-4e0c-99d9-aad07cea61ff&awEpisodeId=e2b81619-3581-4d27-bcc7-d0874744e54c&feed=C0fPpQ64,"Thu, 1 Oct 2015 15:00:00 +0000"
2290,3,Cruz in the Crosshairs,Ep. 3 - Cruz in the Crosshairs,https://claritaspod.com/measure/arttrk.com/p/24FDE/verifi.podscribe.com/rss/p/pfx.vpixl.com/2jSe3/prfx.byspotify.com/e/dts.podtrac.com/redirect.mp3/mgln.ai/e/121/injector.simplecastaudio.com/01514e65-f508-4e0c-99d9-aad07cea61ff/episodes/84bff4ef-7698-4376-9945-a930b8ed8296/audio/128/default.mp3?aid=rss_feed&awCollectionId=01514e65-f508-4e0c-99d9-aad07cea61ff&awEpisodeId=84bff4ef-7698-4376-9945-a930b8ed8296&feed=C0fPpQ64,"Wed, 30 Sep 2015 15:00:00 +0000"
2291,2,Shout Your Abortion,Ep. 2 - Shout Your Abortion,https://claritaspod.com/measure/arttrk.com/p/24FDE/verifi.podscribe.com/rss/p/pfx.vpixl.com/2jSe3/prfx.byspotify.com/e/dts.podtrac.com/redirect.mp3/mgln.ai/e/121/injector.simplecastaudio.com/01514e65-f508-4e0c-99d9-aad07cea61ff/episodes/1461f627-f0b1-478f-ae26-8b32332e065c/audio/128/default.mp3?aid=rss_feed&awCollectionId=01514e65-f508-4e0c-99d9-aad07cea61ff&awEpisodeId=1461f627-f0b1-478f-ae26-8b32332e065c&feed=C0fPpQ64,"Tue, 22 Sep 2015 15:00:00 +0000"


In [32]:
df_ben['publication_date'] = pd.to_datetime(df_ben['publication_date'], format='%a, %d %b %Y %H:%M:%S %z')

In [33]:
df_ben['episode_number'].value_counts()

episode_number
Unknown    223
10           3
7            3
2            3
6            3
          ... 
1306         1
1307         1
1308         1
1309         1
86           1
Name: count, Length: 1920, dtype: int64

In [34]:
df_ben['episode_number'] = pd.to_numeric(df_ben['episode_number'], errors='coerce')

In [35]:
# Drop rows where episode_number is NaN (i.e., where conversion failed)
df_ben.dropna(subset=['episode_number'], inplace=True)

In [36]:
df_ben = df_ben.loc[df_ben['guest_name'] != 'Unknown']


In [37]:
df_ben['episode_number'] = df_ben['episode_number'].astype(int)

df_ben.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1920 entries, 2 to 2292
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   episode_number    1920 non-null   int64              
 1   guest_name        1920 non-null   object             
 2   title             1920 non-null   object             
 3   download_url      1920 non-null   object             
 4   publication_date  1920 non-null   datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), int64(1), object(3)
memory usage: 90.0+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ben['episode_number'] = df_ben['episode_number'].astype(int)


In [38]:
df_ben.drop('guest_name', inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ben.drop('guest_name', inplace=True, axis=1)


In [39]:
df_ben

Unnamed: 0,episode_number,title,download_url,publication_date
2,1921,Ep. 1921 - STATE OF THE UNION: Old Man Screams At Moon,https://claritaspod.com/measure/arttrk.com/p/24FDE/verifi.podscribe.com/rss/p/pfx.vpixl.com/2jSe3/prfx.byspotify.com/e/dts.podtrac.com/redirect.mp3/mgln.ai/e/121/injector.simplecastaudio.com/01514e65-f508-4e0c-99d9-aad07cea61ff/episodes/80453bb7-e6b6-42b7-a66a-36a80b4b4bc2/audio/128/default.mp3?aid=rss_feed&awCollectionId=01514e65-f508-4e0c-99d9-aad07cea61ff&awEpisodeId=80453bb7-e6b6-42b7-a66a-36a80b4b4bc2&feed=C0fPpQ64,2024-03-08 16:40:16+00:00
4,1920,"Ep. 1920 - IT’S OFFICIALLY ON: Trump vs. Biden II, Electric Boogaloo",https://claritaspod.com/measure/arttrk.com/p/24FDE/verifi.podscribe.com/rss/p/pfx.vpixl.com/2jSe3/prfx.byspotify.com/e/dts.podtrac.com/redirect.mp3/mgln.ai/e/121/injector.simplecastaudio.com/01514e65-f508-4e0c-99d9-aad07cea61ff/episodes/77d7a02c-b1e5-4dd7-b46f-39748c35fc94/audio/128/default.mp3?aid=rss_feed&awCollectionId=01514e65-f508-4e0c-99d9-aad07cea61ff&awEpisodeId=77d7a02c-b1e5-4dd7-b46f-39748c35fc94&feed=C0fPpQ64,2024-03-07 15:50:04+00:00
5,1919,Ep. 1919 - Nikki Haley Is OUT,https://claritaspod.com/measure/arttrk.com/p/24FDE/verifi.podscribe.com/rss/p/pfx.vpixl.com/2jSe3/prfx.byspotify.com/e/dts.podtrac.com/redirect.mp3/mgln.ai/e/121/injector.simplecastaudio.com/01514e65-f508-4e0c-99d9-aad07cea61ff/episodes/c3d0405a-7fb4-4710-a02d-6915b41482e6/audio/128/default.mp3?aid=rss_feed&awCollectionId=01514e65-f508-4e0c-99d9-aad07cea61ff&awEpisodeId=c3d0405a-7fb4-4710-a02d-6915b41482e6&feed=C0fPpQ64,2024-03-06 16:20:13+00:00
6,1918,Ep. 1918 - Peeing in Your Own Eyes To Stop Donald Trump,https://claritaspod.com/measure/arttrk.com/p/24FDE/verifi.podscribe.com/rss/p/pfx.vpixl.com/2jSe3/prfx.byspotify.com/e/dts.podtrac.com/redirect.mp3/mgln.ai/e/121/injector.simplecastaudio.com/01514e65-f508-4e0c-99d9-aad07cea61ff/episodes/1d84c5c5-2b5b-4be6-9708-b7a85359b450/audio/128/default.mp3?aid=rss_feed&awCollectionId=01514e65-f508-4e0c-99d9-aad07cea61ff&awEpisodeId=1d84c5c5-2b5b-4be6-9708-b7a85359b450&feed=C0fPpQ64,2024-03-05 16:03:39+00:00
7,1917,Ep. 1917 - UNANIMOUS Supreme Court Puts Trump Back On The Ballot,https://claritaspod.com/measure/arttrk.com/p/24FDE/verifi.podscribe.com/rss/p/pfx.vpixl.com/2jSe3/prfx.byspotify.com/e/dts.podtrac.com/redirect.mp3/mgln.ai/e/121/injector.simplecastaudio.com/01514e65-f508-4e0c-99d9-aad07cea61ff/episodes/d06955a6-e7ad-480a-80b4-cac95f816b4e/audio/128/default.mp3?aid=rss_feed&awCollectionId=01514e65-f508-4e0c-99d9-aad07cea61ff&awEpisodeId=d06955a6-e7ad-480a-80b4-cac95f816b4e&feed=C0fPpQ64,2024-03-04 16:14:37+00:00
...,...,...,...,...
2288,5,Ep. 5 - Rubios PC Problem,https://claritaspod.com/measure/arttrk.com/p/24FDE/verifi.podscribe.com/rss/p/pfx.vpixl.com/2jSe3/prfx.byspotify.com/e/dts.podtrac.com/redirect.mp3/mgln.ai/e/121/injector.simplecastaudio.com/01514e65-f508-4e0c-99d9-aad07cea61ff/episodes/c884125c-5c34-4477-993c-0733d363983f/audio/128/default.mp3?aid=rss_feed&awCollectionId=01514e65-f508-4e0c-99d9-aad07cea61ff&awEpisodeId=c884125c-5c34-4477-993c-0733d363983f&feed=C0fPpQ64,2015-10-07 15:00:00+00:00
2289,4,Ep. 4 - Russia in Syria,https://claritaspod.com/measure/arttrk.com/p/24FDE/verifi.podscribe.com/rss/p/pfx.vpixl.com/2jSe3/prfx.byspotify.com/e/dts.podtrac.com/redirect.mp3/mgln.ai/e/121/injector.simplecastaudio.com/01514e65-f508-4e0c-99d9-aad07cea61ff/episodes/e2b81619-3581-4d27-bcc7-d0874744e54c/audio/128/default.mp3?aid=rss_feed&awCollectionId=01514e65-f508-4e0c-99d9-aad07cea61ff&awEpisodeId=e2b81619-3581-4d27-bcc7-d0874744e54c&feed=C0fPpQ64,2015-10-01 15:00:00+00:00
2290,3,Ep. 3 - Cruz in the Crosshairs,https://claritaspod.com/measure/arttrk.com/p/24FDE/verifi.podscribe.com/rss/p/pfx.vpixl.com/2jSe3/prfx.byspotify.com/e/dts.podtrac.com/redirect.mp3/mgln.ai/e/121/injector.simplecastaudio.com/01514e65-f508-4e0c-99d9-aad07cea61ff/episodes/84bff4ef-7698-4376-9945-a930b8ed8296/audio/128/default.mp3?aid=rss_feed&awCollectionId=01514e65-f508-4e0c-99d9-aad07cea61ff&awEpisodeId=84bff4ef-7698-4376-9945-a930b8ed8296&feed=C0fPpQ64,2015-09-30 15:00:00+00:00
2291,2,Ep. 2 - Shout Your Abortion,https://claritaspod.com/measure/arttrk.com/p/24FDE/verifi.podscribe.com/rss/p/pfx.vpixl.com/2jSe3/prfx.byspotify.com/e/dts.podtrac.com/redirect.mp3/mgln.ai/e/121/injector.simplecastaudio.com/01514e65-f508-4e0c-99d9-aad07cea61ff/episodes/1461f627-f0b1-478f-ae26-8b32332e065c/audio/128/default.mp3?aid=rss_feed&awCollectionId=01514e65-f508-4e0c-99d9-aad07cea61ff&awEpisodeId=1461f627-f0b1-478f-ae26-8b32332e065c&feed=C0fPpQ64,2015-09-22 15:00:00+00:00


In [40]:
import os

# Define the path to the folder containing the transcriptions
transcriptions_folder = 'BEN'

# Create a new column for the transcriptions, initialize with None or any suitable placeholder

df_ben['transcription_file'] = 'No transcript available'

def find_transcript_file_ben(episode_number):
    """
    Look for a transcript file matching the episode number.
    Returns the filename if found, otherwise returns a placeholder string.
    """
    for filename in os.listdir(transcriptions_folder):
        if filename.startswith(f"Ep. {episode_number} -") and filename.endswith("-transcript.txt"):
            return os.path.join(transcriptions_folder, filename)
    return 'No transcript available'

for index, row in df_ben.iterrows():
    # Get the episode number and find matching transcript file
    transcript_file = find_transcript_file_ben(row['episode_number'])
    
    # Add the file path or name to the DataFrame
    df_ben.at[index, 'transcription_file'] = transcript_file

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ben['transcription_file'] = 'No transcript available'


In [None]:
df_ben_final = df_ben.loc[df_ben['transcription_file'] != 'No transcript available']
df_ben_final.reset_index(drop=True, inplace=True)

In [43]:
# Saving non filtered dataset without transcripts, if they are needed later, remove the .iloc condition
df_ben.reset_index(drop=True, inplace=True)
df_ben.to_csv('ben_shapiro_podcast_dataset.csv', index=True)

### **BEN SHAPIRO TRANSCRIPTS LOAD**

In [None]:
ben_transcript_texts = load_transcripts(df_ben_final)

In [None]:
len(ben_transcript_texts) == len(df_ben_final)

# **This American Life**

In [44]:
from lxml import etree as ET

def rss_to_dataframe_american_life_from_file(filename):
    with open(filename, 'rb') as file:  # Open as binary due to lxml's handling
        content = file.read()
    root = ET.fromstring(content)
    channel = root.find('channel')
    items = channel.findall('item')

    data = []
    episode_count = 0

    for item in items:
        if episode_count >= 1000:
            break

        title = item.find('title').text
        episode_number_match = re.match(r'(\d+):', title)
        episode_number = episode_number_match.group(1) if episode_number_match else 'Unknown'
        
        description = item.find('description').text
        download_url = item.find('enclosure').get('url')
        pub_date = item.find('pubDate').text

        data.append({
            'episode_number': episode_number,
            'title': title,
            # 'description': description,
            'download_url': download_url,
            'publication_date': pub_date,
        })

        episode_count += 1

    df = pd.DataFrame(data)
    return df

In [45]:
rss_url_american_life = "TALArchive.xml"
df_american_life = rss_to_dataframe_american_life_from_file(rss_url_american_life)
df_american_life

Unnamed: 0,episode_number,title,download_url,publication_date
0,825,825: Yousef,https://pfx.vpixl.com/6qj4J/dts.podtrac.com/redirect.mp3/chtbl.com/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/bbbcc290-ed3b-44a2-8e5d-5513e38cfe20/episodes/4ffd11f6-dc7b-4682-8080-378f24f41c59/audio/128/default.mp3,"Fri, 01 Mar 2024 18:00:00 -0400"
1,824,824: Family Meeting,https://pfx.vpixl.com/6qj4J/dts.podtrac.com/redirect.mp3/chtbl.com/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/bbbcc290-ed3b-44a2-8e5d-5513e38cfe20/episodes/65312b31-dc48-4dde-adb2-269d8f7da365/audio/128/default.mp3,"Fri, 23 Feb 2024 18:00:00 -0400"
2,823,823: The Question Trap,https://pfx.vpixl.com/6qj4J/dts.podtrac.com/redirect.mp3/chtbl.com/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/bbbcc290-ed3b-44a2-8e5d-5513e38cfe20/episodes/5e22d88d-a289-4135-801a-57e6941a1ea3/audio/128/default.mp3,"Fri, 02 Feb 2024 18:00:00 -0400"
3,822,822: The Words to Say It,https://pfx.vpixl.com/6qj4J/dts.podtrac.com/redirect.mp3/chtbl.com/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/bbbcc290-ed3b-44a2-8e5d-5513e38cfe20/episodes/71a53135-f935-412a-b5bf-a07289de0f30/audio/128/default.mp3,"Fri, 26 Jan 2024 18:00:00 -0400"
4,821,821: Embrace the Suck,https://pfx.vpixl.com/6qj4J/dts.podtrac.com/redirect.mp3/chtbl.com/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/bbbcc290-ed3b-44a2-8e5d-5513e38cfe20/episodes/938413db-5e79-44e4-9bd0-cba28b1eee22/audio/128/default.mp3,"Fri, 19 Jan 2024 18:00:00 -0400"
...,...,...,...,...
819,5,5: Anger and Forgiveness,https://www.thisamericanlife.org/sites/default/files/audio/5/G6vW91kUb18b0Vm6ep5eO8ekl8JT3gPF4SYIDbYAWKo/5.mp3,"Fri, 15 Dec 1995 18:00:00 -0400"
820,4,4: Vacations,https://www.thisamericanlife.org/sites/default/files/audio/4/S4MTKexcwCBxC_OBqs33YVQm2pf85vKt5vmxhsW8hWg/4.mp3,"Fri, 08 Dec 1995 18:00:00 -0400"
821,3,3: Poultry Slam 1995,https://www.thisamericanlife.org/sites/default/files/audio/3/3fZqXh-aDaWx6N3Dg2DkXhAhu5SyUIpxcB16R_Jap90/3.mp3,"Fri, 01 Dec 1995 18:00:00 -0400"
822,2,2: Small Scale Sin,https://www.thisamericanlife.org/sites/default/files/audio/2/s6iVCvP0p-f1qwIeRvubpMzQ8TR05BnA3AsSm1uXuxs/2.mp3,"Fri, 24 Nov 1995 18:00:00 -0400"


In [46]:
df_american_life['episode_number'] = df_american_life['episode_number'].astype(int)
df_american_life['publication_date'] = pd.to_datetime(df_american_life['publication_date'], format='%a, %d %b %Y %H:%M:%S %z')

In [47]:
df_american_life.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 824 entries, 0 to 823
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype                    
---  ------            --------------  -----                    
 0   episode_number    824 non-null    int64                    
 1   title             824 non-null    object                   
 2   download_url      824 non-null    object                   
 3   publication_date  824 non-null    datetime64[ns, UTC-04:00]
dtypes: datetime64[ns, UTC-04:00](1), int64(1), object(2)
memory usage: 25.9+ KB


In [48]:
df_american_life

Unnamed: 0,episode_number,title,download_url,publication_date
0,825,825: Yousef,https://pfx.vpixl.com/6qj4J/dts.podtrac.com/redirect.mp3/chtbl.com/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/bbbcc290-ed3b-44a2-8e5d-5513e38cfe20/episodes/4ffd11f6-dc7b-4682-8080-378f24f41c59/audio/128/default.mp3,2024-03-01 18:00:00-04:00
1,824,824: Family Meeting,https://pfx.vpixl.com/6qj4J/dts.podtrac.com/redirect.mp3/chtbl.com/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/bbbcc290-ed3b-44a2-8e5d-5513e38cfe20/episodes/65312b31-dc48-4dde-adb2-269d8f7da365/audio/128/default.mp3,2024-02-23 18:00:00-04:00
2,823,823: The Question Trap,https://pfx.vpixl.com/6qj4J/dts.podtrac.com/redirect.mp3/chtbl.com/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/bbbcc290-ed3b-44a2-8e5d-5513e38cfe20/episodes/5e22d88d-a289-4135-801a-57e6941a1ea3/audio/128/default.mp3,2024-02-02 18:00:00-04:00
3,822,822: The Words to Say It,https://pfx.vpixl.com/6qj4J/dts.podtrac.com/redirect.mp3/chtbl.com/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/bbbcc290-ed3b-44a2-8e5d-5513e38cfe20/episodes/71a53135-f935-412a-b5bf-a07289de0f30/audio/128/default.mp3,2024-01-26 18:00:00-04:00
4,821,821: Embrace the Suck,https://pfx.vpixl.com/6qj4J/dts.podtrac.com/redirect.mp3/chtbl.com/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/bbbcc290-ed3b-44a2-8e5d-5513e38cfe20/episodes/938413db-5e79-44e4-9bd0-cba28b1eee22/audio/128/default.mp3,2024-01-19 18:00:00-04:00
...,...,...,...,...
819,5,5: Anger and Forgiveness,https://www.thisamericanlife.org/sites/default/files/audio/5/G6vW91kUb18b0Vm6ep5eO8ekl8JT3gPF4SYIDbYAWKo/5.mp3,1995-12-15 18:00:00-04:00
820,4,4: Vacations,https://www.thisamericanlife.org/sites/default/files/audio/4/S4MTKexcwCBxC_OBqs33YVQm2pf85vKt5vmxhsW8hWg/4.mp3,1995-12-08 18:00:00-04:00
821,3,3: Poultry Slam 1995,https://www.thisamericanlife.org/sites/default/files/audio/3/3fZqXh-aDaWx6N3Dg2DkXhAhu5SyUIpxcB16R_Jap90/3.mp3,1995-12-01 18:00:00-04:00
822,2,2: Small Scale Sin,https://www.thisamericanlife.org/sites/default/files/audio/2/s6iVCvP0p-f1qwIeRvubpMzQ8TR05BnA3AsSm1uXuxs/2.mp3,1995-11-24 18:00:00-04:00


In [49]:
df_american_life['episode_number'].value_counts()

episode_number
825    1
271    1
281    1
280    1
279    1
      ..
548    1
547    1
546    1
545    1
1      1
Name: count, Length: 824, dtype: int64

In [50]:
df_american_life.head(500)

Unnamed: 0,episode_number,title,download_url,publication_date
0,825,825: Yousef,https://pfx.vpixl.com/6qj4J/dts.podtrac.com/redirect.mp3/chtbl.com/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/bbbcc290-ed3b-44a2-8e5d-5513e38cfe20/episodes/4ffd11f6-dc7b-4682-8080-378f24f41c59/audio/128/default.mp3,2024-03-01 18:00:00-04:00
1,824,824: Family Meeting,https://pfx.vpixl.com/6qj4J/dts.podtrac.com/redirect.mp3/chtbl.com/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/bbbcc290-ed3b-44a2-8e5d-5513e38cfe20/episodes/65312b31-dc48-4dde-adb2-269d8f7da365/audio/128/default.mp3,2024-02-23 18:00:00-04:00
2,823,823: The Question Trap,https://pfx.vpixl.com/6qj4J/dts.podtrac.com/redirect.mp3/chtbl.com/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/bbbcc290-ed3b-44a2-8e5d-5513e38cfe20/episodes/5e22d88d-a289-4135-801a-57e6941a1ea3/audio/128/default.mp3,2024-02-02 18:00:00-04:00
3,822,822: The Words to Say It,https://pfx.vpixl.com/6qj4J/dts.podtrac.com/redirect.mp3/chtbl.com/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/bbbcc290-ed3b-44a2-8e5d-5513e38cfe20/episodes/71a53135-f935-412a-b5bf-a07289de0f30/audio/128/default.mp3,2024-01-26 18:00:00-04:00
4,821,821: Embrace the Suck,https://pfx.vpixl.com/6qj4J/dts.podtrac.com/redirect.mp3/chtbl.com/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/bbbcc290-ed3b-44a2-8e5d-5513e38cfe20/episodes/938413db-5e79-44e4-9bd0-cba28b1eee22/audio/128/default.mp3,2024-01-19 18:00:00-04:00
5,820,820: It Wouldn’t Be Make-Believe If You’d Believe In Me,https://pfx.vpixl.com/6qj4J/dts.podtrac.com/redirect.mp3/chtbl.com/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/bbbcc290-ed3b-44a2-8e5d-5513e38cfe20/episodes/b918094e-979a-44c0-9377-977d396748e4/audio/128/default.mp3,2023-12-22 18:00:00-04:00
6,819,819: Special Bonus Podcast — Yousef’s Week,https://pfx.vpixl.com/6qj4J/dts.podtrac.com/redirect.mp3/chtbl.com/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/bbbcc290-ed3b-44a2-8e5d-5513e38cfe20/episodes/10f46595-7477-4aaa-9d50-806dc8b24ac8/audio/128/default.mp3,2023-12-20 18:00:00-04:00
7,818,818: Stand Clear of the Closing Doors,https://pfx.vpixl.com/6qj4J/dts.podtrac.com/redirect.mp3/chtbl.com/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/bbbcc290-ed3b-44a2-8e5d-5513e38cfe20/episodes/074dae44-9927-43fb-a8df-0694a57ab804/audio/128/default.mp3,2023-12-15 18:00:00-04:00
8,817,817: The Cavalry Is Not Coming,https://www.thisamericanlife.org/sites/default/files/audio/817/M8_Y7B9lX0IF1PvLbF2ZMmzCexd-EDrjy8c61V1tmVE/817.mp3,2023-12-01 18:00:00-04:00
9,816,816: Poultry Slam,https://www.thisamericanlife.org/sites/default/files/audio/816/1FxPlvN4fzd537cBRw2RChcYIr6mTAUJca4ZCeJvuWo/816.mp3,2023-11-24 18:00:00-04:00


In [51]:
df_american_life.sort_values(by='episode_number', ascending=False, inplace=True)


In [52]:
print(df_american_life['title'].head(10))

0                                                825: Yousef
1                                        824: Family Meeting
2                                     823: The Question Trap
3                                   822: The Words to Say It
4                                      821: Embrace the Suck
5    820: It Wouldn’t Be Make-Believe If You’d Believe In Me
6                 819: Special Bonus Podcast — Yousef’s Week
7                      818: Stand Clear of the Closing Doors
8                             817: The Cavalry Is Not Coming
9                                          816: Poultry Slam
Name: title, dtype: object


In [53]:
df_tal = df_american_life

### **Filtering to retain only rows which have transcripts**

In [54]:
import os

# Define the path to the folder containing the transcriptions
transcriptions_folder = 'THIS_AMERICAN_LIFE'

# Create a new column for the transcriptions, initialize with None or any suitable placeholder
df_tal['transcription'] = None

df_tal['transcription_file'] = 'No transcript available'

def find_transcript_file_tal(episode_number):
    """
    Look for a transcript file matching the episode number.
    Returns the filename if found, otherwise returns a placeholder string.
    """
    for filename in os.listdir(transcriptions_folder):
        if filename.startswith(f"{episode_number}_") and filename.endswith("-transcript.txt"):
            return os.path.join(transcriptions_folder, filename)
    return 'No transcript available'

for index, row in df_tal.iterrows():
    # Get the episode number and find matching transcript file
    transcript_file = find_transcript_file_tal(row['episode_number'])
    
    # Add the file path or name to the DataFrame
    df_tal.at[index, 'transcription_file'] = transcript_file


In [56]:
df_tal.drop('transcription', inplace=True, axis=1)

In [57]:
df_tal

Unnamed: 0,episode_number,title,download_url,publication_date,transcription_file
0,825,825: Yousef,https://pfx.vpixl.com/6qj4J/dts.podtrac.com/redirect.mp3/chtbl.com/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/bbbcc290-ed3b-44a2-8e5d-5513e38cfe20/episodes/4ffd11f6-dc7b-4682-8080-378f24f41c59/audio/128/default.mp3,2024-03-01 18:00:00-04:00,THIS_AMERICAN_LIFE/825_ Yousef-transcript.txt
1,824,824: Family Meeting,https://pfx.vpixl.com/6qj4J/dts.podtrac.com/redirect.mp3/chtbl.com/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/bbbcc290-ed3b-44a2-8e5d-5513e38cfe20/episodes/65312b31-dc48-4dde-adb2-269d8f7da365/audio/128/default.mp3,2024-02-23 18:00:00-04:00,THIS_AMERICAN_LIFE/824_ Family Meeting-transcript.txt
2,823,823: The Question Trap,https://pfx.vpixl.com/6qj4J/dts.podtrac.com/redirect.mp3/chtbl.com/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/bbbcc290-ed3b-44a2-8e5d-5513e38cfe20/episodes/5e22d88d-a289-4135-801a-57e6941a1ea3/audio/128/default.mp3,2024-02-02 18:00:00-04:00,THIS_AMERICAN_LIFE/823_ The Question Trap-transcript.txt
3,822,822: The Words to Say It,https://pfx.vpixl.com/6qj4J/dts.podtrac.com/redirect.mp3/chtbl.com/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/bbbcc290-ed3b-44a2-8e5d-5513e38cfe20/episodes/71a53135-f935-412a-b5bf-a07289de0f30/audio/128/default.mp3,2024-01-26 18:00:00-04:00,No transcript available
4,821,821: Embrace the Suck,https://pfx.vpixl.com/6qj4J/dts.podtrac.com/redirect.mp3/chtbl.com/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/bbbcc290-ed3b-44a2-8e5d-5513e38cfe20/episodes/938413db-5e79-44e4-9bd0-cba28b1eee22/audio/128/default.mp3,2024-01-19 18:00:00-04:00,THIS_AMERICAN_LIFE/821_ Embrace the Suck-transcript.txt
...,...,...,...,...,...
819,5,5: Anger and Forgiveness,https://www.thisamericanlife.org/sites/default/files/audio/5/G6vW91kUb18b0Vm6ep5eO8ekl8JT3gPF4SYIDbYAWKo/5.mp3,1995-12-15 18:00:00-04:00,No transcript available
820,4,4: Vacations,https://www.thisamericanlife.org/sites/default/files/audio/4/S4MTKexcwCBxC_OBqs33YVQm2pf85vKt5vmxhsW8hWg/4.mp3,1995-12-08 18:00:00-04:00,No transcript available
821,3,3: Poultry Slam 1995,https://www.thisamericanlife.org/sites/default/files/audio/3/3fZqXh-aDaWx6N3Dg2DkXhAhu5SyUIpxcB16R_Jap90/3.mp3,1995-12-01 18:00:00-04:00,No transcript available
822,2,2: Small Scale Sin,https://www.thisamericanlife.org/sites/default/files/audio/2/s6iVCvP0p-f1qwIeRvubpMzQ8TR05BnA3AsSm1uXuxs/2.mp3,1995-11-24 18:00:00-04:00,No transcript available


In [None]:
df_tal_final = df_tal[df_tal['transcription_file'] != 'No transcript available']

In [None]:
df_tal_final.reset_index(drop=True, inplace=True)

In [58]:
df_tal.to_csv('TAL_podcast_dataset.csv', index=True)

In [None]:
df_tal_final.to_csv('TAL_podcast_and_transcripts.csv', index=True)

### **TAL TRANSCRIPTS LOAD**

In [None]:

tal_transcript_texts = load_transcripts(df_tal_final)

### **Sanity check**

In [None]:
len(tal_transcript_texts) == len(df_tal_final)

### **All Good**

In [None]:
df_tal.info()

# **Andrew Huberman**

In [None]:
def rss_to_dataframe_huberman(rss_url):
    response = requests.get(rss_url)
    root = ElementTree.fromstring(response.content)
    channel = root.find('channel')
    items = channel.findall('item')

    data = []
    for item in items:
        title = item.find('title').text
        # episode number would be assigned auto incremented as there are no specific numbers in this podcast
        download_url = item.find('enclosure').get('url')
        pub_date = item.find('pubDate').text
        # description = item.find('description').text  

        data.append({
            'title': title,
            'download_url': download_url,
            'publication_date': pub_date,
            # 'description': description  # Including description in the DataFrame
        })

    df = pd.DataFrame(data)
    return df

In [None]:
rss_url_huberman = 'https://feeds.megaphone.fm/hubermanlab'
df_huberman = rss_to_dataframe_huberman(rss_url_huberman)
df_huberman.head(5)

In [71]:

def auto_indexing(df):
    num_rows = len(df)

    decrementing_sequence = range(num_rows-1, -1, -1)

    df['episode_number'] = list(decrementing_sequence)

    return df



In [None]:
df_huberman = auto_indexing(df_huberman)

df_huberman = df_huberman[['episode_number', 'title', 'download_url', 'publication_date']]

In [None]:
df_huberman['episode_number'] = df_huberman['episode_number'].astype(int)

df_huberman['publication_date'] = pd.to_datetime(df_huberman['publication_date'], format='%a, %d %b %Y %H:%M:%S %z')


In [None]:
df_huberman = df_huberman.sort_values(by='publication_date', ascending=False)

In [None]:
import os
import difflib

filenames_huberman = [filename for filename in os.listdir(transcriptions_folder) if filename.endswith("-transcript.txt")]

# Define the path to the folder containing the transcriptions
transcriptions_folder = 'HUBERMAN'

# Create a new column for the transcriptions, initialize with None or any suitable placeholder

df_huberman['transcription_file'] = 'No transcript available'

def find_closest_transcript(title, filenames):
    # Normalize the title to improve matching
    title_normalized = title.lower()
    filenames_normalized = [filename.lower().replace("-transcript.txt", "") for filename in filenames]
    
    # Find the closest match
    matches = difflib.get_close_matches(title_normalized, filenames_normalized, n=1, cutoff=0.4)
    
    if matches:
        # If a match is found, return the original filename (with correct casing and suffix)
        match = matches[0]
        for filename in filenames:
            if filename.lower().startswith(match):
                return os.path.join(transcriptions_folder, filename)
    return 'No transcript available'


for index, row in df_huberman.iterrows():
    transcript_file = find_closest_transcript(row['title'], filenames_huberman)
    df_huberman.at[index, 'transcription_file'] = transcript_file


In [None]:
df_huberman.to_csv('huberman_podcast_and_transcripts.csv', index=True)

### **HUBERMAN TRANSCRIPTS LOAD**

In [None]:
huberman_transcript_texts = load_transcripts(df_huberman)


In [None]:
len(huberman_transcript_texts) == len(df_huberman)

# **VOX**

In [67]:
def rss_to_dataframe_vox(rss_url):
    response = requests.get(rss_url)
    root = ElementTree.fromstring(response.content)
    channel = root.find('channel')
    items = channel.findall('item')

    data = []
    for item in items:
        title = item.find('title').text
        # episode number would be assigned auto incremented as there are no specific numbers in this podcast
        download_url = item.find('enclosure').get('url')
        pub_date = item.find('pubDate').text
        # description = item.find('description').text  

        data.append({
            'title': title,
            'download_url': download_url,
            'publication_date': pub_date,
            # 'description': description  # Including description in the DataFrame
        })

    df = pd.DataFrame(data)
    return df

In [68]:
rss_url_vox= 'https://feeds.megaphone.fm/VMP5705694065'
df_vox = rss_to_dataframe_vox(rss_url_vox)
df_vox.head(5)

Unnamed: 0,title,download_url,publication_date
0,Bringing back the SAT,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP1530889528.mp3?updated=1709921609,"Fri, 08 Mar 2024 19:00:00 -0000"
1,Can Reddit survive going public?,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP1976859152.mp3?updated=1709836444,"Thu, 07 Mar 2024 19:00:00 -0000"
2,Why measles is back,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP8455187636.mp3?updated=1709753496,"Wed, 06 Mar 2024 19:50:00 -0000"
3,How Israel is upending Democratic races,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP7327599366.mp3?updated=1709661154,"Tue, 05 Mar 2024 19:00:00 -0000"
4,Why groceries are still so expensive,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP6282330747.mp3?updated=1709586463,"Mon, 04 Mar 2024 19:00:00 -0000"


In [69]:
df_vox

Unnamed: 0,title,download_url,publication_date
0,Bringing back the SAT,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP1530889528.mp3?updated=1709921609,"Fri, 08 Mar 2024 19:00:00 -0000"
1,Can Reddit survive going public?,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP1976859152.mp3?updated=1709836444,"Thu, 07 Mar 2024 19:00:00 -0000"
2,Why measles is back,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP8455187636.mp3?updated=1709753496,"Wed, 06 Mar 2024 19:50:00 -0000"
3,How Israel is upending Democratic races,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP7327599366.mp3?updated=1709661154,"Tue, 05 Mar 2024 19:00:00 -0000"
4,Why groceries are still so expensive,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP6282330747.mp3?updated=1709586463,"Mon, 04 Mar 2024 19:00:00 -0000"
...,...,...,...
1516,This Time Could Be Different,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP6903159456.mp3,"Fri, 23 Feb 2018 22:16:09 -0000"
1517,Countdown to Day Zero,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP9919331674.mp3,"Thu, 22 Feb 2018 20:57:03 -0000"
1518,Breaking the Ice with North Korea,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP3675062005.mp3,"Wed, 21 Feb 2018 20:53:17 -0000"
1519,Black Panther Is the Most Important Movie of 2018,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP6640596368.mp3,"Tue, 20 Feb 2018 21:31:56 -0000"


In [73]:
import os
import re

transcriptions_folder = 'VOX'  # Adjust to your actual folder path
df_vox['transcription_file'] = 'No transcript available'

def normalize_string(s):
    """Normalize a string by removing special characters and converting to lowercase."""
    return re.sub(r'[^a-zA-Z0-9\s]', '', s).lower()

def find_normalized_transcript_file(title, folder_path):
    """
    Attempt to find a transcript file with a title matching after normalization.
    Returns the path to the file if found, 'No transcript available' otherwise.
    """
    normalized_title = normalize_string(title)
    for filename in os.listdir(folder_path):
        # Removing the '-transcript.txt' part and normalize
        filename_base = normalize_string(filename.replace('-transcript.txt', ''))
        if normalized_title == filename_base:
            return os.path.join(folder_path, filename)
    return 'No transcript available'

for index, row in df_vox.iterrows():
    transcript_file = find_normalized_transcript_file(row['title'], transcriptions_folder)
    df_vox.at[index, 'transcription_file'] = transcript_file


In [77]:
df_vox_final = df_vox[df_vox['transcription_file']!= 'No transcript available']

df_vox_final.reset_index(drop=True, inplace=True)

In [78]:
df_vox_final

Unnamed: 0,episode_number,title,download_url,publication_date,transcription_file
0,1520,Bringing back the SAT,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP1530889528.mp3?updated=1709921609,2024-03-08 19:00:00+00:00,VOX/Bringing back the SAT-transcript.txt
1,1519,Can Reddit survive going public?,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP1976859152.mp3?updated=1709836444,2024-03-07 19:00:00+00:00,VOX/Can Reddit survive going public?-transcript.txt
2,1518,Why measles is back,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP8455187636.mp3?updated=1709753496,2024-03-06 19:50:00+00:00,VOX/Why measles is back-transcript.txt
3,1517,How Israel is upending Democratic races,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP7327599366.mp3?updated=1709661154,2024-03-05 19:00:00+00:00,VOX/How Israel is upending Democratic races-transcript.txt
4,1516,Why groceries are still so expensive,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP6282330747.mp3?updated=1709586463,2024-03-04 19:00:00+00:00,VOX/Why groceries are still so expensive-transcript.txt
...,...,...,...,...,...
846,395,John Boltin',https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP2608511590.mp3,2019-09-10 21:27:24+00:00,VOX/John Boltin'-transcript.txt
847,394,Am I gonna die from vaping?,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP2063822495.mp3,2019-09-09 21:27:50+00:00,VOX/Am I gonna die from vaping?-transcript.txt
848,393,Gerry with the bad maps,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP3456989366.mp3,2019-09-06 20:20:26+00:00,VOX/Gerry with the bad maps-transcript.txt
849,392,CNN's climate marathon in 20 minutes,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP7839076497.mp3,2019-09-05 21:07:57+00:00,VOX/CNN's climate marathon in 20 minutes-transcript.txt


In [79]:
df_vox_final = auto_indexing(df_vox_final)
# df_vox = auto_indexing(df_vox)

# df_vox = df_vox[['episode_number', 'title', 'download_url', 'publication_date', 'transcription_file']]
df_vox_final = df_vox_final[['episode_number', 'title', 'download_url', 'publication_date', 'transcription_file']]

df_vox_final['publication_date'] = pd.to_datetime(df_vox_final['publication_date'], format='%a, %d %b %Y %H:%M:%S %z')
# df_vox['publication_date'] = pd.to_datetime(df_vox['publication_date'], format='%a, %d %b %Y %H:%M:%S %z')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['episode_number'] = list(decrementing_sequence)


In [80]:
df_vox_final

Unnamed: 0,episode_number,title,download_url,publication_date,transcription_file
0,850,Bringing back the SAT,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP1530889528.mp3?updated=1709921609,2024-03-08 19:00:00+00:00,VOX/Bringing back the SAT-transcript.txt
1,849,Can Reddit survive going public?,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP1976859152.mp3?updated=1709836444,2024-03-07 19:00:00+00:00,VOX/Can Reddit survive going public?-transcript.txt
2,848,Why measles is back,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP8455187636.mp3?updated=1709753496,2024-03-06 19:50:00+00:00,VOX/Why measles is back-transcript.txt
3,847,How Israel is upending Democratic races,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP7327599366.mp3?updated=1709661154,2024-03-05 19:00:00+00:00,VOX/How Israel is upending Democratic races-transcript.txt
4,846,Why groceries are still so expensive,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP6282330747.mp3?updated=1709586463,2024-03-04 19:00:00+00:00,VOX/Why groceries are still so expensive-transcript.txt
...,...,...,...,...,...
846,4,John Boltin',https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP2608511590.mp3,2019-09-10 21:27:24+00:00,VOX/John Boltin'-transcript.txt
847,3,Am I gonna die from vaping?,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP2063822495.mp3,2019-09-09 21:27:50+00:00,VOX/Am I gonna die from vaping?-transcript.txt
848,2,Gerry with the bad maps,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP3456989366.mp3,2019-09-06 20:20:26+00:00,VOX/Gerry with the bad maps-transcript.txt
849,1,CNN's climate marathon in 20 minutes,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP7839076497.mp3,2019-09-05 21:07:57+00:00,VOX/CNN's climate marathon in 20 minutes-transcript.txt


In [81]:
df_vox.to_csv('vox_podcast_dataset.csv', index=True)

df_vox_final.to_csv('vox_podcast_and_transcripts.csv', index=True)

### **VOX TRANSCRIPTS LOAD**

In [None]:
vox_transcript_texts = load_transcripts(df_vox_final)

In [None]:
len(vox_transcript_texts) == len(df_vox_final)

# **FINAL DF WITH TRANSCRIPTS --> Will be updated as needed**

In [85]:
JOE = pd.read_csv('joe_rogan_podcast_and_transcripts.csv')
BEN = pd.read_csv('ben_shapiro_podcast_and_transcripts.csv')
TAL = pd.read_csv('TAL_podcast_and_transcripts.csv')
HUBERMAN = pd.read_csv('huberman_podcast_and_transcripts.csv')
VOX = pd.read_csv('vox_podcast_and_transcripts.csv')

final_df_with_transcripts = pd.concat([JOE, BEN, TAL, HUBERMAN, VOX], ignore_index=True)

In [86]:
final_df_with_transcripts.drop('Unnamed: 0', inplace=True, axis=1)

In [87]:
final_df_with_transcripts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1395 entries, 0 to 1394
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   episode_number      1395 non-null   int64 
 1   title               1395 non-null   object
 2   download_url        1395 non-null   object
 3   publication_date    1395 non-null   object
 4   transcription_file  1395 non-null   object
dtypes: int64(1), object(4)
memory usage: 54.6+ KB


In [89]:
final_df_with_transcripts.to_csv('final_df_with_transcripts.csv')

# **FINAL DF WITHOUT TRANSCRIPTS --> Will be updated as needed**

In [93]:
JOE_FULL = pd.read_csv('joe_rogan_podcast_dataset.csv')
BEN_FULL = pd.read_csv('ben_shapiro_podcast_dataset.csv')
TAL_FULL = pd.read_csv('TAL_podcast_dataset.csv')
HUBERMAN_FULL = pd.read_csv('huberman_podcast_and_transcripts.csv') # is same with and without transcripts, i.e. all episodes in dataset have transcripts
VOX_FULL = pd.read_csv('vox_podcast_dataset.csv')

final_df_raw = pd.concat([JOE_FULL, BEN_FULL, TAL_FULL, HUBERMAN_FULL, VOX_FULL], ignore_index=True)

In [94]:
final_df_raw.drop('Unnamed: 0', inplace=True, axis=1)

final_df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6504 entries, 0 to 6503
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   episode_number      6504 non-null   int64 
 1   title               6504 non-null   object
 2   download_url        6504 non-null   object
 3   publication_date    6504 non-null   object
 4   transcription_file  6504 non-null   object
dtypes: int64(1), object(4)
memory usage: 254.2+ KB


In [95]:
final_df_raw

Unnamed: 0,episode_number,title,download_url,publication_date,transcription_file
0,2116,#2116 - Kevin James,https://traffic.megaphone.fm/GLT9792092908.mp3?updated=1709916107,2024-03-08 18:00:00+00:00,No transcript available
1,2115,#2115 - Riley Gaines,https://traffic.megaphone.fm/GLT2576028300.mp3?updated=1709826288,2024-03-07 18:00:00+00:00,No transcript available
2,2114,#2114 - Zack Snyder,https://traffic.megaphone.fm/GLT5800487718.mp3?updated=1709745613,2024-03-06 20:53:00+00:00,JOE/#2114 - Zack Snyder-transcript.txt
3,2113,#2113 - Christopher Rufo,https://traffic.megaphone.fm/GLT5991726151.mp3?updated=1709675927,2024-03-05 18:00:00+00:00,JOE/#2113 - Christopher Rufo-transcript.txt
4,2112,#2112 - Dan Soder,https://traffic.megaphone.fm/GLT4851895679.mp3?updated=1709313869,2024-03-01 18:00:00+00:00,No transcript available
...,...,...,...,...,...
6499,4,This Time Could Be Different,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP6903159456.mp3,2018-02-23 22:16:09+00:00,No transcript available
6500,3,Countdown to Day Zero,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP9919331674.mp3,2018-02-22 20:57:03+00:00,No transcript available
6501,2,Breaking the Ice with North Korea,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP3675062005.mp3,2018-02-21 20:53:17+00:00,No transcript available
6502,1,Black Panther Is the Most Important Movie of 2018,https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/524GE/traffic.megaphone.fm/VMP6640596368.mp3,2018-02-20 21:31:56+00:00,No transcript available


In [97]:
final_df_raw.to_csv('final_df_raw.csv')