In [8]:
import pandas as pd
pd.options.display.max_rows = 10
import numpy as np 
# pd.set_option('display.max_colwidth', None)
import os
import torch
import re

### **Extracting Test Summaries for Ben Shapiro's Podcast, as they are the only one's available**

In [6]:
import requests
from bs4 import BeautifulSoup
import string
from xml.etree import ElementTree

def rss_to_dataframe_ben(rss_url):
    response = requests.get(rss_url)
    root = ElementTree.fromstring(response.content)
    channel = root.find('channel')
    items = channel.findall('item')

    data = []
    for item in items:
        title = item.find('title').text
        # Use regular expression to extract the episode number more reliably
        episode_number_match = re.search(r'Ep\.?\s*(\d+)', title, re.IGNORECASE)
        episode_number = episode_number_match.group(1) if episode_number_match else 'Unknown'

        # Extract guest name more reliably
        guest_name = title.split('-')[1].strip() if '-' in title else 'Unknown'
        download_url = item.find('enclosure').get('url')
        pub_date = item.find('pubDate').text

        data.append({
            'episode_number': episode_number,
            'guest_name': guest_name,
            'title': title,
            'download_url': download_url,
            'publication_date': pub_date,
        })

    df = pd.DataFrame(data)
    return df

In [9]:
rss_url_ben = "https://feeds.simplecast.com/C0fPpQ64"
df_ben = rss_to_dataframe_ben(rss_url_ben)
df_ben

Unnamed: 0,episode_number,guest_name,title,download_url,publication_date
0,1946,OJ Simpson Killed By Cancer,Ep. 1946 - OJ Simpson Killed By Cancer,https://claritaspod.com/measure/arttrk.com/p/2...,"Fri, 12 Apr 2024 15:24:47 +0000"
1,1945,Inflation Comes In HOT,Ep. 1945 - Inflation Comes In HOT,https://claritaspod.com/measure/arttrk.com/p/2...,"Thu, 11 Apr 2024 14:44:24 +0000"
2,1944,The Abortion Issue Explodes,Ep. 1944 - The Abortion Issue Explodes,https://claritaspod.com/measure/arttrk.com/p/2...,"Wed, 10 Apr 2024 15:06:04 +0000"
3,1943,Blotting Out The Sun,Ep. 1943 - Blotting Out The Sun,https://claritaspod.com/measure/arttrk.com/p/2...,"Tue, 9 Apr 2024 14:53:39 +0000"
4,1942,Did Joe Biden Just Save Hamas?,Ep. 1942 - Did Joe Biden Just Save Hamas?,https://claritaspod.com/measure/arttrk.com/p/2...,"Mon, 8 Apr 2024 15:22:40 +0000"
...,...,...,...,...,...
2317,5,Rubios PC Problem,Ep. 5 - Rubios PC Problem,https://claritaspod.com/measure/arttrk.com/p/2...,"Wed, 7 Oct 2015 15:00:00 +0000"
2318,4,Russia in Syria,Ep. 4 - Russia in Syria,https://claritaspod.com/measure/arttrk.com/p/2...,"Thu, 1 Oct 2015 15:00:00 +0000"
2319,3,Cruz in the Crosshairs,Ep. 3 - Cruz in the Crosshairs,https://claritaspod.com/measure/arttrk.com/p/2...,"Wed, 30 Sep 2015 15:00:00 +0000"
2320,2,Shout Your Abortion,Ep. 2 - Shout Your Abortion,https://claritaspod.com/measure/arttrk.com/p/2...,"Tue, 22 Sep 2015 15:00:00 +0000"


In [16]:
df_ben['episode_number'] = pd.to_numeric(df_ben['episode_number'], errors='coerce')
df_ben.dropna(subset=['episode_number'], inplace=True)
df_ben = df_ben.loc[df_ben['guest_name'] != 'Unknown']
# df_ben['episode_number'].astype(int)


<class 'pandas.core.frame.DataFrame'>
Index: 1945 entries, 0 to 2321
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   episode_number    1945 non-null   float64
 1   guest_name        1945 non-null   object 
 2   title             1945 non-null   object 
 3   download_url      1945 non-null   object 
 4   publication_date  1945 non-null   object 
dtypes: float64(1), object(4)
memory usage: 91.2+ KB


In [17]:
df_ben['episode_number'] = df_ben['episode_number'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ben['episode_number'] = df_ben['episode_number'].astype(int)


In [18]:
df_ben.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1945 entries, 0 to 2321
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   episode_number    1945 non-null   int64 
 1   guest_name        1945 non-null   object
 2   title             1945 non-null   object
 3   download_url      1945 non-null   object
 4   publication_date  1945 non-null   object
dtypes: int64(1), object(4)
memory usage: 91.2+ KB


In [19]:
# Filter the DataFrame to keep episode numbers from 1919 to 1772
filtered_df = df_ben[(df_ben['episode_number'] <= 1919) & (df_ben['episode_number'] >= 1772)]

In [20]:
filtered_df

Unnamed: 0,episode_number,guest_name,title,download_url,publication_date
34,1919,Nikki Haley Is OUT,Ep. 1919 - Nikki Haley Is OUT,https://claritaspod.com/measure/arttrk.com/p/2...,"Wed, 6 Mar 2024 16:20:13 +0000"
35,1918,Peeing in Your Own Eyes To Stop Donald Trump,Ep. 1918 - Peeing in Your Own Eyes To Stop Don...,https://claritaspod.com/measure/arttrk.com/p/2...,"Tue, 5 Mar 2024 16:03:39 +0000"
36,1917,UNANIMOUS Supreme Court Puts Trump Back On The...,Ep. 1917 - UNANIMOUS Supreme Court Puts Trump ...,https://claritaspod.com/measure/arttrk.com/p/2...,"Mon, 4 Mar 2024 16:14:37 +0000"
38,1916,Duel At The Border,Ep. 1916 - Duel At The Border,https://claritaspod.com/measure/arttrk.com/p/2...,"Fri, 1 Mar 2024 16:18:45 +0000"
39,1915,THE DIRTY SECRET: Trump Is The 2024 Moderate,Ep. 1915 - THE DIRTY SECRET: Trump Is The 2024...,https://claritaspod.com/measure/arttrk.com/p/2...,"Thu, 29 Feb 2024 16:15:29 +0000"
...,...,...,...,...,...
228,1776,MORE Trump Charges?!,Ep. 1776 - MORE Trump Charges?!,https://claritaspod.com/measure/arttrk.com/p/2...,"Fri, 28 Jul 2023 15:45:00 +0000"
229,1775,The Hunter Biden Sweetheart Deal Falls Apart,Ep. 1775 - The Hunter Biden Sweetheart Deal Fa...,https://claritaspod.com/measure/arttrk.com/p/2...,"Thu, 27 Jul 2023 15:45:00 +0000"
230,1774,Hunter's Sweetheart Plea Deal and The Continui...,Ep. 1774 - Hunter's Sweetheart Plea Deal and T...,https://claritaspod.com/measure/arttrk.com/p/2...,"Wed, 26 Jul 2023 15:45:00 +0000"
231,1773,Biden Will Be Impeached,Ep. 1773 - Biden Will Be Impeached,https://claritaspod.com/measure/arttrk.com/p/2...,"Tue, 25 Jul 2023 15:45:00 +0000"


In [21]:

def rss_to_summaries(rss_url, start_ep=1919, end_ep=1772):
    response = requests.get(rss_url)
    root = ElementTree.fromstring(response.content)
    channel = root.find('channel')
    items = channel.findall('item')

    summaries = []
    for item in items:
        title = item.find('title').text
        # Use regular expression to extract the episode number more reliably
        episode_number_match = re.search(r'Ep\.?\s*(\d+)', title, re.IGNORECASE)
        if episode_number_match:
            episode_number = int(episode_number_match.group(1))
            # Filter episodes within the specified range
            if end_ep <= episode_number <= start_ep:
                summary_element = item.find('.//{http://www.itunes.com/dtds/podcast-1.0.dtd}summary')
                summary_text = summary_element.text if summary_element is not None else "No summary available"
                summaries.append(summary_text)

    return summaries




In [None]:
# Example usage
rss_url = "https://feeds.simplecast.com/C0fPpQ64"
episode_summaries = rss_to_summaries(rss_url)

episode_summaries

In [23]:
len(episode_summaries)

148

In [36]:
episode_summaries[:2]

['Former UN Ambassador Nikki Haley drops out of the race; Arizona Senator Krysten Sinema decides not to run after all; and Joe Biden quotes the Cookie Monster as he prepares for his most consequential State of the Union address.\n\nClick here to join the member exclusive portion of my show: https://utm.io/ueSEj\n\nEp.1919\n\n- - -\xa0\n\nDailyWire+:\n\nWatch Bill Whittle’s An Empire of Terror only on DailyWire+: https://bit.ly/4aink3N\n\nUnlock your Bentkey 14-day free trial here: https://bit.ly/3GSz8go\n\nBecome a DailyWire+ member to gain access to movies, shows, documentaries, kids entertainment and more: https://utm.io/ueMfc\xa0\n\nGet your Ben Shapiro merch here: https://bit.ly/3TAu2cw\n\n\xa0- - -\xa0\n\nToday’s Sponsors:\n\nPureTalk - Get a FREE Samsung 5G smartphone. Enter promo code: Shapiro at\xa0 https://www.puretalkusa.com/landing/shapiro\n\nCurrent - Simplify your banking with Current today! http://www.current.com/shapiro\n\nFood For The Poor - Donate Today! Text ‘Plate’ t

In [29]:
df = pd.read_csv('Final_df_with_summaries.csv')
gen_summaries = df['summaries'].to_list()

In [30]:
gen_summaries[:2]

['well it is the end of the road for nickey haley the former you an ambassador under a donald trump former governor of south carolina she is droping out of the race to day because donald trum dominated supertuesday. She is not going to announce an indorsement on wednesday however she is going to encourage donal trump to earn the support of republicans and independent voters who have backed her. she is hoping that he picks her as sort of a unity ticket bot given his dominant performente in the primaryes very unlikely.',
 'well yesterday the supreme court disappointed every one on the far left by ruling nin nothing that actually states cant just random we take presidential candidates off the ballat. If you are going to defind insurrection under the fourteenth amendment that have to be done by congress not by some mirando at the state level. The nine o decision has made people very very very angry on the left they believe that they understood the law and by understood the waw they mean th

In [37]:
def clean_summaries_first_sentence(summaries):
    cleaned_summaries = []
    for summary in summaries:
        # Regex to find the first sentence ending with a period followed by a space, newline or end of string
        first_sentence = re.match(r'([^.]*\.)', summary)
        if first_sentence:
            cleaned_summaries.append(first_sentence.group(1).strip())
        else:
            # If no period is found, use the whole summary
            cleaned_summaries.append(summary.strip())
    return cleaned_summaries


cleaned_ep_sums = clean_summaries_first_sentence(episode_summaries)

In [39]:
cleaned_ep_sums[:2]

['Former UN Ambassador Nikki Haley drops out of the race; Arizona Senator Krysten Sinema decides not to run after all; and Joe Biden quotes the Cookie Monster as he prepares for his most consequential State of the Union address.',
 'The Supreme Court rules in favor of keeping Trump on the ballot, and the Left melts down; the Biden administration unleashes its foreign policy envoy to teach\xa0GenderQueer\xa0to the world; and New York and California hit on a new crime strategy.']

In [40]:
len(gen_summaries[0])

520

In [41]:
len(cleaned_ep_sums[0])

226

In [42]:
def standardize_summaries(summaries):
    standardized_summaries = []
    for summary in summaries:
        # Lowercase the summary
        summary = summary.lower()
        
        # Simplify complex structures
        summary = re.sub(r'; and', '.', summary)  # Split compound sentences into simpler sentences
        
        # Normalize whitespace
        summary = re.sub(r'\s+', ' ', summary).strip()
        
        standardized_summaries.append(summary)
    return standardized_summaries


In [43]:
stnd_cleaned_ep_sums = standardize_summaries(cleaned_ep_sums)

In [44]:
stnd_cleaned_ep_sums[:2]

['former un ambassador nikki haley drops out of the race; arizona senator krysten sinema decides not to run after all. joe biden quotes the cookie monster as he prepares for his most consequential state of the union address.',
 'the supreme court rules in favor of keeping trump on the ballot, and the left melts down; the biden administration unleashes its foreign policy envoy to teach genderqueer to the world. new york and california hit on a new crime strategy.']

In [58]:
import re

def advanced_standardize_summaries(summaries):
    standardized_summaries = []
    for summary in summaries:
        # Lowercase the summary
        summary = summary.lower()
        
        # Remove URLs
        summary = re.sub(r'http[s]?://\S+', '', summary)  # Regex to remove URLs
        
        # Truncate text starting from '? click'
        summary = re.sub(r'\? click.*', '', summary, flags=re.IGNORECASE)
        
        # Remove semicolons
        summary = summary.replace(';', '')
        
        # Simplify complex structures
        summary = re.sub(r'; and', '.', summary)  # Replace remaining semicolons if any
        
        # Normalize whitespace
        summary = re.sub(r'\s+', ' ', summary).strip()
        
        standardized_summaries.append(summary)
    return standardized_summaries




In [59]:
adv_stnd_cleaned_eps = advanced_standardize_summaries(cleaned_ep_sums)

In [None]:
adv_stnd_cleaned_eps

In [65]:
if len(df) >= 148:
    # Create the new column and initialize with None or an empty string
    df['original_summaries_cleaned_standardized'] = None
    
    # Assign the cleaned summaries to the first 148 rows
    df.loc[0:147, 'original_summaries_cleaned_standardized'] = adv_stnd_cleaned_eps
else:
    print("DataFrame is too short to assign 148 summaries.")

### **Now this CSV File has test summaries**

In [68]:
df.drop('Unnamed: 0', axis=1, inplace=True)

df.to_csv('Final_df_with_summaries_and_test_summaries.csv', index=False)

### **Evaluation Metrics**

In [70]:
from rouge_score import rouge_scorer

def calculate_rouge_scores(system_summaries, reference_summaries):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = []
    for system, reference in zip(system_summaries, reference_summaries):
        scores = scorer.score(reference, system)
        rouge_scores.append(scores)
    return rouge_scores



In [71]:
# Extract the summaries from the DataFrame
system_summaries = df['summaries'].dropna().tolist()
reference_summaries = df['original_summaries_cleaned_standardized'].dropna().tolist()

# Calculate ROUGE scores
rouge_scores = calculate_rouge_scores(system_summaries, reference_summaries)

In [88]:
# Initialize max dictionaries
max_scores = {
    'rouge1': {'fmeasure': 0, 'precision': 0, 'recall': 0},
    'rouge2': {'fmeasure': 0, 'precision': 0, 'recall': 0},
    'rougeL': {'fmeasure': 0, 'precision': 0, 'recall': 0}
}

# Process each entry in rouge_scores
for score_dict in rouge_scores:
    for key, score in score_dict.items():
        if key in max_scores:  # Ensure the key is valid
            # Update max scores
            max_scores[key]['fmeasure'] = max(max_scores[key]['fmeasure'], score.fmeasure)
            max_scores[key]['precision'] = max(max_scores[key]['precision'], score.precision)
            max_scores[key]['recall'] = max(max_scores[key]['recall'], score.recall)

# Print maximum scores
print("Best Case ROUGE Scores:")
for key, metrics in max_scores.items():
    print(f"{key}: Precision={metrics['precision']:.4}, Recall={metrics['recall']:.4}")



Best Case ROUGE Scores:
rouge1: Precision=0.4516, Recall=0.8889
rouge2: Precision=0.3696, Recall=0.8
rougeL: Precision=0.4516, Recall=0.8889
