In [18]:
''' import libraries
Autotokenizer: to count the number of tokens in a text
pipeline: to use the sentiment analysis model from huggingface
pandas: manipulate dataframes'''

from transformers import AutoTokenizer, pipeline
import pandas as pd

In [19]:
''' load the data'''

path_articles = r'FineTunersAI\Data\cleaned_output.csv'
news_articles = pd.read_csv(path_articles)


In [126]:
import pandas as pd
from bs4 import BeautifulSoup
import re

# Function to check if a paragraph is likely an ad or irrelevant
def is_irrelevant_paragraph(paragraph):
    text = paragraph.get_text().strip()
    if not text:
        return True
    if "menu-section-heading" in paragraph.get("class", []):
        return True
    if text.startswith("©") or "byline-bio" in paragraph.get("class", []):
        return True
    if paragraph.find('a') and 'href' in str(paragraph):
        return True
    return False

# Function to process HTML content and extract relevant full text
def process_html(content):
    soup = BeautifulSoup(content, 'html.parser')
    paragraphs = soup.find_all('p')
    
    # Filter out irrelevant paragraphs and concatenate
    relevant_paragraphs = [str(p) for p in paragraphs if not is_irrelevant_paragraph(p)]
    full_text = ''.join(relevant_paragraphs)
    
    return full_text

# Creating a new dataframe for cleaned data
cleaned_data = []

for index, row in news_articles.iterrows():
    full_text = process_html(row["content"])
    cleaned_data.append({
        "link": row["link"],
        "title": row["title"],
        "content": full_text
    })

cleaned_articles = pd.DataFrame(cleaned_data)

#remove all emojis and <p class=""> tags from the content

def remove_emojis(text):
    return text.encode('ascii', 'ignore').decode('ascii')

def remove_p_tags(text):
    return re.sub(r'<p class=".*?">', '', text)

cleaned_articles['content'] = cleaned_articles['content'].apply(remove_emojis)

# Output the cleaned dataframe
display(cleaned_articles.head(3))  # Prints first few rows of the dataframe
len(cleaned_articles)


Unnamed: 0,link,title,content
0,https://www.today.com/news/onewheel-electric-s...,Onewheel electric skateboards recalled after 4...,"<p class="""">The self-balancing skateboards, ma..."
1,https://www.today.com/news/powerball-no-winner...,"Powerball draw produces no winners, pushing ja...","<p class="""">Saturday nights Powerball lottery ..."
2,https://www.today.com/popculture/travis-kelce-...,Travis Kelce and Chiefs hilariously trade Swif...,"<p class="""">Travis is so proud, one fan commen..."


1190

In [127]:
# delete the rows with content with less than 500 characters

cleaned_articles = cleaned_articles[cleaned_articles['content'].str.len() > 500]
len(cleaned_articles)

1161

In [128]:
''' load the same tokenizer as the one used to train the model'''

tokenizer_sentiment = AutoTokenizer.from_pretrained("lxyuan/distilbert-base-multilingual-cased-sentiments-student")

In [129]:
''' define a function that takes a text as input and returns the sentiment label and score
if the text is too long, it will be truncated to 500 tokens'''

def get_sentiment(text):
    encoded_input = tokenizer_sentiment(
        text, 
        truncation=True, 
        max_length=500, 
        return_tensors=None
    )
    truncated_text = tokenizer_sentiment.decode(encoded_input["input_ids"])
    sentiment_check = pipeline(
        model="lxyuan/distilbert-base-multilingual-cased-sentiments-student", 
        top_k=3
    )
    sentiment_result = sentiment_check(truncated_text)

    # Extract the first label and score from the first list
    sentiment_label = sentiment_result[0][0]['label']
    sentiment_score = sentiment_result[0][0]['score']
    return sentiment_label, sentiment_score

In [130]:
''' For testing purposes create a short dataframe. Otherwise news_short = news
Assign the sentiment and score to the news_short then filter only the positive news as a new DataFrame'''

# display progress bar
from tqdm import tqdm

# Create a short dataframe for testing purposes
articles_short = cleaned_articles.copy().head(20)

# Create a new column for sentiment and score and apply the get_sentiment function with tqdm
tqdm.pandas()
articles_short[['sentiment', 'score']] = articles_short['content'].progress_apply(lambda x: pd.Series(get_sentiment(x)))

# Filter only the positive news as a new DataFrame that have a score higher than 0.5
positive_news = articles_short[(articles_short['sentiment'] == 'positive') & (articles_short['score'] > 0.5)]

100%|██████████| 20/20 [00:41<00:00,  2.08s/it]


In [131]:
positive_news.sort_values(by=['score'], ascending=False).head(3)

Unnamed: 0,link,title,content,sentiment,score
8,https://www.today.com/food/creative-coffee-dri...,Creative coffee drinks for any time of day,"<p class=""""><em>(Sponsored by Folgers.)</em></...",positive,0.921967
5,https://www.today.com/health/diet-fitness/star...,We want to share your Start TODAY walking phot...,"<p class="""">Are you one of the 550,000 Start T...",positive,0.808883
14,https://www.today.com/parents/dads/maya-hawke-...,Ethan Hawke and daughter Maya Hawke release so...,"<p class="""">Maya Hawke's parents, Ethan Hawke ...",positive,0.732843


In [132]:
import pandas as pd
from bs4 import BeautifulSoup
from transformers import AutoTokenizer

# Initialize a tokenizer
tokenizer_summary = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

# Function to split content into paragraphs
def split_into_paragraphs(content):
    soup = BeautifulSoup(content, 'html.parser')
    return [p.get_text().strip() for p in soup.find_all('p') if p.get_text().strip()]

# Function to group paragraphs with a token limit
def group_paragraphs(paragraphs, token_limit=900):
    grouped_paragraphs = []
    current_group = []
    current_count = 0

    for para in paragraphs:
        para_tokens = tokenizer_summary.tokenize(para)
        para_token_count = len(para_tokens)

        if current_count + para_token_count > token_limit and current_group:
            grouped_paragraphs.append(" ".join(current_group))
            current_group = []
            current_count = 0

        current_group.append(para)
        current_count += para_token_count

    if current_group:
        grouped_paragraphs.append(" ".join(current_group))

    return grouped_paragraphs

# Splitting content based on token count
paragraphs_data = []

for index, row in positive_news.iterrows():
    paragraphs = split_into_paragraphs(row["content"])
    total_tokens = sum(len(tokenizer_summary.tokenize(p)) for p in paragraphs)

    if total_tokens > 900:
        grouped_paras = group_paragraphs(paragraphs)
        for i, grouped_para in enumerate(grouped_paras):
            paragraphs_data.append({
                "link": row["link"],
                "title": row["title"],
                "paragraph_number": i + 1,
                "content": grouped_para
            })
    else:
        paragraphs_data.append({
            "link": row["link"],
            "title": row["title"],
            "paragraph_number": 1,
            "content": row["content"]
        })

paragraphs_df = pd.DataFrame(paragraphs_data)

In [133]:
paragraphs_df

Unnamed: 0,link,title,paragraph_number,content
0,https://www.today.com/health/diet-fitness/star...,We want to share your Start TODAY walking phot...,1,"<p class="""">Are you one of the 550,000 Start T..."
1,https://www.today.com/food/creative-coffee-dri...,Creative coffee drinks for any time of day,1,"<p class=""""><em>(Sponsored by Folgers.)</em></..."
2,https://www.today.com/shop/self-care-bath-prod...,Calm the chaos with these 22 spa-like products...,1,While it's nice to dream about regular facials...
3,https://www.today.com/shop/self-care-bath-prod...,Calm the chaos with these 22 spa-like products...,2,Sometimes relaxing means sitting in the tub wi...
4,https://www.today.com/popculture/tv/dancing-wi...,Latin night says goodbye to one child star in ...,1,"<p class="""">Another celebrity's journey on ""Da..."
5,https://www.today.com/parents/dads/maya-hawke-...,Ethan Hawke and daughter Maya Hawke release so...,1,"Maya Hawke's parents, Ethan Hawke and Uma Thur..."
6,https://www.today.com/parents/dads/maya-hawke-...,Ethan Hawke and daughter Maya Hawke release so...,2,Thurman also opened up about what made it poss...
7,https://www.today.com/life/holidays/celebrity-...,The stars go all out for Halloween 2023,1,"<p class="""">This year, the spooky Oct. 31 holi..."
8,https://www.today.com/parents/celebrity/neil-p...,Neil Patrick Harris and his family do the best...,1,"<p class="""">Step aside, pumpkins. Hold on to y..."
9,https://www.today.com/popculture/kardashian-je...,See the Kardashian-Jenner family’s sweet and s...,1,"<p class="""">""So good!! ,"" one person said.</p>..."


In [134]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

In [114]:
from tqdm import tqdm
# Apply tqdm to show progress
tqdm.pandas()

def safe_summarize(text):
    try:
        summary = summarizer(text, max_length=300, min_length=50, do_sample=False, truncation=True)
        if summary:
            return summary[0]['summary_text']
        else:
            return "No summary available"
    except Exception as e:
        return f"Error in summarization: {str(e)}"

paragraphs_df['summary'] = paragraphs_df['content'].progress_apply(safe_summarize)

# Sort by 'link' and 'paragraph_number', then group by 'link' and join summaries
sorted_df = paragraphs_df.sort_values(by=['link', 'paragraph_number'])
final_df = sorted_df.groupby('link').agg({'title': 'first', 'summary': ' '.join}).reset_index()


 19%|█▉        | 8/42 [01:03<05:15,  9.29s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (1025 > 1024). Running this sequence through the model will result in indexing errors
 90%|█████████ | 38/42 [06:28<00:38,  9.55s/it]Your max_length is set to 100, but your input_length is only 28. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
100%|██████████| 42/42 [07:07<00:00, 10.40s/it]Your max_length is set to 100, but your input_length is only 88. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)
100%|██████████| 42/42 [07:12<00:00, 10.31s/it]

                                                 link  \
0   https://www.today.com/food/creative-coffee-dri...   
1   https://www.today.com/food/restaurants/taco-be...   
2   https://www.today.com/health/diet-fitness/star...   
3   https://www.today.com/health/mind-body/cat-pre...   
4   https://www.today.com/health/pete-bridgette-wi...   
5   https://www.today.com/life/holidays/blog/2023-...   
6   https://www.today.com/life/holidays/celebrity-...   
7   https://www.today.com/life/holidays/thanksgivi...   
8   https://www.today.com/parents/celebrity/neil-p...   
9   https://www.today.com/parents/dads/maya-hawke-...   
10  https://www.today.com/popculture/heidi-klum-ha...   
11  https://www.today.com/popculture/kardashian-je...   
12  https://www.today.com/popculture/movies/witch-...   
13  https://www.today.com/popculture/news/live-blo...   
14  https://www.today.com/popculture/north-west-in...   
15  https://www.today.com/popculture/today-show-ha...   
16  https://www.today.com/popcu




In [151]:
final_df

Unnamed: 0,link,title,summary
0,https://www.today.com/food/creative-coffee-dri...,Creative coffee drinks for any time of day,"This drink is like the sophisticated, nuanced,..."
1,https://www.today.com/food/restaurants/taco-be...,Taco Bell is giving away its new breakfast tac...,Customers with a Taco Lovers Pass can choose f...
2,https://www.today.com/health/diet-fitness/star...,We want to share your Start TODAY walking phot...,Send us a snap of your walk or run outdoors. ...
3,https://www.today.com/health/mind-body/cat-pre...,Woman went to her pulmonologist for COPD. Her ...,Dr. Earl King is a pulmonologist at Sentara RM...
4,https://www.today.com/health/pete-bridgette-wi...,Pete Sampras reveals wife Bridgette has ovaria...,Tennis legend Pete Sampras has revealed that h...
5,https://www.today.com/life/holidays/blog/2023-...,TODAY Halloween 2023 live updates: See all the...,The whole gang is in on this years big Plaza r...
6,https://www.today.com/life/holidays/celebrity-...,The stars go all out for Halloween 2023,Error in summarization: index out of range in ...
7,https://www.today.com/life/holidays/thanksgivi...,60 best Thanksgiving songs to play during dinn...,Don Henley pays homage to baseball in his popu...
8,https://www.today.com/parents/celebrity/neil-p...,Neil Patrick Harris and his family do the best...,The family of four unveiled their Halloween 20...
9,https://www.today.com/parents/dads/maya-hawke-...,Ethan Hawke and daughter Maya Hawke release so...,"Maya Hawke and her father, Ethan Hawke, record..."


In [148]:
prod_df = final_df.copy()

In [152]:
tqdm.pandas()
prod_df[['sentiment', 'score']] = prod_df['summary'].progress_apply(lambda x: pd.Series(get_sentiment(x)))

# Filter only the positive news as a new DataFrame that have a score higher than 0.5
prod_positive = prod_df[(prod_df['sentiment'] == 'positive') & (prod_df['score'] > 0.7)]

100%|██████████| 26/26 [01:01<00:00,  2.36s/it]


In [153]:
prod_positive

Unnamed: 0,link,title,summary,sentiment,score
0,https://www.today.com/food/creative-coffee-dri...,Creative coffee drinks for any time of day,"This drink is like the sophisticated, nuanced,...",positive,0.795621
7,https://www.today.com/life/holidays/thanksgivi...,60 best Thanksgiving songs to play during dinn...,Don Henley pays homage to baseball in his popu...,positive,0.8361
8,https://www.today.com/parents/celebrity/neil-p...,Neil Patrick Harris and his family do the best...,The family of four unveiled their Halloween 20...,positive,0.753958
9,https://www.today.com/parents/dads/maya-hawke-...,Ethan Hawke and daughter Maya Hawke release so...,"Maya Hawke and her father, Ethan Hawke, record...",positive,0.878997
14,https://www.today.com/popculture/north-west-in...,North West was asked who her style icon is. Yo...,The 10-year-old also talked about her love of ...,positive,0.822209
16,https://www.today.com/popculture/tv/dancing-wi...,Latin night says goodbye to one child star in ...,Zoey 101 star Jamie Lynn Spears took a bow aft...,positive,0.705338
17,https://www.today.com/shop/11-best-gift-ideas-...,32 best gift ideas for seniors that they'll lo...,When it comes to gifts for seniors its all abo...,positive,0.779
20,https://www.today.com/shop/best-gifts-21-year-...,"Starting at $12, these gifts for 21-year-olds ...",The Chargetree Swing allows them to charge the...,positive,0.866854
21,https://www.today.com/shop/best-gifts-30-year-...,45 gifts for 30-year-olds that will effortless...,The Legend of Zelda is a classic video game th...,positive,0.824267
22,https://www.today.com/shop/best-slip-on-sneake...,Editors and experts swear by these slip-on sne...,Slip-on sneakers are easier to take on and off...,positive,0.775693


In [155]:
summary_df = prod_positive[['summary']]

# Exporting to CSV
summary_df.to_csv('summaries.csv', index=False)