In [1]:
''' import libraries
Autotokenizer: to count the number of tokens in a text
pipeline: to use the sentiment analysis model from huggingface
pandas: manipulate dataframes'''

from transformers import AutoTokenizer, pipeline
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
''' load the data'''

path_paragraphs = r'..\data\Sprint3 - cleaned_data - Erjon.csv'
path_articles = r'..\data\cleaned_output.csv'

news_articles = pd.read_csv(path_articles)
news_paragraphs = pd.read_csv(path_paragraphs)

In [26]:
import pandas as pd
from bs4 import BeautifulSoup

# Function to check if a paragraph is likely an ad or irrelevant
def is_irrelevant_paragraph(paragraph):
    text = paragraph.get_text().strip()
    if not text:
        return True
    if "menu-section-heading" in paragraph.get("class", []):
        return True
    if text.startswith("©") or "byline-bio" in paragraph.get("class", []):
        return True
    if paragraph.find('a') and 'href' in str(paragraph):
        return True
    return False

# Function to process HTML content and extract relevant full text
def process_html(content):
    soup = BeautifulSoup(content, 'html.parser')
    paragraphs = soup.find_all('p')
    
    # Filter out irrelevant paragraphs and concatenate
    relevant_paragraphs = [str(p) for p in paragraphs if not is_irrelevant_paragraph(p)]
    full_text = ''.join(relevant_paragraphs)
    
    return full_text

# Creating a new dataframe for cleaned data
cleaned_data = []

for index, row in news_articles.iterrows():
    full_text = process_html(row["content"])
    cleaned_data.append({
        "link": row["link"],
        "title": row["title"],
        "content": full_text
    })

cleaned_articles = pd.DataFrame(cleaned_data)

# Output the cleaned dataframe
display(cleaned_articles.head(3))  # Prints first few rows of the dataframe


Unnamed: 0,link,title,content
0,https://www.today.com/news/onewheel-electric-s...,Onewheel electric skateboards recalled after 4...,"<p class="""">The self-balancing skateboards, ma..."
1,https://www.today.com/news/powerball-no-winner...,"Powerball draw produces no winners, pushing ja...","<p class="""">Saturday night’s Powerball lottery..."
2,https://www.today.com/popculture/travis-kelce-...,Travis Kelce and Chiefs hilariously trade Swif...,"<p class="""">“Travis is so proud,” one fan comm..."
3,https://www.today.com/food/news/applebees-doll...,Applebee’s brings back fan-favorite menu item ...,"<p class="""">October is the start of spooky sea..."
4,https://www.today.com/parents/pregnancy/chromh...,Woman’s sweat turned blue during pregnancy: ‘I...,"<p class="""">When Keisha Sethi became pregnant ..."


In [23]:
''' load the same tokenizer as the one used to train the model'''

tokenizer = AutoTokenizer.from_pretrained("lxyuan/distilbert-base-multilingual-cased-sentiments-student")

In [24]:
''' define a function that takes a text as input and returns the sentiment label and score
if the text is too long, it will be truncated to 500 tokens'''

def get_sentiment(text):
    encoded_input = tokenizer(
        text, 
        truncation=True, 
        max_length=500, 
        return_tensors=None
    )
    truncated_text = tokenizer.decode(encoded_input["input_ids"])
    sentiment_check = pipeline(
        model="lxyuan/distilbert-base-multilingual-cased-sentiments-student", 
        top_k=3
    )
    sentiment_result = sentiment_check(truncated_text)

    # Extract the first label and score from the first list
    sentiment_label = sentiment_result[0][0]['label']
    sentiment_score = sentiment_result[0][0]['score']
    return sentiment_label, sentiment_score

In [51]:
''' For testing purposes create a short dataframe. Otherwise news_short = news
Assign the sentiment and score to the news_short then filter only the positive news as a new DataFrame'''

# display progress bar
from tqdm import tqdm

# Create a short dataframe for testing purposes
articles_short = cleaned_articles.copy().head(20)

# Create a new column for sentiment and score and apply the get_sentiment function with tqdm
tqdm.pandas()
articles_short[['sentiment', 'score']] = articles_short['content'].progress_apply(lambda x: pd.Series(get_sentiment(x)))

# Filter only the positive news as a new DataFrame that have a score higher than 0.5
positive_news = articles_short[(articles_short['sentiment'] == 'positive') & (articles_short['score'] > 0.5)]

100%|██████████| 20/20 [00:48<00:00,  2.43s/it]


In [54]:
positive_news.sort_values(by=['score'], ascending=False).head(3)

Unnamed: 0,link,title,content,sentiment,score
8,https://www.today.com/food/creative-coffee-dri...,Creative coffee drinks for any time of day,"<p class=""""><em>(Sponsored by Folgers.)</em></...",positive,0.918229
5,https://www.today.com/health/diet-fitness/star...,We want to share your Start TODAY walking phot...,"<p class="""">Are you one of the 550,000 Start T...",positive,0.795192
16,https://www.today.com/life/holidays/celebrity-...,The stars go all out for Halloween 2023,"<p class="""">This year, the spooky Oct. 31 holi...",positive,0.783131


In [53]:
import pandas as pd
from bs4 import BeautifulSoup

# Assuming the DataFrame is named 'positive_news' and the relevant column is 'content'
# positive_news = pd.read_csv('your_file.csv') # Uncomment if reading from a CSV file

# Function to split content into paragraphs
def split_into_paragraphs(content):
    soup = BeautifulSoup(content, 'html.parser')
    return [p.get_text().strip() for p in soup.find_all('p') if p.get_text().strip()]

# Splitting each content into paragraphs and creating a new dataframe
paragraphs_data = []

for index, row in positive_news.iterrows():
    paragraphs = split_into_paragraphs(row["content"])
    for i, para in enumerate(paragraphs):
        paragraphs_data.append({
            "link": row["link"],
            "title": row["title"],
            "paragraph_number": i + 1,
            "content": para
        })

paragraphs_df = pd.DataFrame(paragraphs_data)

# Output the new dataframe with separated paragraphs
display(paragraphs_df.head(3))  # Prints first few rows of the dataframe


Unnamed: 0,link,title,paragraph_number,content
0,https://www.today.com/food/news/applebees-doll...,Applebee’s brings back fan-favorite menu item ...,1,"October is the start of spooky season, so one ..."
1,https://www.today.com/food/news/applebees-doll...,Applebee’s brings back fan-favorite menu item ...,2,"On Oct. 1, Applebee’s announced its world-famo..."
2,https://www.today.com/food/news/applebees-doll...,Applebee’s brings back fan-favorite menu item ...,3,"Starting today, guests who are 21+ can enjoy t..."
3,https://www.today.com/food/news/applebees-doll...,Applebee’s brings back fan-favorite menu item ...,4,“After more than three years of listening to g...
4,https://www.today.com/food/news/applebees-doll...,Applebee’s brings back fan-favorite menu item ...,5,Because of the Dollarita appearing like a blue...
5,https://www.today.com/food/news/applebees-doll...,Applebee’s brings back fan-favorite menu item ...,6,"The first, called Dracula’s Juice, is a lemon-..."
6,https://www.today.com/health/diet-fitness/star...,We want to share your Start TODAY walking phot...,1,"Are you one of the 550,000 Start TODAY members..."
7,https://www.today.com/health/diet-fitness/star...,We want to share your Start TODAY walking phot...,2,You may be contacted for a future segment or d...
8,https://www.today.com/health/diet-fitness/star...,We want to share your Start TODAY walking phot...,3,Happy walking!
9,https://www.today.com/health/diet-fitness/star...,We want to share your Start TODAY walking phot...,4,The Start TODAY team
