SENTIMENT ANALYSIS

We will use the VADER library (included in the nltk library) to compute a sentiment score for each headlines of a stock. VADER is lexicon-based so it uses keywords to give a score.
We use NVIDIA here because there are lots of news on this company nowadays.
We get our news from finviz website because it aggregates news from different media sources (YF, Reuters, CNBC..) and it is free.

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import plotly.express as px
import nltk
nltk.downloader.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import requests

In [2]:
#We use finviz website because it aggregates the news from different medias (reuteurs, cnbc...) and it is free
finviz_url = 'https://finviz.com/quote.ashx?t='

Step 1: Webscrapping the finviz website, parsing the html response

In [3]:
def get_news(ticker):
    url1 = finviz_url + ticker
    req = requests.get(url1, headers={"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0"})
    
    # Creating the Soup with the html response
    html = BeautifulSoup(req.content, "html.parser")
    
    # Find 'news-table' in the html Soup and load it into 'news_table'
    news_table = html.find(id='news-table')
    return news_table

ticker = 'NVDA'
news_table = get_news(ticker)
news_table # print the pure html code, data is not parsed yet


<table border="0" cellpadding="1" cellspacing="0" class="fullview-news-outer news-table" id="news-table" width="100%">
<tr class="cursor-pointer has-label" onclick="trackAndOpenNews(event, 'Motley Fool', 'https://www.fool.com/investing/2025/03/23/2-magnificent-seven-stocks-to-buy-on-the-dip/?source=finviz\u0026utm_source=finviz-host-full\u0026utm_medium=feed\u0026utm_campaign=article\u0026referring_guid=3bdc1c1c-5f8c-4449-b8eb-82026ed8ac72');">
<td align="right" width="130">
            Today 04:30AM
        </td>
<td align="left">
<div class="news-link-container">
<div class="news-link-left">
<a class="tab-link-news" href="https://www.fool.com/investing/2025/03/23/2-magnificent-seven-stocks-to-buy-on-the-dip/?source=finviz&amp;utm_source=finviz-host-full&amp;utm_medium=feed&amp;utm_campaign=article&amp;referring_guid=3bdc1c1c-5f8c-4449-b8eb-82026ed8ac72" rel="nofollow" target="_blank">2 "Magnificent Seven" Stocks to Buy on the Dip</a>
</div>
<div class="news-link-right">
<span>(Motley

In [4]:
#Need to Parse the pure html text above

def parse_news_table(news_table):
    # Find all rows in the table. tr = table row in html
    rows = news_table.find_all('tr')
    
    dates = []
    headlines = []
    sources = []
    links = []
    
    # Loop through each row and extract the relevant data
    for row in rows:
        # Get the date and time. td = table data in html
        date_time = row.find('td', align='right')
        if date_time:
            dates.append(date_time.text.strip())
        
        # Get the headline and link
        headline_tag = row.find('a', class_='tab-link-news')
        if headline_tag:
            headlines.append(headline_tag.text.strip())
            links.append(headline_tag['href'])
        
        # Get the source (YF, Reuters...) div = genereic container in html, used for grouping content
        source_tag = row.find('div', class_='news-link-right')
        if source_tag:
            sources.append(source_tag.text.strip())
    
    # Create a DataFrame from the lists
    news_df = pd.DataFrame({
        'datetime': dates,
        'Headline': headlines,
        'Source': sources,
        'Link': links
    })

    # Convert datetime to proper format
    news_df['datetime'] = pd.to_datetime(news_df['datetime'], errors='coerce')

    return news_df

# Example usage:
ticker = 'NVDA'
news_table = get_news(ticker)
parsed_news_df = parse_news_table(news_table)

# Display the parsed DataFrame
print(parsed_news_df.head())

             datetime                                           Headline  \
0                 NaT     2 "Magnificent Seven" Stocks to Buy on the Dip   
1 2025-03-23 03:55:00  Palantir Stock vs. Nvidia Stock: Wall Street S...   
2 2025-03-23 03:02:00  Broadcom: Trillion Dollar Timeout (Rating Down...   
3 2025-03-23 02:06:00           Nvidia: Ignore The Noise And Buy The Dip   
4 2025-03-22 20:55:00  Is NVIDIA Corporation (NVDA) The Best Stock to...   

             Source                                               Link  
0     (Motley Fool)  https://www.fool.com/investing/2025/03/23/2-ma...  
1     (Motley Fool)  https://www.fool.com/investing/2025/03/23/pala...  
2    (SeekingAlpha)  https://seekingalpha.com/article/4769739-broad...  
3    (SeekingAlpha)  https://seekingalpha.com/article/4769736-nvidi...  
4  (Insider Monkey)  https://www.insidermonkey.com/blog/is-nvidia-c...  


  news_df['datetime'] = pd.to_datetime(news_df['datetime'], errors='coerce')


Step 2: Using the VADER sentiment analyzer to give a score to each headlines

In [5]:
def score_news(parsed_news_df):
    #Creating an instance of the vader sentiment analyzer
    vader = SentimentIntensityAnalyzer()

    # Iterate through the headlines and get the polarity / sentiment scores using vader
    scores = parsed_news_df['Headline'].apply(vader.polarity_scores).tolist()

    # Convert the 'scores' list of dicts into a DataFrame
    scores_df = pd.DataFrame(scores)

    # Join the DataFrames of the news and the list of dicts (scores)
    parsed_and_scored_news = parsed_news_df.join(scores_df, rsuffix='-right')
    parsed_and_scored_news = parsed_and_scored_news.set_index('datetime')

    # Remove 'date' and 'time' columns only if they exist
    for col in ['date', 'time']:
        if col in parsed_and_scored_news.columns:
            parsed_and_scored_news = parsed_and_scored_news.drop([col], axis=1)

    # Rename the sentiment column to 'sentiment_score'
    parsed_and_scored_news = parsed_and_scored_news.rename(columns={"compound": "sentiment_score"})

    return parsed_and_scored_news

# Example usage:
parsed_and_scored_news = score_news(parsed_news_df)
parsed_and_scored_news.head()


Unnamed: 0_level_0,Headline,Source,Link,neg,neu,pos,sentiment_score
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
NaT,"2 ""Magnificent Seven"" Stocks to Buy on the Dip",(Motley Fool),https://www.fool.com/investing/2025/03/23/2-ma...,0.0,0.642,0.358,0.5994
2025-03-23 03:55:00,Palantir Stock vs. Nvidia Stock: Wall Street S...,(Motley Fool),https://www.fool.com/investing/2025/03/23/pala...,0.0,1.0,0.0,0.0
2025-03-23 03:02:00,Broadcom: Trillion Dollar Timeout (Rating Down...,(SeekingAlpha),https://seekingalpha.com/article/4769739-broad...,0.0,1.0,0.0,0.0
2025-03-23 02:06:00,Nvidia: Ignore The Noise And Buy The Dip,(SeekingAlpha),https://seekingalpha.com/article/4769736-nvidi...,0.263,0.737,0.0,-0.3612
2025-03-22 20:55:00,Is NVIDIA Corporation (NVDA) The Best Stock to...,(Insider Monkey),https://www.insidermonkey.com/blog/is-nvidia-c...,0.0,0.756,0.244,0.6369


In [6]:
print(parsed_and_scored_news['sentiment_score'])

datetime
NaT                    0.5994
2025-03-23 03:55:00    0.0000
2025-03-23 03:02:00    0.0000
2025-03-23 02:06:00   -0.3612
2025-03-22 20:55:00    0.6369
                        ...  
2025-03-23 06:09:00   -0.1280
2025-03-23 06:09:00    0.4767
2025-03-23 06:02:00   -0.3818
2025-03-23 05:45:00    0.5574
2025-03-23 05:15:00    0.0000
Name: sentiment_score, Length: 100, dtype: float64


Step 3 : Visualizing the data

In [7]:
def plot_sentiment(parsed_and_scored_news, ticker):
    # Ensure that 'datetime' is the index
    if 'datetime' not in parsed_and_scored_news.index.name:
        parsed_and_scored_news = parsed_and_scored_news.set_index('datetime')

    # Check if sentiment_score exists and is numeric
    if 'sentiment_score' in parsed_and_scored_news.columns:
        # Convert sentiment_score to numeric. If any error, coerce = transform the value to NaN
        parsed_and_scored_news['sentiment_score'] = pd.to_numeric(parsed_and_scored_news['sentiment_score'], errors='coerce')
    else:
        print("Error: 'sentiment_score' column not found.")

    # Check if there are any NaN values and remove them
    if parsed_and_scored_news['sentiment_score'].isnull().any():
        print("Warning: There are NaN values in sentiment_score.")
        parsed_and_scored_news = parsed_and_scored_news.dropna(subset=['sentiment_score'])

    # Hourly sentiment calculation
    parsed_and_scored_news['hour'] = parsed_and_scored_news.index.hour
    hourly_mean = parsed_and_scored_news.groupby('hour')['sentiment_score'].mean()

    # Daily sentiment calculation
    parsed_and_scored_news['date'] = parsed_and_scored_news.index.date
    daily_mean = parsed_and_scored_news.groupby('date')['sentiment_score'].mean()

    # Plotting both Hourly and Daily Sentiment Scores
    # Hourly chart
    fig1 = px.bar(hourly_mean, x=hourly_mean.index, y=hourly_mean, title=f'{ticker} Hourly Sentiment Scores', labels={'hour': 'Hour of Day', 'sentiment_score': 'Mean Sentiment Score'})

    # Daily chart
    fig2 = px.bar(daily_mean, x=daily_mean.index, y=daily_mean, title=f'{ticker} Daily Sentiment Scores', labels={'date': 'Date', 'sentiment_score': 'Mean Sentiment Score'})

    # Display both charts
    fig1.show()
    fig2.show()

# Example usage
plot_sentiment(parsed_and_scored_news, 'NVDA')
