In [1]:
from bs4 import BeautifulSoup
import requests
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from datetime import datetime
import pandas as pd

In [2]:
def get_text(url):

    response = requests.get(url, headers={'User-Agent': 'Custom'})
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the time element
    time_element = soup.find('time', class_='byline-attr-meta-time')

    if time_element:

        # Get the text from the time element
        time_str = time_element.get_text()

        # Use pd.to_datetime to parse the time string
        try:

            # Convert to datetime and format to 'YYYY-MM-DD'
            formatted_time = pd.to_datetime(time_str).strftime('%Y-%m-%d')

        except ValueError:
            formatted_time = "Invalid date format"

    else:
        formatted_time = None

    # Extract the body text
    body = soup.find('div', class_='body yf-5ef8bf')
    text = body.get_text() if body else ""

    return formatted_time, text

In [3]:
def sentiment(date, text: str) -> dict:
  # Load the FinBERT tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')
    model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')

    # Tokenize the input text
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Get softmax probabilities
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1).numpy()[0]

    # Prepare sentiment scores
    sentiment_scores = {
        'date' : date,
        'positive': probabilities[0],
        'neutral': probabilities[1],
        'negative': probabilities[2]
    }

    # Determine the overall sentiment conclusion
    conclusion = ""

    # Access the sentiment scores directly using their individual keys
    positive_score = sentiment_scores.get('positive', 0)
    neutral_score = sentiment_scores.get('neutral', 0)
    negative_score = sentiment_scores.get('negative', 0)

    if max(positive_score, neutral_score, negative_score) == positive_score:
      conclusion = 'Positive'
    elif max(positive_score, neutral_score, negative_score) == neutral_score:
      conclusion = 'Neutral'
    else:
      conclusion = 'Negative'

    # Add conclusion to the sentiment scores
    sentiment_scores['conclusion'] = conclusion

    return sentiment_scores

In [4]:
def sentiment_analysis(url):
  date, text = get_text(url)
  result = sentiment(date, text)
  return result



# Main function

In [5]:
sentiment_analysis('https://finance.yahoo.com/news/national-security-investigation-launched-15bn-104614465.html')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

{'date': '2024-02-05',
 'positive': 0.35916328,
 'neutral': 0.21665174,
 'negative': 0.42418492,
 'conclusion': 'Negative'}

In [6]:
sentiment_analysis('https://finance.yahoo.com/news/li-ka-shings-ck-hutchison-093000665.html')

{'date': '2024-02-13',
 'positive': 0.35916328,
 'neutral': 0.21665174,
 'negative': 0.42418492,
 'conclusion': 'Negative'}