In [1]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests
import re
from transformers import pipeline
from dateutil.parser import parse
import pytz
import datetime
# Define the name of the pre-trained Pegasus model for financial summarization
model_name = "human-centered-summarization/financial-summarization-pegasus"

# Initialize the tokenizer for the Pegasus model
tokenizer = PegasusTokenizer.from_pretrained(model_name)

# Initialize the pre-trained Pegasus model for conditional text generation
model = PegasusForConditionalGeneration.from_pretrained(model_name)

# Define the URL to be used for web scraping financial news related to the US economy
url = "https://www.google.com/search?sxsrf=APwXEdfoORN23DmKxvNwhGnPcYR7KYaVng:1686905788002&q=us+economy&tbm=nws&sa=X&ved=2ahUKEwiWuY3itcf_AhUDkYkEHYCHA2EQ0pQJegQIBxAB&biw=1920&bih=872&dpr=2"

# Send an HTTP GET request to the URL and store the response
r = requests.get(url)

# Parse the HTML content of the response using BeautifulSoup
soup = BeautifulSoup(r.text, 'html.parser')

# Extract all paragraph elements from the HTML content
paragraphs = soup.find_all('p')

# Define a list of monitored tickers (in this case, only 'Economy')
monitored_tickers = ['Economy']

# Function to search for news URLs related to a given ticker
def search_for_stock_news_urls(ticker):
    urls = []
    for page in range(5):  # Change the number of pages here
        start_index = page * 10
        search_url = f"https://www.google.com/search?sxsrf=APwXEdfoORN23DmKxvNwhGnPcYR7KYaVng:1686905788002&q=us+economy{ticker}&tbm=nws&start={start_index}"
        r = requests.get(search_url)
        soup = BeautifulSoup(r.text, 'html.parser')
        atags = soup.find_all('a')
        hrefs = [link['href'] for link in atags]
        urls.extend(hrefs)
    return urls

# Create a dictionary that maps each ticker to a list of URLs containing news articles related to that ticker
raw_urls = {ticker: search_for_stock_news_urls(ticker) for ticker in monitored_tickers}

# Define a list of words to exclude from the URLs
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

# Function to clean the URLs by removing unwanted ones based on specific criteria
def strip_unwanted_urls(urls, exclude_list):
    val = []
    for url in urls:
        if 'https://' in url and not any(exclude_word in url for exclude_word in exclude_list):
            res = re.findall(r'(https?://\S+)', url)[0].split('&')[0]
            val.append(res)
    return list(set(val))

# Create a dictionary that maps each ticker to a list of cleaned URLs
cleaned_urls = {ticker: strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}

# Function to scrape and process the content of the URLs
def scrape_and_process(URLs):
    ARTICLES = []
    for i, url in enumerate(URLs):
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')

        # Extract date and time information from the article
        datetime_element = soup.find('time')
        if datetime_element and 'datetime' in datetime_element.attrs:
            datetime_str = datetime_element.get('datetime')
            article_datetime = parse(datetime_str).replace(tzinfo=None)
        else:
            article_datetime = None

        # Extract paragraphs from the article and limit the text to 350 words
        paragraphs = soup.find_all('p')
        text = [paragraph.text for paragraph in paragraphs]
        words = ' '.join(text).split(' ')[:350]
        ARTICLE = ' '.join(words)

        # Add article and datetime to the list of articles
        ARTICLES.append((f"Article {i+1}", ARTICLE, article_datetime))

    # Sort articles by datetime in ascending order
    ARTICLES.sort(key=lambda x: x[2] if x[2] is not None else datetime.datetime.min)

    return ARTICLES

# Create a dictionary that maps each ticker to a list of processed articles
articles = {ticker: scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}

# Initialize an empty list to store filtered articles for each ticker
filtered_articles = []

# Loop over each ticker to filter articles with valid datetime information
for ticker in monitored_tickers:
    ticker_articles = articles[ticker]
    filtered_ticker_articles = []

    for article_number, article, datetime in ticker_articles:
        if datetime is not None:
            filtered_ticker_articles.append((article_number, article, datetime))

    filtered_articles.append(filtered_ticker_articles)

# Function to summarize each article using the Pegasus model
def summarize(articles):
    summaries = []
    for article_number, article, _ in articles:
        input_ids = tokenizer.encode(article, return_tensors='pt')
        output = model.generate(input_ids, max_length=55, num_beams=10, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summary = summary.replace("We are aware of the issue and are working to resolve it.", "")
        summary = summary.replace("All images are copyrighted.", "")
        summary = summary.replace("Find the best credit cards, loans, insurance and more in SELECT.", "")
        summaries.append(summary)
    return summaries

# Create a list to store summarized articles for each ticker
summarized_articles = []

# Loop over each ticker to summarize its filtered articles
for ticker_articles in filtered_articles:
    summarized_ticker_articles = summarize(ticker_articles)
    summarized_articles.append(summarized_ticker_articles)

# Initialize a sentiment analysis pipeline using the Hugging Face Transformers library
sentiment = pipeline('sentiment-analysis')

# Loop over each ticker to print the summarized articles along with their sentiment analysis results
for ticker, ticker_articles, summarized_ticker_articles in zip(monitored_tickers, filtered_articles, summarized_articles):
    print(f"Ticker: {ticker}")
    for (article_number, article, datetime), summary in zip(ticker_articles, summarized_ticker_articles):
        print(f"Article {article_number}:")
        print(f"Date and Time: {datetime}")
        print(f"Summary: {summary}")
        print()
        sentiment_results = sentiment([summary])
        print("Sentiment Analysis:")
        for result in sentiment_results:
            print(result)
        print()

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


Ticker: Economy
Article Article 29:
Date and Time: 2023-06-30 15:34:27
Summary: Weekly take on events in the world economy and their fallout.

Sentiment Analysis:
{'label': 'POSITIVE', 'score': 0.9966251850128174}

Article Article 48:
Date and Time: 2023-07-12 09:38:07
Summary: Headline inflation down over two-thirds since last year. Core inflation has come down much less, but is still high

Sentiment Analysis:
{'label': 'NEGATIVE', 'score': 0.9748467803001404}

Article Article 19:
Date and Time: 2023-07-13 20:06:34
Summary: Director Brainard speaks at Economic Club of New York. Economy is ‘stabilising,’ Director Brainard says

Sentiment Analysis:
{'label': 'POSITIVE', 'score': 0.9926002621650696}

Article Article 31:
Date and Time: 2023-07-14 20:31:49.507000
Summary: Sentiment measure now exceeds pre-pandemic levels. Democrats are more optimistic, while Republicans are less so

Sentiment Analysis:
{'label': 'POSITIVE', 'score': 0.9775083065032959}

Article Article 26:
Date and Time: 2