## 1. Install and Import Baseline Dependencies

In [1]:
!pip install transformers




In [2]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

## 2. Setup Summarization Model


In [3]:
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at human-centered-summarization/financial-summarization-pegasus and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 3. Summarize a Single Article


In [10]:
url = "https://au.finance.yahoo.com/news/insurance-warning-for-aussies-wanting-to-snap-up-new-chinese-ev-brands-theres-a-catch-190046746.html"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')

In [13]:
paragraphs[1].text


'Zeekr, Jac, Deepal. You may think these names are foreign, but they’ll soon be sitting alongside Toyotas, Hyundais and Mazdas in traffic.'

In [14]:
text = [paragraph.text for paragraph in paragraphs]
words = ' '.join(text).split(' ')[:400]
ARTICLE = ' '.join(words)

In [15]:
ARTICLE


"We are experiencing some temporary issues. The market data on this page is currently delayed. Please bear with us as we address this and restore your personalised lists. Zeekr, Jac, Deepal. You may think these names are foreign, but they’ll soon be sitting alongside Toyotas, Hyundais and Mazdas in traffic. At least seven new Chinese car brands have been confirmed to launch in 2025 – or are already available – in Australia. Following the success of MG, GWM and BYD, the Chinese automotive industry has clearly seen an opportunity to offer more competitively priced, value-packed, and high-tech new cars Down Under – whether powered by petrol, diesel or electricity. But what’s the catch? $1,100 cost-of-living relief for Centrelink recipients and other Aussies ‘doing it tough’ Major EV win as Aussies can earn $12,000 from their electric car: 'Free money' $1,300 cash boost for Woolworths, Coles shoppers as major class action lawsuit launched Aside from the sharp price tags, big screens and fa

In [16]:
input_ids = tokenizer.encode(ARTICLE, return_tensors='pt')
output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [17]:
summary


'Chinese carmakers to launch in Australia in 2025.'

## 4. Building a News and Sentiment Pipeline


In [18]:
monitored_tickers = ['GME', 'TSLA', 'BTC']


## 4.1. Search for Stock News using Google and Yahoo Finance


In [19]:
def search_for_stock_news_urls(ticker):
    search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(ticker)
    r = requests.get(search_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    atags = soup.find_all('a')
    hrefs = [link['href'] for link in atags]
    return hrefs

In [20]:
raw_urls = {ticker:search_for_stock_news_urls(ticker) for ticker in monitored_tickers}
raw_urls

{'GME': ['/?sa=X&ved=0ahUKEwjskreQtOCJAxU-6ckDHXoULyAQOwgC',
  '/search?q=yahoo+finance+GME&sca_esv=ab1b28e242c283c4&ie=UTF-8&tbm=nws&gbv=1&sei=JlQ4Z-znM77Sp84P-qi8gQI',
  '/search?q=yahoo+finance+GME&sca_esv=ab1b28e242c283c4&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwjskreQtOCJAxU-6ckDHXoULyAQ_AUIBSgA',
  '/search?q=yahoo+finance+GME&sca_esv=ab1b28e242c283c4&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwjskreQtOCJAxU-6ckDHXoULyAQ_AUIBygC',
  '/search?q=yahoo+finance+GME&sca_esv=ab1b28e242c283c4&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwjskreQtOCJAxU-6ckDHXoULyAQ_AUICCgD',
  '/url?q=https://maps.google.com/maps%3Fq%3Dyahoo%2Bfinance%2BGME%26um%3D1%26ie%3DUTF-8%26ved%3D1t:200713%26ictx%3D111&opi=89978449&sa=U&ved=0ahUKEwjskreQtOCJAxU-6ckDHXoULyAQiaAMCAkoBA&usg=AOvVaw2EyEnWSoqsqB1eLb-UHLvO',
  '/url?q=/search%3Fq%3Dyahoo%2Bfinance%2BGME%26sca_esv%3Dab1b28e242c283c4%26ie%3DUTF-8%26tbm%3Dshop%26source%3Dlnms%26ved%3D1t:200713%26ictx%3D111&opi=89978449&sa=U&ved=0ahUKEwjskreQtOCJAxU-6ckDHXoULyA

In [21]:
raw_urls['GME']


['/?sa=X&ved=0ahUKEwjskreQtOCJAxU-6ckDHXoULyAQOwgC',
 '/search?q=yahoo+finance+GME&sca_esv=ab1b28e242c283c4&ie=UTF-8&tbm=nws&gbv=1&sei=JlQ4Z-znM77Sp84P-qi8gQI',
 '/search?q=yahoo+finance+GME&sca_esv=ab1b28e242c283c4&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwjskreQtOCJAxU-6ckDHXoULyAQ_AUIBSgA',
 '/search?q=yahoo+finance+GME&sca_esv=ab1b28e242c283c4&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwjskreQtOCJAxU-6ckDHXoULyAQ_AUIBygC',
 '/search?q=yahoo+finance+GME&sca_esv=ab1b28e242c283c4&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwjskreQtOCJAxU-6ckDHXoULyAQ_AUICCgD',
 '/url?q=https://maps.google.com/maps%3Fq%3Dyahoo%2Bfinance%2BGME%26um%3D1%26ie%3DUTF-8%26ved%3D1t:200713%26ictx%3D111&opi=89978449&sa=U&ved=0ahUKEwjskreQtOCJAxU-6ckDHXoULyAQiaAMCAkoBA&usg=AOvVaw2EyEnWSoqsqB1eLb-UHLvO',
 '/url?q=/search%3Fq%3Dyahoo%2Bfinance%2BGME%26sca_esv%3Dab1b28e242c283c4%26ie%3DUTF-8%26tbm%3Dshop%26source%3Dlnms%26ved%3D1t:200713%26ictx%3D111&opi=89978449&sa=U&ved=0ahUKEwjskreQtOCJAxU-6ckDHXoULyAQiaAMCAooBQ&us

## 4.2. Strip out unwanted URLs


In [22]:
import re


In [23]:
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']


In [24]:
def strip_unwanted_urls(urls, exclude_list):
    val = []
    for url in urls:
        if 'https://' in url and not any(exclude_word in url for exclude_word in exclude_list):
            res = re.findall(r'(https?://\S+)', url)[0].split('&')[0]
            val.append(res)
    return list(set(val))

In [25]:
cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
cleaned_urls

{'GME': ['https://finance.yahoo.com/news/gamestop-gme-falls-more-steeply-224520779.html',
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BGME%26tbm%3Dnws%26pccc%3D1',
  'https://finance.yahoo.com/news/why-diamond-hill-long-short-122844695.html',
  'https://finance.yahoo.com/news/gamestop-gme-stock-drops-despite-214519430.html',
  'https://finance.yahoo.com/news/why-gamestop-gme-dipped-more-214520142.html',
  'https://finance.yahoo.com/news/heres-why-gamestop-gme-fell-224520124.html',
  'https://finance.yahoo.com/news/gamestop-gme-registers-bigger-fall-214521016.html',
  'https://finance.yahoo.com/news/gamestop-gme-stock-moves-0-215522235.html',
  'https://finance.yahoo.com/news/investors-heavily-search-gamestop-corp-130016370.html',
  'https://finance.yahoo.com/news/jim-cramer-says-gamestop-gme-134244137.html',
  'https://finance.yahoo.com/news/gamestop-gme-stock-moves-1-214524513.html'],
 'TSLA': ['https://finance.yahoo.com/video/hedge-funds-shorting-tesla-stock-145155395.html'

## 4.3. Search and Scrape Cleaned URLs


In [26]:
def scrape_and_process(URLs):
    ARTICLES = []
    for url in URLs:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all('p')
        text = [paragraph.text for paragraph in paragraphs]
        words = ' '.join(text).split(' ')[:350]
        ARTICLE = ' '.join(words)
        ARTICLES.append(ARTICLE)
    return ARTICLES

In [27]:
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

{'GME': ["We are experiencing some temporary issues. The market data on this page is currently delayed. Please bear with us as we address this and restore your personalized lists. GameStop (GME) closed the latest trading day at $22.33, indicating a -0.36% change from the previous session's end. This change lagged the S&P 500's 0.28% loss on the day. Meanwhile, the Dow lost 0.61%, and the Nasdaq, a tech-heavy index, lost 0.33%. Heading into today, shares of the video game retailer had gained 4.72% over the past month, outpacing the Consumer Discretionary sector's gain of 2.26% and the S&P 500's gain of 0.41% in that time. Market participants will be closely following the financial results of GameStop in its upcoming release. At the same time, our most recent consensus estimate is projecting a revenue of $900 million, reflecting a 16.54% fall from the equivalent quarter last year. Regarding the entire year, the Zacks Consensus Estimates forecast earnings of $0.01 per share and revenue of

In [32]:
articles['TSLA'][9]


'We are experiencing some temporary issues. The market data on this page is currently delayed. Please bear with us as we address this and restore your personalized lists. Earnings season is slowly winding down, with just a small chunk of S&P 500 companies yet to reveal their quarterly results. The period has remained positive, with the growth trend expected to continue in the coming periods. Stay up-to-date with all quarterly releases: See Zacks Earnings Calendar. And throughout the period, several popular companies, including Advanced Micro Devices AMD and Tesla TSLA – enjoyed a profitability boost, seeing their margins improve on a year-over-year basis. Let’s take a closer look at the releases. \xa0 Concerning headline figures in the release, AMD posted a 1.1% beat relative to the Zacks Consensus EPS estimate and reported sales 1.5% ahead of expectations, reflecting growth rates of 31% and 17%, respectively. Margin expansion aided the results in a big way, with a gross margin of 54% 

## 4.4. Summarise all Articles


In [33]:
def summarize(articles):
    summaries = []
    for article in articles:
        input_ids = tokenizer.encode(article, return_tensors='pt')
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

In [34]:
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'GME': ['Video game retailer is scheduled to report earnings later today.',
  'Your information may be shared with third parties.',
  'We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.',
  'Video game retailer is scheduled to report earnings later on Wednesday.',
  'Video game retailer is set to report earnings later today.',
  'Video game retailer is scheduled to report earnings later on Thursday.',
  'We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.'],
 'TSLA': ['We are aware of the issue and are working to resolve it.',
  'OpenAI to release fourth generation of ChatGPT model. Tesla, Perplexity and Google are also in the spotlight',
  'We are aware of the issue and are working to resolve it.',
  'Tesla has a debt-to-cash rat

In [35]:
summaries['BTC']


['On CryptosRus, George Tung explores U.S. plan to buy up to 5% of supply. Bitcoin scarcity, U.S. buying, and more',
 'We are aware of the issue and are working to resolve it.',
 'Digital currency ‘in blue sky territory,’ VanEck’s Sigel says. ‘We’re likely to make repeated all-time highs,’ he says',
 'Analysts have been predicting a price of $100,000 before Christmas.',
 "Coin could reach half of gold's market cap, van Eck says. Individual investors are flocking to Bitcoin ETFs",
 'We are aware of the issue and are working to resolve it.',
 'The Himalayan nation has been diversifying its cryptocurrency holdings. Bhutan is fifth among countries with Bitcoin holdings',
 'We are aware of the issue and are working to resolve it.',
 'Your information may be shared with third parties.',
 'Ethereum, Aave, and Dogecoin all have potential for upside.',
 'The world’s largest cryptocurrency has more than doubled this year. Ethereum, solana and dogecoin also rise on Wednesday']

## 5. Adding Sentiment Analysis


In [36]:
from transformers import pipeline
sentiment = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [37]:
sentiment(summaries['BTC'])


[{'label': 'POSITIVE', 'score': 0.9878144264221191},
 {'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'POSITIVE', 'score': 0.8589006066322327},
 {'label': 'NEGATIVE', 'score': 0.9923264980316162},
 {'label': 'NEGATIVE', 'score': 0.958739161491394},
 {'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'POSITIVE', 'score': 0.9852761030197144},
 {'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'NEGATIVE', 'score': 0.9903545379638672},
 {'label': 'POSITIVE', 'score': 0.8397566676139832},
 {'label': 'POSITIVE', 'score': 0.998205304145813}]

In [38]:
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}
scores

{'GME': [{'label': 'NEGATIVE', 'score': 0.9656205177307129},
  {'label': 'NEGATIVE', 'score': 0.9903545379638672},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9778563380241394},
  {'label': 'NEGATIVE', 'score': 0.974228024482727},
  {'label': 'NEGATIVE', 'score': 0.9768379926681519},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9979088306427002}],
 'TSLA': [{'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9967484474182129},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9946510195732117},
  {'label': 'NEGATIVE', 'score': 0.9903545379638672},
  {'label': 'POSITIVE', 'score': 0.9948047995567322},
  {'label': 'POSITIVE', 'score': 0.9634703993797302},
  {'label': '

In [39]:
print(summaries['GME'][3], scores['GME'][3]['label'], scores['GME'][3]['score'])


We are aware of the issue and are working to resolve it. POSITIVE 0.9979088306427002


In [None]:
scores['BTC'][0]['score']


## 6. Exporting Results to CSV


In [41]:
summaries


{'GME': ['Video game retailer is scheduled to report earnings later today.',
  'Your information may be shared with third parties.',
  'We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.',
  'Video game retailer is scheduled to report earnings later on Wednesday.',
  'Video game retailer is set to report earnings later today.',
  'Video game retailer is scheduled to report earnings later on Thursday.',
  'We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.'],
 'TSLA': ['We are aware of the issue and are working to resolve it.',
  'OpenAI to release fourth generation of ChatGPT model. Tesla, Perplexity and Google are also in the spotlight',
  'We are aware of the issue and are working to resolve it.',
  'Tesla has a debt-to-cash rat

In [42]:
scores


{'GME': [{'label': 'NEGATIVE', 'score': 0.9656205177307129},
  {'label': 'NEGATIVE', 'score': 0.9903545379638672},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9778563380241394},
  {'label': 'NEGATIVE', 'score': 0.974228024482727},
  {'label': 'NEGATIVE', 'score': 0.9768379926681519},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9979088306427002}],
 'TSLA': [{'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9967484474182129},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9946510195732117},
  {'label': 'NEGATIVE', 'score': 0.9903545379638672},
  {'label': 'POSITIVE', 'score': 0.9948047995567322},
  {'label': 'POSITIVE', 'score': 0.9634703993797302},
  {'label': '

In [43]:
cleaned_urls


{'GME': ['https://finance.yahoo.com/news/gamestop-gme-falls-more-steeply-224520779.html',
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BGME%26tbm%3Dnws%26pccc%3D1',
  'https://finance.yahoo.com/news/why-diamond-hill-long-short-122844695.html',
  'https://finance.yahoo.com/news/gamestop-gme-stock-drops-despite-214519430.html',
  'https://finance.yahoo.com/news/why-gamestop-gme-dipped-more-214520142.html',
  'https://finance.yahoo.com/news/heres-why-gamestop-gme-fell-224520124.html',
  'https://finance.yahoo.com/news/gamestop-gme-registers-bigger-fall-214521016.html',
  'https://finance.yahoo.com/news/gamestop-gme-stock-moves-0-215522235.html',
  'https://finance.yahoo.com/news/investors-heavily-search-gamestop-corp-130016370.html',
  'https://finance.yahoo.com/news/jim-cramer-says-gamestop-gme-134244137.html',
  'https://finance.yahoo.com/news/gamestop-gme-stock-moves-1-214524513.html'],
 'TSLA': ['https://finance.yahoo.com/video/hedge-funds-shorting-tesla-stock-145155395.html'

In [44]:
range(len(summaries['GME']))


range(0, 11)

In [52]:
summaries['GME'][6]


'Video game retailer is scheduled to report earnings later on Thursday.'

In [46]:
def create_output_array(summaries, scores, urls):
    output = []
    for ticker in monitored_tickers:
        for counter in range(len(summaries[ticker])):
            output_this = [
                ticker,
                summaries[ticker][counter],
                scores[ticker][counter]['label'],
                scores[ticker][counter]['score'],
                urls[ticker][counter]
            ]
            output.append(output_this)
    return output

In [47]:
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output

[['GME',
  'Video game retailer is scheduled to report earnings later today.',
  'NEGATIVE',
  0.9656205177307129,
  'https://finance.yahoo.com/news/gamestop-gme-falls-more-steeply-224520779.html'],
 ['GME',
  'Your information may be shared with third parties.',
  'NEGATIVE',
  0.9903545379638672,
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BGME%26tbm%3Dnws%26pccc%3D1'],
 ['GME',
  'We are aware of the issue and are working to resolve it.',
  'POSITIVE',
  0.9979088306427002,
  'https://finance.yahoo.com/news/why-diamond-hill-long-short-122844695.html'],
 ['GME',
  'We are aware of the issue and are working to resolve it.',
  'POSITIVE',
  0.9979088306427002,
  'https://finance.yahoo.com/news/gamestop-gme-stock-drops-despite-214519430.html'],
 ['GME',
  'Video game retailer is scheduled to report earnings later on Wednesday.',
  'NEGATIVE',
  0.9778563380241394,
  'https://finance.yahoo.com/news/why-gamestop-gme-dipped-more-214520142.html'],
 ['GME',
  'Video game retailer i

In [48]:
final_output.insert(0, ['Ticker', 'Summary', 'Label', 'Confidence', 'URL'])


In [49]:
final_output


[['Ticker', 'Summary', 'Label', 'Confidence', 'URL'],
 ['GME',
  'Video game retailer is scheduled to report earnings later today.',
  'NEGATIVE',
  0.9656205177307129,
  'https://finance.yahoo.com/news/gamestop-gme-falls-more-steeply-224520779.html'],
 ['GME',
  'Your information may be shared with third parties.',
  'NEGATIVE',
  0.9903545379638672,
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BGME%26tbm%3Dnws%26pccc%3D1'],
 ['GME',
  'We are aware of the issue and are working to resolve it.',
  'POSITIVE',
  0.9979088306427002,
  'https://finance.yahoo.com/news/why-diamond-hill-long-short-122844695.html'],
 ['GME',
  'We are aware of the issue and are working to resolve it.',
  'POSITIVE',
  0.9979088306427002,
  'https://finance.yahoo.com/news/gamestop-gme-stock-drops-despite-214519430.html'],
 ['GME',
  'Video game retailer is scheduled to report earnings later on Wednesday.',
  'NEGATIVE',
  0.9778563380241394,
  'https://finance.yahoo.com/news/why-gamestop-gme-dipped-mo

In [50]:
import csv
with open('assetsummaries.csv', mode='w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)