In [2]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

### Set Up the Summarization Model

In [3]:
# Hugging Face Hub identifier of the Pegasus checkpoint used for summarization.
model_name = "human-centered-summarization/financial-summarization-pegasus"
# The tokenizer and the model are both loaded from the same checkpoint name.
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

### Summarization of Articles

In [4]:
# Download one Yahoo Finance article and collect its <p> elements.
url = "https://finance.yahoo.com/news/jpmorgan-says-stocks-suffer-150-161230891.html"
# Without a timeout requests waits indefinitely, which would hang the kernel
# if the server never answers.
r = requests.get(url, timeout=10)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')

In [5]:
# Flatten the scraped paragraphs into one string, keeping only the first 400
# space-separated tokens.
text = [node.text for node in paragraphs]
words = ' '.join(text).split(' ')
words = words[:400]
ARTICLE = ' '.join(words)

In [6]:
ARTICLE

'(Bloomberg) -- The relentless rally in equities faces a fresh threat over the next few weeks with the world’s biggest money managers set to unload as much as $150 billion of stocks. Most Read from Bloomberg Social Security Benefits Targeted for Cuts by House Conservatives A Goldman Partner’s Sexually Explicit Video Led to Millions in Settlement Stock Rally Is Deepening Beyond AI-Fueled Craze: Markets Wrap US Submarine Damaged in South China Sea Won’t Return Until 2026 as Shipyards Are Clogged Biggest Losers of AI Boom Are Knowledge Workers, McKinsey Says JPMorgan Chase & Co. projects real-money portfolios, including those of sovereign wealth and pension funds, will tilt back in favor of bonds to meet allocation targets, in the largest rebalancing flows to the asset class since the fourth quarter of 2021. The periodic rejigging could knock off as much as 5% from the price of global stocks, according to estimates by JPMorgan strategist Nikolaos Panigirtzoglou. Pension funds and other in

In [7]:
# Encode the article, generate a beam-search summary, and decode it back to text.
# truncation=True clips the token sequence to the tokenizer's configured maximum
# length so a long article cannot exceed what the model accepts.
input_ids = tokenizer.encode(ARTICLE, return_tensors='pt', truncation=True)
output = model.generate(input_ids, max_length = 55, num_beams = 5, early_stopping=True)
summary = tokenizer.decode(output[0], skip_special_tokens = True)

In [8]:
summary

'Japan’s GPIF could sell $37 billion of stocks to meet targets. SNB could sell $11 billion of equities to meet targets'

### Building a News and Sentiment Pipeline

In [9]:
# Symbols the pipeline searches, scrapes, summarizes and scores; the per-ticker
# dictionaries built in later cells are keyed by these strings.
monitored_tickers = ['GME', 'TSLA', 'BTC']

#### 1) Searching for Stock News using Google and Yahoo Finance

In [10]:
def search_for_stock_news_urls(ticker, timeout=10):
    """Return the href of every link on a Google News search for the ticker.

    The query sent to Google is "yahoo finance <ticker>" with the news tab
    selected (``tbm=nws``).

    Args:
        ticker: Symbol inserted into the search query.
        timeout: Seconds to wait for the response before ``requests`` raises.

    Returns:
        list[str]: Raw ``href`` values in page order. They are neither filtered
        nor de-duplicated here.
    """
    search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(ticker)
    r = requests.get(search_url, timeout=timeout)
    soup = BeautifulSoup(r.text, 'html.parser')
    # href=True restricts the match to anchors that actually carry an href;
    # indexing link['href'] on an anchor without one raises KeyError.
    return [link['href'] for link in soup.find_all('a', href=True)]

In [11]:
# Map every monitored ticker to the raw hrefs scraped from its news search.
raw_urls = {
    symbol: search_for_stock_news_urls(symbol)
    for symbol in monitored_tickers
}

In [12]:
raw_urls

{'GME': ['/?sa=X&ved=0ahUKEwjmjMm7-8r_AhUHjFYBHeq0BwQQOwgC',
  '/search?q=yahoo+finance+GME&tbm=nws&ie=UTF-8&gbv=1&sei=eP-NZObwNYeY2roP6umeIA',
  '/search?q=yahoo+finance+GME&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwjmjMm7-8r_AhUHjFYBHeq0BwQQ_AUIBSgA',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwjmjMm7-8r_AhUHjFYBHeq0BwQQ_AUIBygC',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwjmjMm7-8r_AhUHjFYBHeq0BwQQ_AUICCgD',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwjmjMm7-8r_AhUHjFYBHeq0BwQQ_AUICSgE',
  'https://maps.google.com/maps?q=yahoo+finance+GME&um=1&ie=UTF-8&sa=X&ved=0ahUKEwjmjMm7-8r_AhUHjFYBHeq0BwQQ_AUICigF',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwjmjMm7-8r_AhUHjFYBHeq0BwQQ_AUICygG',
  '/advanced_search',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:h&sa=X&ved=0ahUKEwjmjMm7-8r_AhUHjFYBHeq0BwQQpwUIDQ',
  '/search?q=yahoo+finance+GME&ie=UT

#### 2) Remove Unwanted URLs

In [13]:
import re

In [14]:
# Substrings that disqualify a URL: any href containing one of these words is
# dropped by strip_unwanted_urls.
exclude_list = ['maps', 'policies', 'support', 'accounts', 'preferences']

In [15]:
def strip_unwanted_urls(urls, exclude_list):
    """Extract clean, unique article URLs from raw search-result hrefs.

    An href is kept only if it contains ``https://`` and none of the words in
    ``exclude_list``. The first ``http(s)://...`` run inside it is extracted and
    everything from the first ``&`` onwards is cut off.

    Args:
        urls: Iterable of raw href strings.
        exclude_list: Substrings that disqualify an href.

    Returns:
        list[str]: Unique URLs in first-seen order, so repeated runs on the
        same input give the same ordering.
    """
    unique = {}
    for url in urls:
        if 'https://' not in url:
            continue
        if any(exclude_word in url for exclude_word in exclude_list):
            continue
        match = re.search(r'https?://\S+', url)
        if match is None:
            # e.g. a bare "https://" with nothing after it: nothing to extract.
            continue
        # A dict keeps insertion order, giving a deterministic de-duplication
        # (a set would return the URLs in arbitrary order).
        unique.setdefault(match.group(0).split('&')[0], None)
    return list(unique)

In [16]:
# Filter each ticker's raw hrefs down to unique article URLs.
cleaned_urls = {
    symbol: strip_unwanted_urls(raw_urls[symbol], exclude_list)
    for symbol in monitored_tickers
}

In [17]:
cleaned_urls

{'GME': ['https://finance.yahoo.com/news/gamestop-discloses-first-quarter-2023-201500165.html',
  'https://finance.yahoo.com/news/gen-z-roi-mba-more-181935654.html',
  'https://finance.yahoo.com/news/midday-movers-affirm-stitch-fix-141723392.html',
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BGME%26tbm%3Dnws%26pccc%3D1',
  'https://finance.yahoo.com/news/12-best-short-squeeze-stocks-120213885.html',
  'https://finance.yahoo.com/news/invesco-p-midcap-400-equal-102008681.html',
  'https://ca.finance.yahoo.com/news/gamestop-stock-tanks-after-company-fires-ceo-123910339.html',
  'https://finance.yahoo.com/news/debt-ceiling-done-fed-goes-quiet-what-to-watch-this-week-130054402.html',
  'https://finance.yahoo.com/news/an-open-letter-to-gamestop-executive-chairman-ryan-cohen-morning-brief-100037724.html',
  'https://finance.yahoo.com/news/stocks-rise-sp-500-enters-new-bull-market-stock-market-news-today-200316428.html',
  'https://www.thestreet.com/retail/meme-stock-gamestop-just-di

#### 3) Scrape All of the Cleaned URLs Above

In [18]:
def scrape_and_process(URLs, max_words=350, timeout=10):
    """Download each URL and return the leading words of its paragraph text.

    Args:
        URLs: Iterable of page URLs to fetch.
        max_words: Maximum number of space-separated tokens kept per page.
            Defaults to 350, the value that used to be hard-coded.
        timeout: Seconds to wait for each response before ``requests`` raises.

    Returns:
        list[str]: One string per URL, in the same order as ``URLs``.
    """
    ARTICLES = []
    for url in URLs:
        r = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(r.text, 'html.parser')
        text = [paragraph.text for paragraph in soup.find_all('p')]
        # Split on single spaces only (not on all whitespace), so newlines
        # inside a paragraph stay attached to their neighbouring token.
        words = ' '.join(text).split(' ')[:max_words]
        ARTICLES.append(' '.join(words))
    return ARTICLES

In [19]:
# Scrape the article text behind every cleaned URL, grouped by ticker.
articles = {
    symbol: scrape_and_process(cleaned_urls[symbol])
    for symbol in monitored_tickers
}

In [20]:
articles

{'GME': ['GRAPEVINE, Texas, June 07, 2023--(BUSINESS WIRE)--GameStop Corp. (NYSE: GME) ("GameStop" or the "Company") today released financial results for the first quarter ended April 29, 2023. The Company’s condensed and consolidated financial statements, including GAAP and non-GAAP results, are below. The Company’s Form 10-Q and supplemental information can be found at https://investor.gamestop.com. FIRST QUARTER OVERVIEW Net sales were $1.237 billion for the period, compared to $1.378 billion in the prior year\'s first quarter. Selling, general and administrative ("SG&A") expenses were $345.7 million, or 27.9% of net sales for the period, compared to $452.2 million, or 32.8% of net sales, in the prior year\'s first quarter. Net loss was $50.5 million for the period, compared to a net loss of $157.9 million for the prior year’s first quarter. Transition costs related to European restructuring efforts were $14.5 million for the period. For the second quarter, the Company will continue

In [21]:
def summarize(articles, max_length=55, num_beams=5):
    """Summarize each article with the module-level Pegasus tokenizer and model.

    Args:
        articles: Iterable of article strings.
        max_length: ``max_length`` passed to ``model.generate`` (default 55).
        num_beams: Beam width passed to ``model.generate`` (default 5).

    Returns:
        list[str]: One decoded summary per article, in input order.
    """
    summaries = []
    for ARTICLE in articles:
        # truncation=True clips the input to the tokenizer's configured maximum
        # length, so an unusually long article cannot overflow the model input.
        input_ids = tokenizer.encode(ARTICLE, return_tensors='pt', truncation=True)
        output = model.generate(
            input_ids,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
        )
        summaries.append(tokenizer.decode(output[0], skip_special_tokens=True))
    return summaries

In [22]:
# Produce one summary per scraped article, grouped by ticker.
summaries = {
    symbol: summarize(articles[symbol])
    for symbol in monitored_tickers
}

In [23]:
summaries

{'GME': ['Net loss was $50.5 million for the period, compared to a loss of $157.9 million for the prior year’s first quarter.',
  'Report tries to nail down what candidates want from an MBA. More than half of Gen Z candidates applying to MBA programs.',
  'Affirm, Stitch Fix, Campbell Soup report better-than-expected results.',
  'All images are copyrighted.',
  'We are aware of the issue and are working to resolve it.',
  'EWMC seeks to replicate the performance of the S&P MidCap 400 Equal Weight Index.',
  'We are aware of the issue and are working to resolve it.',
  'Apple’s Worldwide Developers Conference on Monday. Earnings will continue to wind down with results from meme stock favorite GameStop',
  'Ryan, I’m writing to express frustration with your management team and business.',
  "Investors digested fresh economic data ahead of next week's Fed meeting.",
  '.'],
 'TSLA': ['We are aware of the issue and are working to resolve it.',
  'All images are copyrighted.',
  'Shares of

In [24]:
summaries['TSLA']

['We are aware of the issue and are working to resolve it.',
 'All images are copyrighted.',
 'Shares of smaller EV rivals have rallied in recent days. Tesla has seen its market cap move back over $800 billion',
 'We are aware of the issue and are working to resolve it.',
 'We are aware of the issue and are working to resolve it.',
 'Tesla had sought to overturn ban on direct car sales. Claim was part of electric carmaker’s strategy to overturn bans',
 '‘It’s going to be a boom,’ O’Leary says of new truck. Model could add 10% to 12% to Tesla’s margins',
 'EV maker has added $194 billion to market valuation in 11 days. GM’s decision to adapt Superchargers to Tesla’s network',
 'Raw demand to travel has been strong for Memorial Day weekend and now into July.',
 'Retail traders are dumping EV stocks, Vanda says. Musk has long been vocal about the dangers of AI',
 'All versions of the Model 3 now qualify for full federal EV tax credit. Tesla Model 3 RWD version now costs $32,740 with full 

### Adding Sentiment Analysis Pipeline

In [25]:
from transformers import pipeline
# Pin the checkpoint and revision that pipeline() used to select implicitly, so
# results do not change if the library's default sentiment model changes.
sentiment = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    revision='af0f99b',
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
    PyTorch 2.0.1+cu118 with CUDA 1108 (you have 2.0.1+cpu)
    Python  3.9.13 (you have 3.9.13)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


In [26]:
!pip install xformers





In [27]:
# Run the sentiment pipeline over each ticker's list of summaries.
scores = {
    symbol: sentiment(summaries[symbol])
    for symbol in monitored_tickers
}

In [28]:
scores

{'GME': [{'label': 'NEGATIVE', 'score': 0.9993355870246887},
  {'label': 'NEGATIVE', 'score': 0.9983149766921997},
  {'label': 'POSITIVE', 'score': 0.9971166849136353},
  {'label': 'NEGATIVE', 'score': 0.9880996346473694},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9927782416343689},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.999107301235199},
  {'label': 'NEGATIVE', 'score': 0.9974145889282227},
  {'label': 'POSITIVE', 'score': 0.9027295708656311},
  {'label': 'POSITIVE', 'score': 0.9668781757354736}],
 'TSLA': [{'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9880996346473694},
  {'label': 'NEGATIVE', 'score': 0.6760101914405823},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9802840352058411},
  {'label': 'POSITIVE', 'score': 0.9913261532783508},
  {'label': '

In [29]:
# Spot-check: show the first TSLA summary next to its sentiment label and score.
first_tsla_score = scores['TSLA'][0]
print(summaries['TSLA'][0], first_tsla_score['label'], first_tsla_score['score'])

We are aware of the issue and are working to resolve it. POSITIVE 0.9979088306427002


### Exporting Results to CSV

In [30]:
summaries

{'GME': ['Net loss was $50.5 million for the period, compared to a loss of $157.9 million for the prior year’s first quarter.',
  'Report tries to nail down what candidates want from an MBA. More than half of Gen Z candidates applying to MBA programs.',
  'Affirm, Stitch Fix, Campbell Soup report better-than-expected results.',
  'All images are copyrighted.',
  'We are aware of the issue and are working to resolve it.',
  'EWMC seeks to replicate the performance of the S&P MidCap 400 Equal Weight Index.',
  'We are aware of the issue and are working to resolve it.',
  'Apple’s Worldwide Developers Conference on Monday. Earnings will continue to wind down with results from meme stock favorite GameStop',
  'Ryan, I’m writing to express frustration with your management team and business.',
  "Investors digested fresh economic data ahead of next week's Fed meeting.",
  '.'],
 'TSLA': ['We are aware of the issue and are working to resolve it.',
  'All images are copyrighted.',
  'Shares of

In [31]:
scores

{'GME': [{'label': 'NEGATIVE', 'score': 0.9993355870246887},
  {'label': 'NEGATIVE', 'score': 0.9983149766921997},
  {'label': 'POSITIVE', 'score': 0.9971166849136353},
  {'label': 'NEGATIVE', 'score': 0.9880996346473694},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9927782416343689},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.999107301235199},
  {'label': 'NEGATIVE', 'score': 0.9974145889282227},
  {'label': 'POSITIVE', 'score': 0.9027295708656311},
  {'label': 'POSITIVE', 'score': 0.9668781757354736}],
 'TSLA': [{'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9880996346473694},
  {'label': 'NEGATIVE', 'score': 0.6760101914405823},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9802840352058411},
  {'label': 'POSITIVE', 'score': 0.9913261532783508},
  {'label': '

In [33]:
cleaned_urls

{'GME': ['https://finance.yahoo.com/news/gamestop-discloses-first-quarter-2023-201500165.html',
  'https://finance.yahoo.com/news/gen-z-roi-mba-more-181935654.html',
  'https://finance.yahoo.com/news/midday-movers-affirm-stitch-fix-141723392.html',
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BGME%26tbm%3Dnws%26pccc%3D1',
  'https://finance.yahoo.com/news/12-best-short-squeeze-stocks-120213885.html',
  'https://finance.yahoo.com/news/invesco-p-midcap-400-equal-102008681.html',
  'https://ca.finance.yahoo.com/news/gamestop-stock-tanks-after-company-fires-ceo-123910339.html',
  'https://finance.yahoo.com/news/debt-ceiling-done-fed-goes-quiet-what-to-watch-this-week-130054402.html',
  'https://finance.yahoo.com/news/an-open-letter-to-gamestop-executive-chairman-ryan-cohen-morning-brief-100037724.html',
  'https://finance.yahoo.com/news/stocks-rise-sp-500-enters-new-bull-market-stock-market-news-today-200316428.html',
  'https://www.thestreet.com/retail/meme-stock-gamestop-just-di

In [42]:
def create_output_array(summaries, scores, urls, tickers=None):
    """Flatten per-ticker summaries, sentiment scores and URLs into rows.

    Args:
        summaries: dict mapping ticker -> list of summary strings.
        scores: dict mapping ticker -> list of dicts with 'label' and 'score'
            keys, index-aligned with ``summaries[ticker]``.
        urls: dict mapping ticker -> list of URLs, index-aligned with
            ``summaries[ticker]``.
        tickers: Optional iterable of tickers to include, in output order.
            Defaults to the keys of ``summaries`` so the function depends only
            on its arguments rather than on a notebook-level variable.

    Returns:
        list[list]: Rows of ``[ticker, summary, label, score, url]``.

    Raises:
        KeyError: If a ticker is missing from one of the dictionaries.
        IndexError: If a ticker has fewer scores or URLs than summaries.
    """
    if tickers is None:
        tickers = list(summaries)

    output = []
    for ticker in tickers:
        ticker_scores = scores[ticker]
        ticker_urls = urls[ticker]
        # Index explicitly (instead of zip) so a length mismatch raises rather
        # than silently dropping rows.
        for counter, summary_text in enumerate(summaries[ticker]):
            score = ticker_scores[counter]
            output.append([
                ticker,
                summary_text,
                score['label'],
                score['score'],
                ticker_urls[counter],
            ])
    return output

In [43]:
# Build the flat row list from the per-ticker summaries, scores and cleaned URLs,
# then display it as the cell output.
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output

[['GME',
  'Net loss was $50.5 million for the period, compared to a loss of $157.9 million for the prior year’s first quarter.',
  'NEGATIVE',
  0.9993355870246887,
  'https://finance.yahoo.com/news/gamestop-discloses-first-quarter-2023-201500165.html'],
 ['GME',
  'Report tries to nail down what candidates want from an MBA. More than half of Gen Z candidates applying to MBA programs.',
  'NEGATIVE',
  0.9983149766921997,
  'https://finance.yahoo.com/news/gen-z-roi-mba-more-181935654.html'],
 ['GME',
  'Affirm, Stitch Fix, Campbell Soup report better-than-expected results.',
  'POSITIVE',
  0.9971166849136353,
  'https://finance.yahoo.com/news/midday-movers-affirm-stitch-fix-141723392.html'],
 ['GME',
  'All images are copyrighted.',
  'NEGATIVE',
  0.9880996346473694,
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BGME%26tbm%3Dnws%26pccc%3D1'],
 ['GME',
  'We are aware of the issue and are working to resolve it.',
  'POSITIVE',
  0.9979088306427002,
  'https://finance.yahoo.co

In [44]:
# Prepend the CSV header row only if it is not already there, so re-running
# this cell does not stack duplicate header rows.
# NOTE(review): 'Confindence' looks like a typo for 'Confidence'; it is kept
# unchanged because it is the emitted column name - confirm before renaming.
csv_header = ['Ticker', 'Summary', 'Label', 'Confindence', 'URL']
if not final_output or final_output[0] != csv_header:
    final_output.insert(0, csv_header)

In [45]:
final_output

[['Ticker', 'Summary', 'Label', 'Confindence', 'URL'],
 ['GME',
  'Net loss was $50.5 million for the period, compared to a loss of $157.9 million for the prior year’s first quarter.',
  'NEGATIVE',
  0.9993355870246887,
  'https://finance.yahoo.com/news/gamestop-discloses-first-quarter-2023-201500165.html'],
 ['GME',
  'Report tries to nail down what candidates want from an MBA. More than half of Gen Z candidates applying to MBA programs.',
  'NEGATIVE',
  0.9983149766921997,
  'https://finance.yahoo.com/news/gen-z-roi-mba-more-181935654.html'],
 ['GME',
  'Affirm, Stitch Fix, Campbell Soup report better-than-expected results.',
  'POSITIVE',
  0.9971166849136353,
  'https://finance.yahoo.com/news/midday-movers-affirm-stitch-fix-141723392.html'],
 ['GME',
  'All images are copyrighted.',
  'NEGATIVE',
  0.9880996346473694,
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BGME%26tbm%3Dnws%26pccc%3D1'],
 ['GME',
  'We are aware of the issue and are working to resolve it.',
  'POSIT

In [49]:
import csv
# Write all rows to disk. newline="" lets the csv module control line endings.
# The explicit UTF-8 encoding keeps non-ASCII characters in the summaries
# (e.g. curly apostrophes) from depending on the platform's default encoding.
with open('assetsummaries.csv', "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows(final_output)