# 1. Install and Import Baseline Dependencies

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 28.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 7.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 13.6 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 53.0 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling P

In [2]:
!pip install SentencePiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting SentencePiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 29.7 MB/s 
[?25hInstalling collected packages: SentencePiece
Successfully installed SentencePiece-0.1.96


In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from bs4 import BeautifulSoup
import requests

# 2. Setup Summarization Model

In [4]:
# Hub identifier shared by the tokenizer and the model weights.
MODEL_NAME = "human-centered-summarization/financial-summarization-pegasus"

# Load the tokenizer and the seq2seq summarization model for that identifier.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

Downloading tokenizer_config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

# 3. Summarize a Single Article

In [21]:
#@title
# Download a single Yahoo Finance article and collect all of its <p> elements.
url = "https://finance.yahoo.com/news/meta-falls-sales-miss-estimates-201332508.html?.tsrc=fin-srch"
# requests applies no timeout by default; set one so a stalled server cannot hang the cell.
r = requests.get(url, timeout=30)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')


In [22]:
# Peek at the first paragraph to confirm the scrape returned article text.
paragraphs[0].text

'(Bloomberg) -- Meta Platforms Inc., the social media giant that includes Facebook and Instagram, reported its first-ever quarterly sales decline, citing advertisers’ shrinking budgets.'

In [23]:
# Collect the text of every paragraph, join it into one string, and keep only
# the first 400 space-separated words as the article to summarize.
# NOTE(review): the cap presumably keeps the input within the model's limit — confirm.
text = [p.text for p in paragraphs]
words = ' '.join(text).split(' ')
words = words[:400]
ARTICLE = ' '.join(words)

In [24]:
# Display the word-capped article text that is fed to the summarizer.
ARTICLE

'(Bloomberg) -- Meta Platforms Inc., the social media giant that includes Facebook and Instagram, reported its first-ever quarterly sales decline, citing advertisers’ shrinking budgets. Most Read from Bloomberg Rockstar Games Cleaned Up Its Frat-Boy Culture — and Grand Theft Auto, Too Fed Hikes 75 Basis Points Second Time, Signals Third Is Possible Biden Considers New Pause on Paying Back Student Loans, $10,000 Relief US Economy Shrinks for a Second Quarter, Fueling Recession Fears Fed Watchers Say Markets Got It All Wrong on Powell ‘Pivot’ Meta revenue slipped to $28.8 billion in the second quarter, missing the $28.9 billion average analyst estimate. The company’s forecast for the current period also fell short. The shares fell as much as 8.7%, the biggest drop in two months. The company’s advertising sales efforts are hitting a number of snags. Marketers are spending less due to various economic pressures, leaving Meta and its peers to compete for the smaller budgets. Apple Inc.’s pr

In [25]:
# Tokenize the article into a PyTorch tensor. truncation=True clips over-long
# text to the tokenizer's configured maximum input length instead of handing
# the model a sequence that is too long.
input_ids = tokenizer.encode(ARTICLE, return_tensors='pt', truncation=True)
# Generate with beam search (5 beams); max_length=55 caps the generated sequence.
output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
# Decode the first returned sequence back to text, dropping special tokens.
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [26]:
# Display the generated summary.
summary

'Shares fall as much as 8.7%, the biggest drop in two months. Revenue for the current quarter will be $26 billion to $28.5 billion'

# 4. Building a News and Sentiment Pipeline

In [11]:
# Tickers to track; each one is used as a search term and as a key in the
# per-ticker dictionaries built in the cells that follow.
monitored_tickers = ['AAPL', 'JPM', 'BTC']

## Searching for news on Google and Yahoo Finance

In [12]:
def search_for_stock_news_urls(ticker):
    """Return the raw href values found on a Google News search for a ticker.

    The search query is "yahoo finance <ticker>" restricted to the news tab
    (``tbm=nws``). Every anchor on the result page that carries an ``href`` is
    returned unfiltered, so the list also contains navigation links.

    Args:
        ticker: Search term substituted into the query string (e.g. 'AAPL').

    Returns:
        list[str]: The ``href`` attribute of each matching anchor, in page order.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(ticker)
    # requests applies no timeout by default; without one a stalled connection hangs forever.
    r = requests.get(search_url, timeout=30)
    soup = BeautifulSoup(r.text, 'html.parser')
    # href=True keeps only anchors that have an href attribute; indexing
    # link['href'] on an anchor without one would raise KeyError.
    atags = soup.find_all('a', href=True)
    return [link['href'] for link in atags]

In [13]:
# Run the news search once per monitored ticker: maps ticker -> list of raw hrefs.
raw_urls = {ticker:search_for_stock_news_urls(ticker) for ticker in monitored_tickers}

## Strip out unwanted URLs

In [14]:
import re

In [15]:
# Substrings that mark a link as unwanted: strip_unwanted_urls drops any URL
# containing one of them.
# NOTE(review): presumably these match Google's own navigation links — confirm.
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

In [16]:
def strip_unwanted_urls(urls, exclude_list):
    """Extract clean, de-duplicated article URLs from raw search-result hrefs.

    An href is kept only if it contains 'https://' and none of the substrings
    in ``exclude_list``. From each kept href, the first ``http(s)://...`` run
    is extracted and everything from the first '&' onward is dropped, which
    removes trailing query parameters appended to the link.

    Duplicates are removed while preserving first-seen order, so the result
    is the same on every run (a plain ``set`` yields an arbitrary string order
    that can change between interpreter sessions).

    Args:
        urls: Iterable of raw href strings.
        exclude_list: Iterable of substrings; any href containing one is skipped.

    Returns:
        list[str]: Unique cleaned URLs in order of first appearance.
    """
    cleaned = {}  # dict keys act as an insertion-ordered set
    for url in urls:
        if 'https://' not in url:
            continue
        if any(exclude_word in url for exclude_word in exclude_list):
            continue
        # 'https://' is present, so the pattern is guaranteed at least one match.
        res = re.findall(r'(https?://\S+)', url)[0].split('&')[0]
        cleaned[res] = None
    return list(cleaned)

In [17]:
# Filter and de-duplicate the raw hrefs per ticker: maps ticker -> list of article URLs.
cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}

## Search and Scrape Cleaned URLs

In [18]:
def scrape_and_process(URLs, max_words=350, timeout=30):
    """Download each URL and return its paragraph text, capped at ``max_words`` words.

    For every URL the page is fetched, the text of all ``<p>`` elements is
    joined with spaces, and only the first ``max_words`` space-separated words
    are kept. The result has exactly one entry per input URL, in the same
    order, so it can be matched back to the URL list by index.

    Args:
        URLs: Iterable of page URLs to fetch.
        max_words: Maximum number of space-separated words kept per article.
        timeout: Seconds passed to ``requests.get``; requests applies no
            timeout by default, so a stalled server would otherwise hang the loop.

    Returns:
        list[str]: One truncated article string per URL.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
    """
    ARTICLES = []
    for url in URLs:
        r = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(r.text, 'html.parser')
        text = [paragraph.text for paragraph in soup.find_all('p')]
        words = ' '.join(text).split(' ')[:max_words]
        ARTICLES.append(' '.join(words))
    return ARTICLES

In [19]:
# Scrape every cleaned URL per ticker: maps ticker -> list of truncated article texts.
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}

In [20]:
# Check how many articles were scraped for JPM.
len(articles['JPM'])

10

In [27]:
def summarize(articles):
    """Summarize each article with the module-level ``tokenizer`` and ``model``.

    Args:
        articles: Iterable of article strings.

    Returns:
        list[str]: One decoded summary per article, in input order.
    """
    summaries = []
    for article in articles:
        # truncation=True clips over-long text to the tokenizer's configured
        # maximum input length instead of passing an over-long sequence on.
        input_ids = tokenizer.encode(article, return_tensors='pt', truncation=True)
        # Beam search with 5 beams; max_length=55 caps the generated sequence.
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        # Decode the first returned sequence, dropping special tokens.
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries


In [28]:
# Summarize every scraped article per ticker: maps ticker -> list of summary strings.
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'AAPL': ['Starbucks to buy back some of its own stock.',
  'All images are copyrighted.',
  'Analysts expect Apple to report slower growth than a year ago. Apple warned in April of a $4 billion to $8 billion revenue hit',
  'The company is the world’s most valuable, with a market value of over $800 billion.',
  'All photographs subject to copyright.',
  'Analysts see smallest revenue increase since 2020 on China supply issues. Apple set for biggest monthly gain in almost two years',
  'Apple says it has contributed more than $1.3 billion to projects.',
  'Second-quarter profit jumped 56% on resilient demand. Says capex for next year to be adjusted significantly',
  'Kim Woo-Pyeong worked for Apple since 2014 after working for Texas Instruments.',
  'Analysts expect iPhone sales to have dropped 2.3% this quarter.'],
 'BTC': ['We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.',
  'Tesla sold $936 million of the cryptoc

In [29]:
# Inspect the summaries generated for JPM.
summaries['JPM']

['JPMorgan says stock dilution could drive 30% value drop. Both stocks are down about 73% and 51% this year',
 'At least a mild recession is already in the price, JPMorgan says. S&P 500 has fully priced in recession, Kolanovic says',
 'Inflation hit 9.1% in 12 months to June. JPMorgan’s Q2 earnings miss disappoints, but CEO backs away from ‘hurricane’',
 'JPMorgan’s Michele says clients are returning to bonds. Pacific Investment’s Browne sees negative growth over next year',
 'Economic advisor Ustenko has asked banks to cut ties. JPMorgan, HSBC and Citi have said they have cut back in Russia',
 'Education tech company Nerdy among JPMorgan’s top tech picks.',
 'Have you tried going to Newsround?',
 "Charles Lim to be bank's global head for quantum communications. JPMorgan has been exploring possible uses for the technology",
 'FT Global offers comprehensive industry coverage, in-depth industry analysis.',
 'Bank is more optimistic about the American consumer. Main Street banks are more 

# 5. Adding Sentiment Analysis

In [30]:
from transformers import pipeline
# Name the checkpoint and revision explicitly instead of relying on the task
# default. These are the values the library reported when it picked the default,
# so results stay the same while no longer depending on what the default is.
# NOTE(review): this is a general-purpose SST-2 sentiment model, not a
# finance-specific one — confirm it suits these summaries.
sentiment = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    revision='af0f99b',
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [31]:
# Try the sentiment pipeline on the JPM summaries; it returns one
# {'label', 'score'} dict per summary.
sentiment(summaries['JPM'])

[{'label': 'NEGATIVE', 'score': 0.9995884299278259},
 {'label': 'NEGATIVE', 'score': 0.9970679879188538},
 {'label': 'NEGATIVE', 'score': 0.9846699237823486},
 {'label': 'NEGATIVE', 'score': 0.9984169006347656},
 {'label': 'NEGATIVE', 'score': 0.9021440148353577},
 {'label': 'POSITIVE', 'score': 0.9971441626548767},
 {'label': 'NEGATIVE', 'score': 0.9915708303451538},
 {'label': 'POSITIVE', 'score': 0.9980789422988892},
 {'label': 'POSITIVE', 'score': 0.9989489912986755},
 {'label': 'NEGATIVE', 'score': 0.9719294905662537}]

In [32]:
# Score every summary per ticker: maps ticker -> list of {'label', 'score'} dicts,
# index-aligned with summaries[ticker].
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}
scores

{'AAPL': [{'label': 'NEGATIVE', 'score': 0.9938549399375916},
  {'label': 'NEGATIVE', 'score': 0.9880996346473694},
  {'label': 'NEGATIVE', 'score': 0.9993987083435059},
  {'label': 'POSITIVE', 'score': 0.9997510313987732},
  {'label': 'NEGATIVE', 'score': 0.9676677584648132},
  {'label': 'NEGATIVE', 'score': 0.9950457811355591},
  {'label': 'POSITIVE', 'score': 0.9944818019866943},
  {'label': 'NEGATIVE', 'score': 0.5056257247924805},
  {'label': 'POSITIVE', 'score': 0.9815096855163574},
  {'label': 'NEGATIVE', 'score': 0.9994269609451294}],
 'BTC': [{'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9948022365570068},
  {'label': 'NEGATIVE', 'score': 0.9995356798171997},
  {'label': 'POSITIVE', 'score': 0.9746564626693726},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9977815747261047},
  {'label': 'NEGATIVE', 'score': 0.9945539236068726},
  {'label': 

In [42]:
# Spot-check one AAPL entry: its summary text alongside the sentiment label and score.
print(summaries['AAPL'][9], scores['AAPL'][9]['label'], scores['AAPL'][9]['score'])

Analysts expect iPhone sales to have dropped 2.3% this quarter. NEGATIVE 0.9994269609451294


# 6. Exporting Results to CSV

In [43]:
def create_output_array(summaries, scores, urls):
    """Flatten per-ticker summaries, sentiment scores and URLs into rows.

    The tickers are taken from the keys of ``summaries`` (in dict order)
    rather than from a notebook-level global, so the function depends only on
    its arguments. Entries are matched by position: the i-th summary of a
    ticker is paired with the i-th score and the i-th URL of that ticker.

    Args:
        summaries: dict mapping ticker -> list of summary strings.
        scores: dict mapping ticker -> list of dicts with 'label' and 'score' keys.
        urls: dict mapping ticker -> list of URL strings.

    Returns:
        list[list]: Rows of ``[ticker, summary, label, score, url]``.

    Raises:
        KeyError: If a ticker in ``summaries`` is missing from ``scores`` or ``urls``.
        IndexError: If a ticker has fewer scores or URLs than summaries.
    """
    output = []
    for ticker, ticker_summaries in summaries.items():
        ticker_scores = scores[ticker]
        ticker_urls = urls[ticker]
        # Index explicitly (rather than zip) so a length mismatch raises
        # instead of silently dropping rows.
        for counter, summary in enumerate(ticker_summaries):
            output.append([
                ticker,
                summary,
                ticker_scores[counter]['label'],
                ticker_scores[counter]['score'],
                ticker_urls[counter],
            ])
    return output

In [44]:
# Build the flat row list ([ticker, summary, label, score, url]) for export.
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output

[['AAPL',
  'Starbucks to buy back some of its own stock.',
  'NEGATIVE',
  0.9938549399375916,
  'https://www.cnbc.com/2022/07/26/bank-of-america-cuts-apple-price-target-citing-foreign-exchange-headwinds.html'],
 ['AAPL',
  'All images are copyrighted.',
  'NEGATIVE',
  0.9880996346473694,
  'https://www.investors.com/news/technology/aapl-stock-apple-earnings-fiscal-third-quarter-2022/'],
 ['AAPL',
  'Analysts expect Apple to report slower growth than a year ago. Apple warned in April of a $4 billion to $8 billion revenue hit',
  'NEGATIVE',
  0.9993987083435059,
  'https://www.cnbc.com/2022/07/26/apple-q3-2022-earnings-preview-macroeconomic-concerns-dominate.html'],
 ['AAPL',
  'The company is the world’s most valuable, with a market value of over $800 billion.',
  'POSITIVE',
  0.9997510313987732,
  'https://www.usatoday.com/story/tech/2022/07/27/who-owns-apple-biggest-shareholders/10110250002/'],
 ['AAPL',
  'All photographs subject to copyright.',
  'NEGATIVE',
  0.967667758464813

In [45]:
# Prepend the CSV header row. The guard keeps this cell idempotent: re-running
# it does not insert a second header.
header = ['Ticker', 'Summary', 'Label', 'Confidence', 'URL']
if not final_output or final_output[0] != header:
    final_output.insert(0, header)

In [47]:
import csv
# Write all rows (header first) to SentimentSummaries.csv.
# newline='' leaves line endings to the csv module. encoding='utf-8' is set
# explicitly because the summaries contain non-ASCII characters (e.g. curly
# quotes) and the platform default encoding may not be able to represent them.
with open('SentimentSummaries.csv', mode='w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)