## 1. Install and Import Baseline Dependencies

In [1]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests
import re
from transformers import pipeline
import csv

## 2. Setup Model

In [2]:
# Checkpoint identifier passed to both from_pretrained() calls below, so the
# tokenizer and the model always come from the same checkpoint.
model_name = "human-centered-summarization/financial-summarization-pegasus"
# Tokenizer that converts article text to model input ids (and ids back to text).
tokenizer = PegasusTokenizer.from_pretrained(model_name)
# Pegasus conditional-generation model used to produce the summaries.
model = PegasusForConditionalGeneration.from_pretrained(model_name)

## 3. Set Up Pipeline

In [3]:
# monitored_tickers = ['ETH']

In [4]:
# Take input from the user for the monitored tickers.
# split() with no arguments splits on any run of whitespace, so the result is a
# list of ticker strings (an empty list when nothing is entered).
monitored_tickers = input("Enter A List Of Tickers To Monitor (separated by spaces): ").split()

Enter A List Of Tickers To Monitor (separated by spaces): JPM BAC NKE


## 4.1. Search for Stock News using Google and Yahoo Finance

In [5]:
print('Searching For Stock News For', monitored_tickers)
def search_for_stock_news_links(ticker, timeout=10):
    """Return the raw link targets found on a Google News search for a ticker.

    Args:
        ticker: Ticker symbol searched for together with "yahoo finance".
        timeout: Seconds to wait for the server before requests raises a
            timeout error instead of blocking indefinitely.

    Returns:
        A list containing the ``href`` value of every anchor tag on the
        results page that has one. The values are unfiltered and still need
        to be cleaned before they can be fetched.
    """
    response = requests.get(
        'https://www.google.com/search',
        # Let requests build the query string so that any special characters
        # in the ticker are URL-encoded rather than spliced in verbatim.
        params={'q': 'yahoo finance {}'.format(ticker), 'tbm': 'nws'},
        timeout=timeout,
    )
    soup = BeautifulSoup(response.text, 'html.parser')
    # href=True skips anchors that have no href attribute; indexing those
    # with link['href'] would raise KeyError.
    return [link['href'] for link in soup.find_all('a', href=True)]

Searching For Stock News For ['JPM', 'BAC', 'NKE']


In [6]:
# Map each monitored ticker to the unfiltered hrefs returned by its news search.
raw_urls = {ticker:search_for_stock_news_links(ticker) for ticker in monitored_tickers}

## 4.2. Strip Out Unwanted URLs

In [7]:
print('Cleaning URLs...')
# Substrings that mark a link as unwanted: any URL containing one of them is
# dropped by strip_unwanted_urls().
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']
def strip_unwanted_urls(urls, exclude_list):
    """Extract clean, de-duplicated article URLs from raw search-result hrefs.

    Args:
        urls: Iterable of raw href strings.
        exclude_list: Substrings that disqualify an href; an href containing
            any of them is dropped.

    Returns:
        A list of unique URLs in first-seen order. Each URL is the first
        ``http(s)://...`` run found in the href, cut at the first ``&`` so
        trailing query parameters are removed.
    """
    cleaned = []
    for url in urls:
        if 'https://' not in url or any(exc in url for exc in exclude_list):
            continue
        match = re.search(r'(https?://\S+)', url)
        # A bare "https://" with nothing after it contains the marker but has
        # no extractable URL; skip it rather than fail on a missing match.
        if match is None:
            continue
        cleaned.append(match.group(1).split('&')[0])
    # dict.fromkeys removes duplicates while keeping insertion order, so the
    # result is the same on every run (a set would not guarantee that).
    return list(dict.fromkeys(cleaned))

# Map each monitored ticker to its filtered, de-duplicated list of article URLs.
cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker] , exclude_list) for ticker in monitored_tickers}

Cleaning URLs...


## 4.3. Search and Scrape Cleaned URLs

In [8]:
print('Scraping News Links...')
def scrape_and_process(URLs, timeout=10, max_words=350):
    """Download each URL and return the leading text of its paragraphs.

    Args:
        URLs: Iterable of page URLs to fetch.
        timeout: Seconds to wait for each request before giving up.
        max_words: Maximum number of space-separated words kept per article.

    Returns:
        A list with one string per input URL, in the same order. Each string
        is the text of the page's ``<p>`` elements joined by spaces and cut to
        ``max_words`` words. A URL that cannot be fetched contributes an empty
        string, so the result stays index-aligned with ``URLs``.
    """
    ARTICLES = []
    for url in URLs:
        try:
            r = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # One unreachable page should not abort the whole run; keep a
            # placeholder so positions still line up with the URL list.
            print('Could not fetch {}: {}'.format(url, exc))
            ARTICLES.append('')
            continue
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = [p.text for p in soup.find_all('p')]
        words = ' '.join(paragraphs).split(' ')[:max_words]
        ARTICLES.append(' '.join(words))
    return ARTICLES
# Map each monitored ticker to the scraped article texts for its cleaned URLs.
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}

Scraping News Links...


## 4.4. Summarise all Articles

In [9]:
print('Summarizing Articles...')
def summarize(articles):
    """Generate one summary per article with the module-level model.

    Reads the module-level ``tokenizer`` and ``model`` objects.

    Args:
        articles: Iterable of article text strings.

    Returns:
        A list of summary strings, one per article, in input order.
    """
    summaries = []
    for article in articles:
        # truncation=True caps the encoded input at the tokenizer's maximum
        # length, so an unusually long article cannot overflow the model input.
        input_ids = tokenizer.encode(article, return_tensors="pt", truncation=True)
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        # generate() returns a batch; only one sequence was submitted.
        summaries.append(tokenizer.decode(output[0], skip_special_tokens=True))
    return summaries

# Map each monitored ticker to the list of summaries of its scraped articles.
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}

Summarizing Articles...


## 5. Adding Sentiment Analysis

In [10]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

# Use sentiment-specific names here. Rebinding the generic `model_name`,
# `model` and `tokenizer` globals would replace the Pegasus objects that
# summarize() reads, so re-running the summarization cell after this one
# would silently use the wrong model.
sentiment_model_name = "distilbert-base-uncased-finetuned-sst-2-english"
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
sentiment = pipeline('sentiment-analysis', model=sentiment_model, tokenizer=sentiment_tokenizer)

# Map each ticker to one {'label', 'score'} result per summary. Tickers with
# no summaries get an empty list without invoking the pipeline.
scores = {
    ticker: sentiment(summaries[ticker]) if summaries[ticker] else []
    for ticker in monitored_tickers
}

## 6. Exporting Results

In [11]:
import datetime

print('Exporting Results...')

def create_output_array(summaries, scores, urls):
    """Flatten the per-ticker results into rows ready for CSV export.

    Args:
        summaries: Dict mapping ticker -> list of summary strings.
        scores: Dict mapping ticker -> list of dicts with 'label' and 'score'
            keys, positionally matching ``summaries[ticker]``.
        urls: Dict mapping ticker -> list of source URLs, positionally
            matching ``summaries[ticker]``.

    Returns:
        A list of ``[ticker, summary, label, score, url]`` rows, grouped by
        ticker in the key order of ``summaries``.
    """
    output = []
    # Iterate over the argument's own keys rather than a module-level ticker
    # list, so the function depends only on what it is given.
    for ticker in summaries:
        for summary, score, url in zip(summaries[ticker], scores[ticker], urls[ticker]):
            output.append([ticker, summary, score['label'], score['score'], url])
    return output

# Build the data rows, then put the header row in front of them.
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output.insert(0, ['Ticker','Summary', 'Sentiment', 'Sentiment Score', 'URL'])

# Get the current date and time in the format DD_MM_YYYY_HH_MM_SS
current_date_time = datetime.datetime.now().strftime('%d_%m_%Y_%H_%M_%S')

# Construct the file name using the current date and time
file_name = f'summaries_{current_date_time}.csv'

# Open the file for writing. The encoding is set explicitly because summaries
# can contain non-ASCII characters that a platform-default codec may be unable
# to encode; newline='' lets the csv module control line endings itself.
with open(file_name, mode='w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)

Exporting Results...


In [12]:
print('Done!')

Done!


In [14]:
## Umang Laad
## 20100BTCSDSI07300

## Anaconda3 Prompt Commands

In [None]:
## cd "C:\Users\laad_"
## python Stock-and-Crypto-News-ScrapingSummarizationSentiment_1.1.py