In [1]:
# Import libraries
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import os
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# NLTK VADER for sentiment analysis
import nltk
nltk.downloader.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [2]:
finwiz_url = 'https://finviz.com/quote.ashx?t='
news_tables = {}
tickers = ['AMZN', 'TSLA', 'GOOG']
for ticker in tickers:
    url = finwiz_url + ticker
    req = Request(url=url,headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0'}) 
    response = urlopen(req)    
    # Read the contents of the file into 'html'
    html = BeautifulSoup(response)
    # Find 'news-table' in the Soup and load it into 'news_table'
    news_table = html.find(id='news-table')
    # Add the table to our dictionary
    news_tables[ticker] = news_table

In [5]:
# Read one single day of headlines for ‘AMZN’ 
amzn = news_tables['AMZN']
# Get all the table rows tagged in HTML with <tr> into ‘amzn_tr’
amzn_tr = amzn.findAll('tr')
for i, table_row in enumerate(amzn_tr):
  # Read the text of the element ‘a’ into ‘link_text’
  a_text = table_row.a.text
  # Read the text of the element ‘td’ into ‘data_text’
  td_text = table_row.td.text
  # Print the contents of ‘link_text’ and ‘data_text’ 
  print(a_text)
  print(td_text)
  # Exit after printing 4 rows of data
  if i == 3:
    break

Nvidia Revolutionizes the Cloud -- What Nvidia Investors Should Know
Mar-21-23 05:56PM
Best Stock to Buy: Amazon vs. Salesforce
04:44PM
Big Tech companies could see even more layoffs ahead: Analyst
04:24PM
Why Rivian, Ford, and General Motors Jumped Today
04:15PM


In [14]:
parsed_news = []
# Iterate through the news
for file_name, news_table in news_tables.items():
    # Iterate through all tr tags in 'news_table'
    for x in news_table.findAll('tr'):
        # read the text from each tr tag into text
        # get text from a only
        text = x.get_text() 
        # splite text in the td tag into a list 
        date_scrape = x.td.text.split()
        # if the length of 'date_scrape' is 1, load 'time' as the only element
        if len(date_scrape) == 1:
            time = date_scrape[0]
            
        # else load 'date' as the 1st element and 'time' as the second    
        else:
            date = date_scrape[0]
            time = date_scrape[1]
        # Extract the ticker from the file name, get the string up to the 1st '_'  
            ticker = file_name.split('_')[0]
        
        # Append ticker, date, time and headline as a list to the 'parsed_news' list
            parsed_news.append([ticker, date, time, text])
      

In [15]:
parsed_news[:5] # print first 5 rows of news

[['AMZN',
  'Mar-21-23',
  '05:56PM',
  'Mar-21-23 05:56PMNvidia Revolutionizes the Cloud -- What Nvidia Investors Should Know Motley Fool'],
 ['AMZN',
  'Mar-20-23',
  '07:50PM',
  'Mar-20-23 07:50PMAmazon to Cut 9,000 More Jobs After Earlier Layoffs The Wall Street Journal'],
 ['AMZN',
  'Mar-19-23',
  '08:00AM',
  'Mar-19-23 08:00AM3 Tech Stocks That Can Weather a Recession and Thrive on the Other Side Motley Fool'],
 ['AMZN',
  'Mar-18-23',
  '06:15PM',
  'Mar-18-23 06:15PMGraham Stephan Stock Portfolio: 10 Stock Picks Insider Monkey'],
 ['AMZN',
  'Mar-17-23',
  '06:49PM',
  'Mar-17-23 06:49PMWeekly Roundup TheStreet.com']]

In [17]:
parsed_news[:10]

[['AMZN',
  'Mar-21-23',
  '05:56PM',
  'Mar-21-23 05:56PMNvidia Revolutionizes the Cloud -- What Nvidia Investors Should Know Motley Fool'],
 ['AMZN',
  'Mar-20-23',
  '07:50PM',
  'Mar-20-23 07:50PMAmazon to Cut 9,000 More Jobs After Earlier Layoffs The Wall Street Journal'],
 ['AMZN',
  'Mar-19-23',
  '08:00AM',
  'Mar-19-23 08:00AM3 Tech Stocks That Can Weather a Recession and Thrive on the Other Side Motley Fool'],
 ['AMZN',
  'Mar-18-23',
  '06:15PM',
  'Mar-18-23 06:15PMGraham Stephan Stock Portfolio: 10 Stock Picks Insider Monkey'],
 ['AMZN',
  'Mar-17-23',
  '06:49PM',
  'Mar-17-23 06:49PMWeekly Roundup TheStreet.com'],
 ['AMZN',
  'Mar-16-23',
  '06:01PM',
  'Mar-16-23 06:01PMWhy Amazon Stock Jumped Today Motley Fool'],
 ['AMZN',
  'Mar-15-23',
  '03:29PM',
  'Mar-15-23 03:29PMHow Much Do Memberships Cost at Costco, Sams Club and Other Warehouse Stores? GOBankingRates'],
 ['AMZN',
  'Mar-14-23',
  '06:33PM',
  'Mar-14-23 06:33PMThe 10 Best Stocks to Buy in March 2023 Motley F

In [16]:
# Instantiate the sentiment intensity analyzer
vader = SentimentIntensityAnalyzer()
# Set column names
columns = ['ticker', 'date', 'time', 'headline']
# Convert the parsed_news list into a DataFrame called 'parsed_and_scored_news'
parsed_and_scored_news = pd.DataFrame(parsed_news, columns=columns)
# Iterate through the headlines and get the polarity scores using vader
scores = parsed_and_scored_news['headline'].apply(vader.polarity_scores).tolist()
# Convert the 'scores' list of dicts into a DataFrame
scores_df = pd.DataFrame(scores)
# Join the DataFrames of the news and the list of dicts
parsed_and_scored_news = parsed_and_scored_news.join(scores_df, rsuffix='_right')
# Convert the date column from string to datetime
parsed_and_scored_news['date'] = pd.to_datetime(parsed_and_scored_news.date).dt.date
parsed_and_scored_news.head()

Unnamed: 0,ticker,date,time,headline,neg,neu,pos,compound
0,AMZN,2023-03-21,05:56PM,Mar-21-23 05:56PMNvidia Revolutionizes the Clo...,0.195,0.805,0.0,-0.4404
1,AMZN,2023-03-20,07:50PM,"Mar-20-23 07:50PMAmazon to Cut 9,000 More Jobs...",0.139,0.861,0.0,-0.2732
2,AMZN,2023-03-19,08:00AM,Mar-19-23 08:00AM3 Tech Stocks That Can Weathe...,0.289,0.711,0.0,-0.6908
3,AMZN,2023-03-18,06:15PM,Mar-18-23 06:15PMGraham Stephan Stock Portfoli...,0.0,1.0,0.0,0.0
4,AMZN,2023-03-17,06:49PM,Mar-17-23 06:49PMWeekly Roundup TheStreet.com,0.0,1.0,0.0,0.0
