In [3]:
import concurrent.futures
import pandas as pd
import numpy as np
import yfinance as yf
import requests as r
import io
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import plotly.express as px
import time
import random
# NLTK VADER for sentiment analysis
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [9]:
data_link = 'https://www.dropbox.com/s/24xp45vce82k4qs/top50.csv?raw=1'

response = r.get(data_link)
content = io.StringIO(response.text)
data = pd.read_csv(content)

stocks = data['Symbol']
#sector = data['Sector']

tickers = stocks

In [10]:
# Helper function to fetch Finviz news tables
def fetch_finviz(ticker):
    time.sleep(random.uniform(1.0, 4.0))
    finwiz_url = 'https://finviz.com/quote.ashx?t='
    url = finwiz_url + ticker
    success = False
    attempts = 0
    while not success and attempts < 5:  # Retry limit of 5, adjust as needed
        try:
            time.sleep(random.uniform(1.0, 4.0))  # Increase sleep time range
            req = Request(url=url, headers={'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0'})
            response = urlopen(req)
            html = BeautifulSoup(response)
            news_table = html.find(id='news-table')
            success = True
        except urllib.error.HTTPError as e:
            attempts += 1
            print(f"Error {e.code}: {e.reason}. Retrying attempt {attempts} for ticker {ticker}.")
    return ticker, news_table

# Helper function to fetch Yahoo Finance data
def fetch_yf_data(ticker):
    ticker_data = yf.Ticker(ticker)
    current_price = np.nan
    sector = np.nan
    industry = np.nan

    try:
        current_price = ticker_data.info['currentPrice']
    except KeyError:
        current_price = np.nan

    try:
        sector = ticker_data.info['sector']
    except KeyError:
        sector = np.nan

    try:
        industry = ticker_data.info['industry']
    except KeyError:
        industry = np.nan

    return ticker, sector, industry, current_price

In [11]:
# Fetch Finviz news tables concurrently
news_tables = {}
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {executor.submit(fetch_finviz, ticker): ticker for ticker in tickers}
    for future in concurrent.futures.as_completed(futures):
        ticker, news_table = future.result()
        news_tables[ticker] = news_table

# Fetch Yahoo Finance data concurrently
ticker_data = {}
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {executor.submit(fetch_yf_data, ticker): ticker for ticker in tickers}
    for future in concurrent.futures.as_completed(futures):
        ticker, sector, industry, current_price = future.result()
        ticker_data[ticker] = {'sector': sector, 'industry': industry, 'current_price': current_price}

In [14]:
news_tables

{'AIG': <table border="0" cellpadding="1" cellspacing="0" class="fullview-news-outer" id="news-table" width="100%">
 <tr class="cursor-pointer" onclick="trackAndOpenNews(event, 'Business Wire', 'https://finance.yahoo.com/news/aig-report-second-quarter-2023-201600503.html');">
 <td align="right" width="130">
             Jul-06-23 04:16PM
         </td>
 <td align="left">
 <div class="news-link-container">
 <div class="news-link-left">
 <a class="tab-link-news" href="https://finance.yahoo.com/news/aig-report-second-quarter-2023-201600503.html" rel="nofollow" target="_blank">AIG to Report Second Quarter 2023 Financial Results on August 1, 2023, and Host Conference Call on August 2</a>
 </div>
 <div class="news-link-right">
 <span>(Business Wire)</span></div></div></td></tr>
 <tr class="cursor-pointer" onclick="trackAndOpenNews(event, 'Zacks', 'https://finance.yahoo.com/news/met-aig-multiline-insurance-stock-150800349.html');">
 <td align="right" width="130">
             11:08AM
        

In [15]:
##### Parse the Date, Time and News Headlines into a Python List
parsed_news = []
# Iterate through the news
for file_name, news_table in news_tables.items():
    # Iterate through all tr tags in 'news_table'
    for x in news_table.findAll('tr'):
        # Check if the 'a' tag exists within the 'tr' tag
        if x.a is not None:
            # Read the text from the 'a' tag
            text = x.a.get_text()
            date_scrape = x.td.text.split()
            # if the length of 'date_scrape' is 1, load 'time' as the only element
            if len(date_scrape) == 1:
                time = date_scrape[0]
                
            # else load 'date' as the 1st element and 'time' as the second    
            else:
                date = date_scrape[0]
                time = date_scrape[1]
            # Extract the ticker from the file name, get the string up to the 1st '_'  
            ticker = file_name.split('_')[0]
            
            # Append ticker, date, time and headline as a list to the 'parsed_news' list
            parsed_news.append([ticker, date, time, text])

        else:
            # Handle cases where the 'a' tag is not found as desired
            text = "Not found"  # Or any other value or action depending on your needs

In [18]:
##### Perform Sentiment Analysis with Vader
# Instantiate the sentiment intensity analyzer
vader = SentimentIntensityAnalyzer()
# Set column names
columns = ['ticker', 'date', 'time', 'headline']
# Convert the parsed_news list into a DataFrame called 'parsed_and_scored_news'
parsed_and_scored_news = pd.DataFrame(parsed_news, columns=columns)

# Iterate through the headlines and get the polarity scores using vader
scores = parsed_and_scored_news['headline'].apply(vader.polarity_scores).tolist()
# Convert the 'scores' list of dicts into a DataFrame
scores_df = pd.DataFrame(scores)

# Join the DataFrames of the news and the list of dicts
parsed_and_scored_news = parsed_and_scored_news.join(scores_df, rsuffix='_right')
# Convert the date column from string to datetime
parsed_and_scored_news['date'] = pd.to_datetime(parsed_and_scored_news.date).dt.date

In [20]:
# Group by each ticker and get the mean of all sentiment scores
mean_scores = parsed_and_scored_news.groupby(['ticker']).mean()

  mean_scores = parsed_and_scored_news.groupby(['ticker']).mean()


In [22]:
# dictionary {'column name': list of values for column} to be converted to dataframe
d = {'Sector': sector, 'Industry': industry, 'Price': current_price}
# create dataframe from 
df_info = pd.DataFrame(data=d, index=tickers)
df_info

Unnamed: 0_level_0,Sector,Industry,Price
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,Energy,Oil & Gas Integrated,102.43
ABBV,Energy,Oil & Gas Integrated,102.43
ABT,Energy,Oil & Gas Integrated,102.43
ACN,Energy,Oil & Gas Integrated,102.43
ADBE,Energy,Oil & Gas Integrated,102.43
...,...,...,...
VZ,Energy,Oil & Gas Integrated,102.43
WBA,Energy,Oil & Gas Integrated,102.43
WFC,Energy,Oil & Gas Integrated,102.43
WMT,Energy,Oil & Gas Integrated,102.43
