### Data Collection


1. GET S&P 500 company info<br>


In [182]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [183]:
import os
import pandas as pd
from dotenv import load_dotenv
import nltk as nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
from newsapi import NewsApiClient
from pathlib import Path
%matplotlib inline

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\jakek\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


### 1. Get S&P 500 Data

In [184]:
# Get table of the S&P 500 tickers, CIK, and industry from Wikipedia
wiki_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
cik_df = pd.read_html(wiki_url,header=0,index_col=0)[0]
cik_df['GICS Sector'] = cik_df['GICS Sector'].astype("category")
cik_df['GICS Sub Industry'] = cik_df['GICS Sector'].astype("category")
cik_df.tail(100)

Unnamed: 0_level_0,Security,SEC filings,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded,GICS Sub Industry
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
RHI,Robert Half,reports,Industrials,Human Resource & Employment Services,"Menlo Park, California",2000-12-05,315213,1948,Industrials
ROK,Rockwell Automation,reports,Industrials,Electrical Components & Equipment,"Milwaukee, Wisconsin",,1024478,1903,Industrials
ROL,Rollins,reports,Industrials,Environmental & Facilities Services,"Atlanta, Georgia",2018-10-01,84839,1948,Industrials
ROP,Roper,reports,Industrials,Industrial Conglomerates,"Sarasota, Florida",2009-12-23,882835,1981,Industrials
ROST,Ross,reports,Consumer Discretionary,Apparel Retail,"Dublin, California",2009-12-21,745732,1982,Consumer Discretionary
...,...,...,...,...,...,...,...,...,...
YUM,Yum! Brands,reports,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,1041061,1997,Consumer Discretionary
ZBRA,Zebra,reports,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,877212,1969,Information Technology
ZBH,Zimmer Biomet,reports,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,1136869,1927,Health Care
ZION,Zions Bancorp,reports,Financials,Regional Banks,"Salt Lake City, Utah",2001-06-22,109380,1873,Financials


In [185]:
from newsapi import NewsApiClient
load_dotenv()
newsapi = NewsApiClient(api_key=os.environ["NEWS_API_KEY"])

In [186]:
headlines = newsapi.get_everything(
    q="S&P 1000" "",
    language="en",
    page_size=100,
    sort_by="relevancy"
)

In [187]:
# Create the S&p 500 sentiment scores DataFrame
sentiments = []

for article in headlines["articles"]:
    try:
        
        text = article["content"]
        date = article["publishedAt"][:10]
        sentiment = analyzer.polarity_scores(text)
        compound = sentiment["compound"]
        pos = sentiment["pos"]
        neu = sentiment["neu"]
        neg = sentiment["neg"]
        
        sentiments.append({
            
            
            "Compound": compound,
            "Positive": pos,
            "Negative": neg,
            "Neutral": neu,
            "Text": text
            
        })
        
    except AttributeError:
        pass
    
# Create DataFrame
df = pd.DataFrame(sentiments)

# Reorder DataFrame columns
cols = ["Compound", "Positive", "Negative", "Neutral","Text"]
df = df[cols]

df.head(10)

Unnamed: 0,Compound,Positive,Negative,Neutral,Text
0,-0.6369,0.0,0.144,0.856,With volatility being the catchword now in the...
1,0.5574,0.162,0.044,0.795,Designed to provide broad exposure to the Larg...
2,-0.0516,0.07,0.101,0.829,If you're interested in broad exposure to the ...
3,-0.4215,0.05,0.132,0.819,Feb 22 (Reuters) - Futures for Canada's main s...
4,-0.3182,0.0,0.067,0.933,Feb 25 (Reuters) - Futures for Canada's main s...
5,0.5423,0.111,0.0,0.889,Feb 18 (Reuters) - Futures for Canada's main s...
6,-0.0516,0.043,0.048,0.909,"Launched on 12/23/2014, the Invesco Russell 10..."
7,-0.1779,0.0,0.048,0.952,Looking for broad exposure to the Mid Cap Blen...
8,-0.296,0.0,0.056,0.944,Feb 16 (Reuters) - Futures for Canada's main s...
9,0.9118,0.318,0.0,0.682,Wall Street has been witnessing some strength ...


In [188]:
df.describe()

Unnamed: 0,Compound,Positive,Negative,Neutral
count,100.0,100.0,100.0,100.0
mean,0.346973,0.10395,0.01986,0.87623
std,0.376791,0.078357,0.043097,0.081189
min,-0.7579,0.0,0.0,0.682
25%,0.0,0.04825,0.0,0.832
50%,0.3818,0.111,0.0,0.856
75%,0.6369,0.148,0.0,0.92625
max,0.9118,0.318,0.249,1.0


In [189]:
# Instantiate the lemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Expand the default stopwords list if necessary
sw = set(stopwords.words('english'))

In [190]:
def tokenizer(text):
    """Tokenizes text."""
    
    # Remove the punctuation from text
    
    regex = re.compile("[^a-zA-Z ]")
   
    # Create a tokenized list of the words
    re_clean = regex.sub('', text)
    words = word_tokenize(re_clean)
    words = word_tokenize(re_clean)
    words = word_tokenize(re_clean.lower())
    
    # Lemmatize words into root words
    lem = [lemmatizer.lemmatize(word) for word in words]
   
    # Convert the words to lowercase
    words = [word for word in words if word not in sw]
    
    # Remove the stop words
   
    return lem

tokenizer(df.iloc[0]['Text'])

['with',
 'volatility',
 'being',
 'the',
 'catchword',
 'now',
 'in',
 'the',
 'broader',
 'equity',
 'market',
 'due',
 'to',
 'a',
 'hawkish',
 'fed',
 'and',
 'geopolitical',
 'tension',
 'related',
 'to',
 'russia',
 'and',
 'ukraine',
 'investor',
 'might',
 'be',
 'clueless',
 'about',
 'the',
 'future',
 'movemen',
 'char']

In [191]:
df['tokens'] = df['Text'].apply(tokenizer)
df

Unnamed: 0,Compound,Positive,Negative,Neutral,Text,tokens
0,-0.6369,0.000,0.144,0.856,With volatility being the catchword now in the...,"[with, volatility, being, the, catchword, now,..."
1,0.5574,0.162,0.044,0.795,Designed to provide broad exposure to the Larg...,"[designed, to, provide, broad, exposure, to, t..."
2,-0.0516,0.070,0.101,0.829,If you're interested in broad exposure to the ...,"[if, youre, interested, in, broad, exposure, t..."
3,-0.4215,0.050,0.132,0.819,Feb 22 (Reuters) - Futures for Canada's main s...,"[feb, reuters, future, for, canada, main, stoc..."
4,-0.3182,0.000,0.067,0.933,Feb 25 (Reuters) - Futures for Canada's main s...,"[feb, reuters, future, for, canada, main, stoc..."
...,...,...,...,...,...,...
95,0.4588,0.070,0.000,0.930,Welcome everyone to the131st edition of ‘Hot G...,"[welcome, everyone, to, thest, edition, of, ho..."
96,0.3400,0.070,0.000,0.930,"Smead Capital Management, an investment manage...","[smead, capital, management, an, investment, m..."
97,0.3818,0.075,0.000,0.925,Vitalii Stamat/iStock via Getty Images\r\nRamb...,"[vitalii, stamatistock, via, getty, imagesramb..."
98,0.7430,0.213,0.000,0.787,"DULUTH, Ga.--(BUSINESS WIRE)--Primerica, Inc. ...","[duluth, gabusiness, wireprimerica, inc, nysep..."


In [192]:
from collections import Counter
from nltk import ngrams

In [193]:
def get_token(df):
    tokens = []
    for i in df['tokens']:
        tokens.extend(i)
    return tokens

tokens = get_token(df)


In [194]:
def bigram_counter(tokens, N):
    words_count = dict(Counter(ngrams(tokens, n=2)))
    return words_count

In [195]:
# Function token_count generates the top 10 words for a given stock
def token_count(tokens, N=3):
    """Returns the top N tokens from the frequency count"""
    return Counter(tokens).most_common(N)

In [196]:
token_count(tokens, 10)

[('the', 145),
 ('of', 107),
 ('char', 100),
 ('to', 83),
 ('a', 69),
 ('you', 52),
 ('for', 46),
 ('can', 44),
 ('with', 41),
 ('fund', 38)]

In [197]:
import spacy
from spacy import displacy

In [198]:
nlp = spacy.load('en_core_web_sm')

In [199]:
text = ' '.join(df['Text'])
text

"With volatility being the catchword now in the broader equity market due to a hawkish Fed and geopolitical tensions related to Russia and Ukraine, investors might be clueless about the future movemen… [+4981 chars] Designed to provide broad exposure to the Large Cap Value segment of the US equity market, the SPDR Portfolio S&amp;P 500 Value ETF (SPYV) is a passively managed exchange traded fund launched on 09/2… [+4403 chars] If you're interested in broad exposure to the Large Cap Blend segment of the US equity market, look no further than the Schwab 1000 Index ETF (SCHK), a passively managed exchange traded fund launched… [+4295 chars] Feb 22 (Reuters) - Futures for Canada's main stock index fell on Tuesday, with investors joining a global flight from risky assets, as tensions between Russia and Ukraine escalated, although stronger… [+2456 chars] Feb 25 (Reuters) - Futures for Canada's main stock index fell on Friday as commodity prices weakened, while Russia's invasion of Ukraine ke

In [200]:
doc = nlp(text)

In [201]:
# Add a title to the document
doc.user_data["title"] = "NER"

In [202]:
# Render the visualization
displacy.render(doc, style='ent')

In [203]:
# List all Entities
org_list = []

for ent in doc.ents:
    if ent.label_ == 'ORG':
        org_list.append(ent.text)

org_list = Counter(org_list).most_common(100)

df_org = pd.DataFrame(org_list, columns = ['text', 'count'])

df_org.head(50)

Unnamed: 0,text,count
0,Reuters,26
1,Getty Images,9
2,Polen Capital,9
3,Polen Focus Growth,9
4,Polen,9
5,RiverPark Large Growth Fund,9
6,The RiverPark Large Growth Fund,9
7,SPDR,2
8,ETF,2
9,AIX,2


In [204]:
# Dropping Reuters as it is a publisher
df_org = df_org.iloc[2:]
df_org

Unnamed: 0,text,count
2,Polen Capital,9
3,Polen Focus Growth,9
4,Polen,9
5,RiverPark Large Growth Fund,9
6,The RiverPark Large Growth Fund,9
...,...,...
79,"Ga.--(BUSINESS WIRE)--Primerica, Inc.",1
80,PRI,1
81,the Board of Directors,1
82,+996,1


In [None]:
headlines = newsapi.get_everything(
    q="Polen" "",
    language="en",
    page_size=100,
    sort_by="relevancy"
)

In [206]:
# Create the Polen sentiment scores DataFrame
sentiments = []

for article in headlines["articles"]:
    try:
        
        text = article["content"]
        date = article["publishedAt"][:10]
        sentiment = analyzer.polarity_scores(text)
        compound = sentiment["compound"]
        pos = sentiment["pos"]
        neu = sentiment["neu"]
        neg = sentiment["neg"]
        
        sentiments.append({
            
            
            "Compound": compound,
            "Positive": pos,
            "Negative": neg,
            "Neutral": neu,
            "Text": text
            
        })
        
    except AttributeError:
        pass
    
# Create DataFrame
df_polen = pd.DataFrame(sentiments)

# Reorder DataFrame columns
cols = ["Compound", "Positive", "Negative", "Neutral","Text"]
df_polen = df[cols]

df_polen.head(10)

Unnamed: 0,Compound,Positive,Negative,Neutral,Text
0,-0.6369,0.0,0.144,0.856,With volatility being the catchword now in the...
1,0.5574,0.162,0.044,0.795,Designed to provide broad exposure to the Larg...
2,-0.0516,0.07,0.101,0.829,If you're interested in broad exposure to the ...
3,-0.4215,0.05,0.132,0.819,Feb 22 (Reuters) - Futures for Canada's main s...
4,-0.3182,0.0,0.067,0.933,Feb 25 (Reuters) - Futures for Canada's main s...
5,0.5423,0.111,0.0,0.889,Feb 18 (Reuters) - Futures for Canada's main s...
6,-0.0516,0.043,0.048,0.909,"Launched on 12/23/2014, the Invesco Russell 10..."
7,-0.1779,0.0,0.048,0.952,Looking for broad exposure to the Mid Cap Blen...
8,-0.296,0.0,0.056,0.944,Feb 16 (Reuters) - Futures for Canada's main s...
9,0.9118,0.318,0.0,0.682,Wall Street has been witnessing some strength ...


In [207]:
df_polen.describe()

Unnamed: 0,Compound,Positive,Negative,Neutral
count,100.0,100.0,100.0,100.0
mean,0.346973,0.10395,0.01986,0.87623
std,0.376791,0.078357,0.043097,0.081189
min,-0.7579,0.0,0.0,0.682
25%,0.0,0.04825,0.0,0.832
50%,0.3818,0.111,0.0,0.856
75%,0.6369,0.148,0.0,0.92625
max,0.9118,0.318,0.249,1.0
