### Data Collection


1. GET S&P 500 company info<br>


In [35]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [36]:
import os
import pandas as pd
from dotenv import load_dotenv
import nltk as nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
from newsapi import NewsApiClient
from pathlib import Path
%matplotlib inline

[nltk_data] Downloading package vader_lexicon to C:\Users\Straw
[nltk_data]     Hat\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


### 1. Get S&P 500 Data

In [37]:
# Get table of the S&P 500 tickers, CIK, and industry from Wikipedia
wiki_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
cik_df = pd.read_html(wiki_url,header=0,index_col=0)[0]
cik_df['GICS Sector'] = cik_df['GICS Sector'].astype("category")
cik_df['GICS Sub Industry'] = cik_df['GICS Sector'].astype("category")
cik_df.head(100)

Unnamed: 0_level_0,Security,SEC filings,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded,GICS Sub Industry
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MMM,3M,reports,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1976-08-09,66740,1902,Industrials
AOS,A. O. Smith,reports,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916,Industrials
ABT,Abbott,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800,1888,Health Care
ABBV,AbbVie,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888),Health Care
ABMD,Abiomed,reports,Health Care,Health Care Equipment,"Danvers, Massachusetts",2018-05-31,815094,1981,Health Care
...,...,...,...,...,...,...,...,...,...
CE,Celanese,reports,Materials,Specialty Chemicals,"Irving, Texas",2018-12-24,1306830,1918,Materials
CNC,Centene,reports,Health Care,Managed Health Care,"St. Louis, Missouri",2016-03-30,1071739,1984,Health Care
CNP,CenterPoint Energy,reports,Utilities,Multi-Utilities,"Houston, Texas",1985-07-31,1130310,1882,Utilities
CDAY,Ceridian,reports,Information Technology,Application Software,"Minneapolis, Minnesota",2021-09-20,1725057,1992,Information Technology


In [38]:
from newsapi import NewsApiClient
load_dotenv()
newsapi = NewsApiClient(api_key=os.environ["NEWS_API_KEY"])

In [39]:
headlines = newsapi.get_everything(
    q="S&P 500" "",
    language="en",
    page_size=100,
    sort_by="relevancy"
)

In [40]:
# Create the S&p 500 sentiment scores DataFrame
sentiments = []

for article in headlines["articles"]:
    try:
        
        text = article["content"]
        date = article["publishedAt"][:10]
        sentiment = analyzer.polarity_scores(text)
        compound = sentiment["compound"]
        pos = sentiment["pos"]
        neu = sentiment["neu"]
        neg = sentiment["neg"]
        
        sentiments.append({
            
            
            "Compound": compound,
            "Positive": pos,
            "Negative": neg,
            "Neutral": neu,
            "Text": text
            
        })
        
    except AttributeError:
        pass
    
# Create DataFrame
df = pd.DataFrame(sentiments)

# Reorder DataFrame columns
cols = ["Compound", "Positive", "Negative", "Neutral","Text"]
df = df[cols]

df.head(10)

Unnamed: 0,Compound,Positive,Negative,Neutral,Text
0,0.6369,0.148,0.0,0.852,Feb 11 - Welcome to the home for real-time cov...
1,-0.6249,0.0,0.124,0.876,Posted \r\nWall Street's main indexes fell on ...
2,0.1531,0.046,0.0,0.954,There are plenty of solid reasons why investor...
3,0.6369,0.148,0.0,0.852,Feb 9 - Welcome to the home for real-time cove...
4,0.6908,0.151,0.0,0.849,Feb 10 (Reuters) - Futures tracking the S&amp;...
5,0.6369,0.144,0.0,0.856,Feb 11 - Welcome to the home for real-time cov...
6,-0.4404,0.0,0.086,0.914,NEW YORK (Reuters) - The S&amp;P 500 index end...
7,0.6369,0.152,0.0,0.848,Feb 7 - Welcome to the home for real-time cove...
8,-0.5859,0.0,0.127,0.873,Feb 14 (Reuters) - Goldman Sachs has tempered ...
9,0.0,0.0,0.0,1.0,"NEW YORK, Feb 22 (Reuters) - The S&amp;P 500's..."


In [41]:
df.describe()

Unnamed: 0,Compound,Positive,Negative,Neutral
count,100.0,100.0,100.0,100.0
mean,0.168539,0.1016,0.05414,0.84423
std,0.532687,0.081072,0.069649,0.082521
min,-0.836,0.0,0.0,0.633
25%,-0.307,0.0,0.0,0.79125
50%,0.2616,0.118,0.0,0.8495
75%,0.6369,0.152,0.11125,0.87375
max,0.91,0.287,0.282,1.0


In [42]:
# Instantiate the lemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Expand the default stopwords list if necessary
sw = set(stopwords.words('english'))

In [43]:
def tokenizer(text):
    """Tokenizes text."""
    
    # Remove the punctuation from text
    
    regex = re.compile("[^a-zA-Z ]")
   
    # Create a tokenized list of the words
    re_clean = regex.sub('', text)
    words = word_tokenize(re_clean)
    words = word_tokenize(re_clean)
    words = word_tokenize(re_clean.lower())
    
    # Lemmatize words into root words
    lem = [lemmatizer.lemmatize(word) for word in words]
   
    # Convert the words to lowercase
    words = [word for word in words if word not in sw]
    
    # Remove the stop words
   
    return lem

tokenizer(df.iloc[0]['Text'])

['feb',
 'welcome',
 'to',
 'the',
 'home',
 'for',
 'realtime',
 'coverage',
 'of',
 'market',
 'brought',
 'to',
 'you',
 'by',
 'reuters',
 'reporter',
 'you',
 'can',
 'share',
 'your',
 'thought',
 'with',
 'u',
 'at',
 'marketsresearchthomsonreuterscomsampp',
 'looking',
 'fo',
 'char']

In [44]:
df['tokens'] = df['Text'].apply(tokenizer)
df

Unnamed: 0,Compound,Positive,Negative,Neutral,Text,tokens
0,0.6369,0.148,0.000,0.852,Feb 11 - Welcome to the home for real-time cov...,"[feb, welcome, to, the, home, for, realtime, c..."
1,-0.6249,0.000,0.124,0.876,Posted \r\nWall Street's main indexes fell on ...,"[posted, wall, street, main, index, fell, on, ..."
2,0.1531,0.046,0.000,0.954,There are plenty of solid reasons why investor...,"[there, are, plenty, of, solid, reason, why, i..."
3,0.6369,0.148,0.000,0.852,Feb 9 - Welcome to the home for real-time cove...,"[feb, welcome, to, the, home, for, realtime, c..."
4,0.6908,0.151,0.000,0.849,Feb 10 (Reuters) - Futures tracking the S&amp;...,"[feb, reuters, future, tracking, the, sampp, a..."
...,...,...,...,...,...,...
95,0.8531,0.243,0.000,0.757,Feb 25 - Welcome to the home for real-time cov...,"[feb, welcome, to, the, home, for, realtime, c..."
96,-0.6249,0.000,0.150,0.850,NEW YORK (Reuters) - Geopolitical worries are ...,"[new, york, reuters, geopolitical, worry, are,..."
97,0.6369,0.123,0.000,0.877,Fortune 500 fintech company Global Payments (G...,"[fortune, fintech, company, global, payment, g..."
98,0.8038,0.215,0.000,0.785,Feb 28 - Welcome to the home for real-time cov...,"[feb, welcome, to, the, home, for, realtime, c..."


In [45]:
from collections import Counter
from nltk import ngrams

In [46]:
def get_token(df):
    tokens = []
    for i in df['tokens']:
        tokens.extend(i)
    return tokens

tokens = get_token(df)


In [47]:
def bigram_counter(tokens, N):
    words_count = dict(Counter(ngrams(tokens, n=2)))
    return words_count

In [48]:
# Function token_count generates the top 10 words for a given stock
def token_count(tokens, N=3):
    """Returns the top N tokens from the frequency count"""
    return Counter(tokens).most_common(N)

In [58]:
token_count(tokens, 10)

[('the', 160),
 ('to', 116),
 ('char', 100),
 ('of', 76),
 ('you', 70),
 ('a', 69),
 ('u', 61),
 ('for', 57),
 ('reuters', 56),
 ('on', 53)]

In [50]:
import spacy
from spacy import displacy

In [51]:
nlp = spacy.load('en_core_web_sm')

In [52]:
text = ' '.join(df['Text'])
text

'Feb 11 - Welcome to the home for real-time coverage of markets brought to you by Reuters reporters. You can share your thoughts with us at markets.research@thomsonreuters.com\r\nS&amp;P 500: LOOKING FO… [+2166 chars] Posted \r\nWall Street\'s main indexes fell on Tuesday, with the S&amp;P 500 confirming a correction, as the Ukraine-Russia crisis kept investors on edge after Russian President Vladimir Putin recognize… [+107 chars] There are plenty of solid reasons why investors view the Standard &amp; Poors 500 Index (S&amp;P 500) as a benchmark for both the economy and their individual portfolios. This index contains some of … [+4559 chars] Feb 9 - Welcome to the home for real-time coverage of markets brought to you by Reuters reporters. You can share your thoughts with us at markets.research@thomsonreuters.com\r\nS&amp;P 500, NASDAQ UP 1… [+5128 chars] Feb 10 (Reuters) - Futures tracking the S&amp;P 500 and the Nasdaq slipped on Thursday after the indexes notched strong gains in the 

In [53]:
doc = nlp(text)

In [54]:
# Add a title to the document
doc.user_data["title"] = "NER"

In [55]:
# Render the visualization
displacy.render(doc, style='ent')

In [56]:
# List all Entities
for ent in doc.ents:
    print('{} {}'.format(ent.text, ent.label_))

Reuters ORG
500 CARDINAL
Tuesday DATE
500 CARDINAL
Ukraine GPE
Russia GPE
Russian NORP
Vladimir Putin PERSON
Standard & ORG
500 CARDINAL
500 CARDINAL
Reuters ORG
500 CARDINAL
NASDAQ ORG
1 CARDINAL
Reuters ORG
500 CARDINAL
Nasdaq ORG
Thursday DATE
two CARDINAL
U.S. GPE
Reuters ORG
500 CARDINAL
EDGES ORG
NEW YORK GPE
Reuters ORG
500 CARDINAL
Monday DATE
late-afternoon TIME
U.S. GPE
Ukraine GPE
Reuters ORG
500 CARDINAL
NASDAQ FALL GPE
Feb 14 DATE
Reuters ORG
Goldman Sachs ORG
500 CARDINAL
5,000 CARDINAL
this year DATE
NEW YORK GPE
Feb 22 DATE
Reuters ORG
500 CARDINAL
1% PERCENT
Tuesday DATE
first ORDINAL
2020 DATE
Reuters ORG
500 CARDINAL
Reuters ORG
RBC ORG
5,050 CARDINAL
March 1 - Welcome DATE
Reuters ORG
BIDEN'S ORG
Reuters ORG
500 CARDINAL
EASES ORG
Reuters ORG
500 CARDINAL
Reuters ORG
500 CARDINAL
Reuters ORG
500 CARDINAL
Reuters ORG
Reuters ORG
SUPER BOWL SHOWDOWN ORG
WIL ORG
Reuters ORG
70% PERCENT
500 CARDINAL
Reuters ORG
500 CARDINAL
Reuters ORG
U.S. GPE
UK GPE
European NORP
Bide