In [1]:
from iexfinance.stocks import Stock
import pandas as pd
from newsapi.newsapi_client import NewsApiClient
from datetime import date, datetime, timedelta
import os
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
# Pull the NewsAPI key from the `news_api` environment variable.
# The value is None when that variable is not set.
api_key = os.environ.get("news_api")


In [3]:
# Create the NewsAPI client that the headline queries below go through,
# authenticating with the key read from the environment above.
newsapi = NewsApiClient(api_key=api_key)

In [4]:
# set date range for downloading headlines
# Today's date is read once so both ends of the range come from the same day,
# even if this cell happens to run across midnight.
# Change LOOKBACK to download a different window, e.g. timedelta(days=10).
LOOKBACK = timedelta(weeks=4)
end_date = date.today()
start_date = end_date - LOOKBACK

print(f"""
start date: {start_date}

end date: {end_date}
"""
)


start date: 2020-03-11

end date: 2020-04-08



In [7]:
# Fetch the S&P500 news articles

def get_headlines(keyword, start=None, end=None):
    """
    Download one page of English headlines per day for a search keyword.

    Walks backwards one day at a time from `end` down to, but not including,
    `start`, and for each day queries NewsApi.org (through the module-level
    `newsapi` client) for articles matching the keyword on that day.

    Params
    -------------
    keyword: string, the words you want to search for in headlines
    start: date, optional, the day at which the walk stops (this day itself is
           not queried); defaults to the module-level `start_date`
    end: date, optional, the most recent day to query; defaults to the
         module-level `end_date`

    Returns
    -------------
    all_headlines: list of lists, one inner list of article titles per day,
                   ordered from the most recent day to the oldest
    all_dates: list, the date each inner list of all_headlines belongs to

    """

    # fall back to the notebook-wide range when no explicit range is given
    first_day = start_date if start is None else start
    current_day = end_date if end is None else end

    # one entry per queried day, kept in step with each other
    all_headlines = []
    all_dates = []

    # human friendly output
    print(f"Fetching news about: {keyword}")
    print('*' * 30) # prints a string of asterisks repeated 30 times

    # step back one day per iteration until the start of the range is reached
    while current_day > first_day:

        # human friendly output
        print(f'retrieving news from: {current_day}')

        # query for the keyword on this single day;
        # only the first page of results is requested
        response = newsapi.get_everything(
            q=keyword,
            from_param=str(current_day),
            to=str(current_day),
            language='en',
            sort_by='relevancy',
            page=1
        )

        # keep just the titles; a day without articles yields an empty list
        all_headlines.append(
            [article['title'] for article in response['articles']]
        )

        # remember the day so the headlines can later be aligned with the
        # returns data and the other topics
        all_dates.append(current_day)

        current_day = current_day - timedelta(days=1)

    return all_headlines, all_dates

In [8]:
# test the function, download news headlines and get dates
# Every call walks the same start_date/end_date window, so the dates returned
# by the first call also line up with all the other headline lists; the dates
# from the remaining calls are discarded into `_`.
SP500_headlines, dates = get_headlines("S&P 500")
Trump_headlines, _ = get_headlines('Donald Trump AND S&P 500')
Economists_headlines, _ = get_headlines('Leading economists AND S&P 500')
Cramer_headlines, _ = get_headlines('Jim Cramer AND S&P 500')
Dimon_headlines, _ = get_headlines('Jamie Dimon AND S&P 500')
Solomon_headlines, _ = get_headlines('David Solomon AND S&P 500')
Lloyd_headlines, _ = get_headlines('Lloyd Blankfein AND S&P 500')
Corbat_headlines, _ = get_headlines('Michael Corbat AND S&P 500')

Fetching news about: S&P 500
******************************
retrieving news from: 2020-04-08
retrieving news from: 2020-04-07
retrieving news from: 2020-04-06
retrieving news from: 2020-04-05
retrieving news from: 2020-04-04
retrieving news from: 2020-04-03
retrieving news from: 2020-04-02
retrieving news from: 2020-04-01
retrieving news from: 2020-03-31
retrieving news from: 2020-03-30
retrieving news from: 2020-03-29
retrieving news from: 2020-03-28
retrieving news from: 2020-03-27
retrieving news from: 2020-03-26
retrieving news from: 2020-03-25
retrieving news from: 2020-03-24
retrieving news from: 2020-03-23
retrieving news from: 2020-03-22
retrieving news from: 2020-03-21
retrieving news from: 2020-03-20
retrieving news from: 2020-03-19
retrieving news from: 2020-03-18
retrieving news from: 2020-03-17
retrieving news from: 2020-03-16
retrieving news from: 2020-03-15
retrieving news from: 2020-03-14
retrieving news from: 2020-03-13
retrieving news from: 2020-03-12
Fetching news ab

In [9]:
# count total headlines returned for the S&P 500 query
# (the generator expression keeps its loop variable local, so the imported
# `date` class is no longer overwritten at module level)
headlines = sum(len(day) for day in SP500_headlines)
headlines

In [10]:
# count total headlines returned for the "Leading economists" query
# (the generator expression keeps its loop variable local, so the imported
# `date` class is no longer overwritten at module level)
headlines = sum(len(day) for day in Economists_headlines)
headlines

In [11]:
# Build one VADER analyzer; the scoring function below reads it as a global.
# NOTE(review): presumably needs the NLTK `vader_lexicon` resource to be
# downloaded already — confirm for this environment.
sid = SentimentIntensityAnalyzer()

In [12]:
def headline_sentiment_summarizer_avg(headlines):
    """
    Uses VADER SentimentIntensityAnalyzer to score headlines.

    Each headline is scored with the module-level analyzer `sid`, and the
    "compound" scores of a day are averaged into one value for that day.

    Params
    -----------------------------
    headlines: list of lists, collection of headlines or other text to be score.
                The outer list is the collection of headlines by days, where
                each entry represents a day, and the inner list for that day
                is a collection of headlines

    Returns
    ------------------------------
    sentiment: list of floats, one average compound score per day, in the same
                order as `headlines`. A day with nothing to score (no
                headlines, or only missing titles) gets NaN, so the result
                always has exactly one entry per day.

    """

    # one averaged score per day
    sentiment = []

    # loop through each day in headlines (loop through outer list)
    for day in headlines:

        # score every headline of the day that actually has text;
        # missing titles (None) are skipped
        day_scores = [
            sid.polarity_scores(h)["compound"]
            for h in day
            if h is not None
        ]

        if day_scores:
            # average of the day's headline scores
            sentiment.append(sum(day_scores) / len(day_scores))
        else:
            # nothing to average: record NaN instead of dividing by zero, so
            # the output keeps one entry per day
            sentiment.append(float("nan"))

    # return the vector or column of scores each related to one day of headlines
    return sentiment

In [None]:
# for each topic, produce a vector or column of average sentiment scores
# for each day (one entry per day, in the same order as the headline lists)

SP500_avg = headline_sentiment_summarizer_avg(SP500_headlines)
Trump_avg = headline_sentiment_summarizer_avg(Trump_headlines)
economy_avg = headline_sentiment_summarizer_avg(Economists_headlines)
Cramer_avg = headline_sentiment_summarizer_avg(Cramer_headlines)
Dimon_avg = headline_sentiment_summarizer_avg(Dimon_headlines)
Solomon_avg = headline_sentiment_summarizer_avg(Solomon_headlines)
Lloyd_avg = headline_sentiment_summarizer_avg(Lloyd_headlines)
Corbat_avg = headline_sentiment_summarizer_avg(Corbat_headlines)

In [None]:
# Collect the per-topic daily averages into a single DataFrame,
# one column per topic, for easier analysis and movement.

topic_sentiments = pd.DataFrame(
    dict(
        [
            ('SP500', SP500_avg),
            ('Trump', Trump_avg),
            ('Economists', economy_avg),
            ('Cramer', Cramer_avg),
            ('Dimon', Dimon_avg),
            ('Solomon', Solomon_avg),
            ('Lloyd', Lloyd_avg),
            ('Corbat', Corbat_avg),
        ]
    )
)

In [None]:
# re-associate the dates with the average scores produced
# (`dates` is the list returned alongside the S&P 500 headlines; it is passed
# through pd.to_datetime before being used as the row index)

topic_sentiments.index = pd.to_datetime(dates)

In [None]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# TODO (Troy): import the stocks dataframe here


In [None]:
# TODO: join or merge the sentiments with the core SPY returns dataframe
# (replace XXXXXX_df with the name of that dataframe once it is loaded):
#topic_sentiments = XXXXXX_df.join(topic_sentiments).dropna(how='any')