**Import libraries**

In [271]:
import pandas as pd
import numpy as np
from datetime import date, timedelta, datetime
from operator import itemgetter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import requests
import plotly.graph_objects as go
from pandas_datareader import data as pdr

#yahoo finance and pandadata reader override
import yfinance as yf
yf.pdr_override()

#machine learning
from sklearn.linear_model import LinearRegression

**Try out Vader Sentiment Analyzer**

In [272]:
#call analyser object
analyser = SentimentIntensityAnalyzer()

In [273]:
def sentiment_analyzer_scores(sentence):
    score = analyser.polarity_scores(sentence)
    return score

In [274]:
sentiment_analyzer_scores("Today is an okay day.")

{'neg': 0.0, 'neu': 0.678, 'pos': 0.322, 'compound': 0.2263}

In [275]:
sentiment_analyzer_scores("Today is an amazing day!")

{'neg': 0.0, 'neu': 0.494, 'pos': 0.506, 'compound': 0.6239}

In [276]:
sentiment_analyzer_scores("Today is an amazing day! Vader is working but it took a while")

{'neg': 0.0, 'neu': 0.803, 'pos': 0.197, 'compound': 0.4003}

In [277]:
sentiment_analyzer_scores("Today kind of sucked.")

{'neg': 0.523, 'neu': 0.477, 'pos': 0.0, 'compound': -0.5095}

**Pull in Live News Data from News API**
<p>News API only display news up to a month old</p>

In [278]:
#enter company and api key
company = input('company: ')
stock_ticker = input('stock ticker: ')
api_key = input('api key: ')

company: starbucks
stock ticker: sbux
api key: 798c7707bbe8404199296c0521a99f61


<h4 style='background:#ffbdb3'>Major limitation of News API: Free version has max of 100 results per call; 1 month old max; 500 request per day</h4>
<h4>To get around this, we will make a request for each day, and compile the first 100 most popular results for each day</h4>

In [279]:
#get dates, 28 days from today (28 days max as per free api)
today = date.today() - timedelta(0)
numdays = 28
date_list = [today - timedelta(days=x) for x in range(numdays)]

In [280]:
#try return news for 1 day, make 7 requests, sort by most popular results
total_res = []

for x in range(0, numdays):  
    url = ('http://newsapi.org/v2/everything?'
           'q=' + company + '&'
           'from=' + str(date_list[x]) + '&'
           'to=' + str(date_list[x]) + '&'
           'language=en&'
           'sortBy=popularity&'
           'pageSize=100&'
           'apiKey=' + api_key)
    response = requests.get(url)
    total_res.append(response.json())


#print('Total Results: ' + str(response.json().get('totalResults')))

#print(*total_res, sep = '\n')   

{'status': 'error', 'code': 'rateLimited', 'message': 'You have made too many requests recently. Developer accounts are limited to 500 requests over a 24 hour period (250 requests available every 12 hours). Please upgrade to a paid plan if you need more requests.'}
{'status': 'error', 'code': 'rateLimited', 'message': 'You have made too many requests recently. Developer accounts are limited to 500 requests over a 24 hour period (250 requests available every 12 hours). Please upgrade to a paid plan if you need more requests.'}
{'status': 'error', 'code': 'rateLimited', 'message': 'You have made too many requests recently. Developer accounts are limited to 500 requests over a 24 hour period (250 requests available every 12 hours). Please upgrade to a paid plan if you need more requests.'}
{'status': 'error', 'code': 'rateLimited', 'message': 'You have made too many requests recently. Developer accounts are limited to 500 requests over a 24 hour period (250 requests available every 12 hou

In [282]:
#put all 'articles' in 1 list, then remove the nested list
articles = []
for x in range(0, len(total_res)):
    articles.append(total_res[x]['articles'])

news = []
def removeNesting(nestedList):
    for i in nestedList:
        if type(i) == list:
            removeNesting(i)
        else: 
            news.append(i)
            
removeNesting(articles)
print(news)

KeyError: 'articles'

**Perform Analysis on the News Title**

In [None]:
#news = response.json().get('articles')

#add title score:
na_score = {'neg': 0, 'neu': 0, 'pos': 0, 'compound': 0}

for item in news:
    if item['title'] is None:
        item.update({'title_score': na_score})
    else:
        item.update({'title_score': sentiment_analyzer_scores(item['title'])})
    if item['description'] is None:
        item.update({'desc_score': na_score})
    else:
        item.update({'desc_score': sentiment_analyzer_scores(item['description'])})

In [None]:
news_df = pd.DataFrame(news)
news_df.sort_values(by=['publishedAt'])

#replace all NaN score with 0 

print('row and column: ' + str(news_df.shape))
news_df.head(30)

In [None]:
#other ways to add title score object to news


#add title score way 2 -----------------------:

# def add_title_scoore():
#     for item in news:
#         score = {'title_score' : sentiment_analyzer_scores(item['title'])}
#     return news
# add_title_scoore()


#add title score way 3 -----------------------:

# result = [dict(item, scoreeeeeeeeeeeeeee=sentiment_analyzer_scores(item['title'])) for item in news]
# print(result)


#this works append static object -------------:

# for item in news:
#     item.update({'scooooooooooooooooore': 'baaaaaaaaaaaaaaaaaad'})
# print(news)

**Plot Title Sentiment Score**

In [None]:
def graph_sentiment(text, score, graph_title):
    x = text
    y = [d.get('neg') for d in score]
    y1 = [d.get('neu') for d in score]
    y2 = [d.get('pos') for d in score]
    y3 = [d.get('compound') for d in score]

    fig = go.Figure(go.Bar(x=x, y=y, name='Negative', marker_color='#EE7674'))
    fig.add_trace(go.Bar(x=x, y=y1, name='Neutral', marker_color='#247BA0'))
    fig.add_trace(go.Bar(x=x, y=y2, name='Positive'))
    fig.add_trace(go.Bar(x=x, y=y3, name='Compound'))

    fig.update_layout(title=graph_title)
    fig.show()


graph_sentiment(news_df['title'], news_df['title_score'], 'Title Sentiment')

In [None]:
graph_sentiment(news_df['description'], news_df['desc_score'], 'Description Sentiment')

***How to interpret Compound Values?*** 
<p>The compound score is computed by summing the valence scores of each word in the lexicon, adjusted according to the rules, and then normalized to be between -1 (most extreme negative) and +1 (most extreme positive). This is the most useful metric if you want a single unidimensional measure of sentiment for a given sentence. Calling it a 'normalized, weighted composite score' is accurate.</p>

**Next Step: Populate the score on a daily basis**
<p> Use the mean of the headline score for each day </p>

In [None]:
#extract date and scores
news_score_df = news_df[['publishedAt', 'title_score', 'desc_score']].copy()

#remove time from datetime
news_score_df['publishedAt'] = pd.to_datetime(news_score_df['publishedAt'].str.split('T').str[0])

#convert object into datetime
#pd.to_datetime(news_score_df['publishedAt'])

#populate each score in nested title_score and desc_score into own column
news_score_df = pd.concat([news_score_df, 
                 pd.DataFrame((d for i, d in news_score_df['title_score'].iteritems()))], 
                 axis=1)

news_score_df = pd.concat([news_score_df, 
                 pd.DataFrame((d for i, d in news_score_df['desc_score'].iteritems()))], 
                 axis=1)


news_score_df.columns = ['publishedAt', 'title_score', 'desc_score', 
                         't_neg', 't_neu', 't_pos', 't_compound',
                         'd_neg', 'd_neu', 'd_pos', 'd_compound']

daily_score_df = news_score_df.groupby('publishedAt', as_index=True)[['t_neg', 't_neu', 't_pos', 't_compound',
                         'd_neg', 'd_neu', 'd_pos', 'd_compound']].mean().reset_index()

#add % change to each column
daily_score_df['t_neg_pc'] = daily_score_df['t_neg'].pct_change()

daily_score_df.head(30)

In [None]:
def daily_score_graph(graph_title, x, y1, y2, y3, y4):
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=x, y=y1,
                        mode='lines+markers',
                        name='neg',
                        line=dict(color='#EE7674')))
    fig.add_trace(go.Scatter(x=x, y=y2,
                        mode='lines+markers',
                        name='neu',
                        line=dict(color='#247BA0')))
    fig.add_trace(go.Scatter(x=x, y=y3,
                        mode='lines+markers', 
                        name='pos'))
    fig.add_trace(go.Scatter(x=x, y=y4,
                        mode='lines+markers', 
                        name='compound',
                        line=dict(width=4)))

    fig.update_layout(title=graph_title)
    fig.show()

daily_score_graph('Daily Title Score', daily_score_df['publishedAt'], 
                  daily_score_df['t_neg'], daily_score_df['t_neu'], daily_score_df['t_pos'], daily_score_df['t_compound'])

In [None]:
daily_score_graph('Daily Description Score', daily_score_df['publishedAt'], 
                  daily_score_df['d_neg'], daily_score_df['d_neu'], daily_score_df['d_pos'], daily_score_df['d_compound'])


**Next Step: Pull Daily price of S&P and Dow Jones**

reference: https://pypi.org/project/yfinance/

In [None]:
stock_data = pdr.get_data_yahoo(stock_ticker, start=date_list[-1], end=date_list[0])
stock_data_df = pd.DataFrame(stock_data).reset_index()

stock_data_df['Adj Close PC'] = stock_data_df['Adj Close'].pct_change()

print('Start Date: ' + str(date_list[-1]))
print('End Date: ' + str(date_list[0]))
stock_data_df.head(15)

In [None]:
#plot daily close for last 7 days
def stock_graph():
    x = stock_data_df['Date']
    y = stock_data_df['Adj Close']

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=x, y=y,
                        mode='lines+markers',
                        name='Daily Close',
                        line=dict(color='#EE7674')))

    graph_title = str(company).upper() + ' Daily Close'
    fig.update_layout(title=graph_title)
    fig.show()

stock_graph()

#add label for day of week
#remove weekends

In [None]:
print(stock_data_df['Date'].dtypes)
print(daily_score_df['publishedAt'].dtypes)

**Next Step: Correlate the score of the daily movement of S&P and Dow Jones**

<p>Daily average Title score vs S&P <br />
   Daily average Description score vs S&P <br />
   Daily average Title score vs DJ <br />
   Daily average Description score vs DJ <br /></p>

**Compare % change each day between score and price**

In [228]:
price_score_df = pd.merge(daily_score_df, stock_data_df, how='left', left_on='publishedAt', right_on='Date')
price_score_df.head(15)

#saving data to csv
price_score_df.to_csv(r'/Users/susanqu/Desktop/Data Science/SentimentAnalysis/cmg.csv', index=False)

In [229]:
#Linear Regression Model
price_score_df = price_score_df.fillna(0)
price_score_df['t_neg_pc'] = price_score_df['t_neg_pc'].replace(np.inf, 0)
X = price_score_df['t_compound'].values.reshape(-1, 1)
Y = price_score_df['Adj Close PC'].values.reshape(-1, 1)
linear_regressor = LinearRegression()
linear_regressor.fit(X, Y)
Y_pred = linear_regressor.predict(X)
price_score_df['linear_fit'] = Y_pred

print('r: ' + str(linear_regressor.coef_))


r: [[0.01393979]]


In [230]:
#Plot Linear Model
def linear_graph():
    fig=go.Figure()
    fig.add_trace(go.Scatter(name='Title Compound Score vs Closing Price PC', x=price_score_df['t_compound'], y=price_score_df['Adj Close PC'], mode='markers'))
    fig.add_trace(go.Scatter(name='Best Fit', x=X, y=price_score_df['linear_fit'], mode='lines'))
    
    fig.update_layout(xaxis_title = 'Title Negative Score', yaxis_title = 'Closing Price % Change')
    fig.show()

linear_graph()

#need to fix line
#need to think about weekend data points

In [231]:
price_score_df.head(15)

Unnamed: 0,publishedAt,t_neg,t_neu,t_pos,t_compound,d_neg,d_neu,d_pos,d_compound,t_neg_pc,Date,Open,High,Low,Close,Adj Close,Volume,Adj Close PC,linear_fit
0,2020-02-24,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2020-02-24 00:00:00,897.419983,900.049988,883.929993,885.0,885.0,662300.0,0.0,-0.019388
1,2020-02-25,0.292,0.596,0.112,-0.2528,0.0,1.0,0.0,0.0,0.0,2020-02-25 00:00:00,890.0,891.909973,851.440002,856.01001,856.01001,736100.0,-0.032757,-0.022912
2,2020-02-26,0.0,0.974714,0.025286,0.060214,0.004429,0.970714,0.024857,0.099957,-1.0,2020-02-26 00:00:00,855.380005,868.789978,842.52002,847.289978,847.289978,570900.0,-0.010187,-0.018549
3,2020-02-27,0.064778,0.785556,0.149667,0.159833,0.110333,0.786667,0.103,0.089967,0.0,2020-02-27 00:00:00,811.390015,815.72998,750.72998,755.549988,755.549988,1351500.0,-0.108275,-0.01716
4,2020-02-28,0.0,0.769,0.231,0.2023,0.09,0.774,0.135,0.1779,-1.0,2020-02-28 00:00:00,734.5,774.840027,702.950012,773.580017,773.580017,1505900.0,0.023863,-0.016568
5,2020-03-01,0.0,0.783,0.217,0.3612,0.0,0.858,0.142,0.6486,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.014353
6,2020-03-02,0.0,0.903667,0.096333,0.180767,0.048667,0.884333,0.067,0.039,0.0,2020-03-02 00:00:00,779.52002,781.570007,739.900024,768.809998,768.809998,869200.0,-0.006166,-0.016868
7,2020-03-03,0.0,0.846,0.154,0.192529,0.031714,0.919143,0.049143,0.061129,0.0,2020-03-03 00:00:00,768.02002,781.780029,723.26001,736.070007,736.070007,779100.0,-0.042585,-0.016704
8,2020-03-04,0.0,0.9056,0.0944,0.1708,0.0485,0.8921,0.0595,0.08519,0.0,2020-03-04 00:00:00,764.429993,772.73999,740.690002,769.76001,769.76001,772600.0,0.04577,-0.017007
9,2020-03-05,0.0636,0.8822,0.0542,0.0454,0.0152,0.799,0.1858,0.63498,0.0,2020-03-05 00:00:00,750.01001,755.5,713.75,727.549988,727.549988,843800.0,-0.054835,-0.018755


In [232]:
#how today's news impact the next day's trading value?