In [39]:
# Import necessary libraries

import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import numpy as np

In [40]:
# Load the raw analyst-ratings dataset into a DataFrame.
# The path is relative to the directory the notebook is run from.
file_path = '../data/raw_analyst_ratings.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [9]:
# Work on a copy so the raw frame `df` stays untouched and this cell can be re-run safely
sentiment_data = df.copy()

# Create a SentimentIntensityAnalyzer object.
# The analyzer raises LookupError when the VADER lexicon has not been downloaded yet
# (e.g. on a fresh environment), so fetch it once and retry in that case.
try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    sia = SentimentIntensityAnalyzer()

# Calculate the sentiment of the headlines: score each headline and keep only
# the 'compound' entry of the score dict returned by the analyzer
sentiment_data['sentiment'] = sentiment_data['headline'].apply(
    lambda x: sia.polarity_scores(text=x)['compound']
)

# Inspect the sentiment data
sentiment_data.head()

Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock,sentiment
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54-04:00,A,0.0
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20-04:00,A,0.0
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07-04:00,A,0.0
3,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06-04:00,A,0.0
4,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59-04:00,A,0.296


In [41]:
# Summarise the columns, dtypes and non-null counts of the scored frame.
# NOTE(review): the recorded output lists `sentiment_category`, which is only created by the
# categorization cell further down — this cell was executed out of order, so a fresh
# top-to-bottom run will not list that column here.
sentiment_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1407328 entries, 0 to 1407327
Data columns (total 8 columns):
 #   Column              Non-Null Count    Dtype   
---  ------              --------------    -----   
 0   Unnamed: 0          1407328 non-null  int64   
 1   headline            1407328 non-null  object  
 2   url                 1407328 non-null  object  
 3   publisher           1407328 non-null  object  
 4   date                1407328 non-null  object  
 5   stock               1407328 non-null  object  
 6   sentiment           1407328 non-null  float64 
 7   sentiment_category  1407328 non-null  category
dtypes: category(1), float64(1), int64(1), object(5)
memory usage: 76.5+ MB


In [11]:
# Categorize the sentiment scores into labelled, right-closed intervals:
#   [-1, -0.5]       -> 'Very Negative'
#   (-0.5, -0.0001]  -> 'Negative'
#   (-0.0001, 0.5]   -> 'Neutral'
#   (0.5, 1]         -> 'Positive'
# NOTE(review): the scheme is asymmetric — a score of 0.3 is labelled 'Neutral' while -0.3 is
# labelled 'Negative'. Confirm this is intended before comparing negative vs. positive counts.
sentiment_data['sentiment_category'] = pd.cut(
    sentiment_data['sentiment'],
    bins=[-1, -0.5, -0.0001, 0.5, 1],
    labels=['Very Negative', 'Negative', 'Neutral', 'Positive'],
    # Without include_lowest a score of exactly -1 lies outside every bin and becomes NaN
    include_lowest=True,
)

In [12]:
# Display the categorized sentiment data: number of headlines in each category
category_counts = sentiment_data['sentiment_category'].value_counts()
category_counts

sentiment_category
Neutral          1070915
Negative          182864
Positive          109718
Very Negative      43831
Name: count, dtype: int64

In [15]:
# Inspect the earliest and latest dates in the sentiment_data.
# Double quotes delimit the f-strings so the single-quoted column key inside them
# also parses on Python versions older than 3.12.
# NOTE(review): the recorded output mixes two timestamp formats, which suggests `date`
# holds raw strings — min()/max() would then compare text, not parsed datetimes; confirm.
print(f"Earliest Date: {sentiment_data['date'].min()}")
print(f"Latest Date: {sentiment_data['date'].max()}")

Earliest Date: 2009-02-14 00:00:00
Latest Date: 2020-06-11 17:12:35-04:00


# Sentiment analysis for the main stocks

- I have performed sentiment analysis for Apple, Amazon, Google, NVIDIA and Tesla.

In [59]:
# Import the utils module from the scripts package by loading it directly from its file path

import os
# `importlib.util` is a submodule: it must be imported explicitly, because a bare
# `import importlib` does not guarantee that `importlib.util` is available.
import importlib.util
from pathlib import Path

# The scripts directory is looked up two levels above the current working directory
base_dir = Path(os.getcwd()).resolve().parent.parent
module_path = base_dir / 'scripts' / 'utils.py'

# Fail early with a clear message instead of an error from deep inside the import machinery
if not module_path.is_file():
    raise FileNotFoundError(f"utils module not found at {module_path}")

# Build a module object from the file and execute it so its functions become available as `utils.*`
spec = importlib.util.spec_from_file_location("utils", module_path)
utils = importlib.util.module_from_spec(spec)
spec.loader.exec_module(utils)


In [61]:
# Tickers of the stocks of concern
stock_names = ['AAPL', 'AMZN', 'GOOG', 'NVDA', 'TSLA']

# Run the sentiment breakdown for each ticker under its own heading,
# with a blank line separating consecutive tickers
for stock_name in stock_names:
    print(f"Sentiment scores for {stock_name}")
    utils.stock_sentiment_analysis(stock_name=stock_name, sentiment_data=sentiment_data)
    print()

Sentiment scores for AAPL
sentiment_category
Neutral          298
Negative          74
Positive          43
Very Negative     26
Name: count, dtype: int64

Sentiment scores for AMZN
sentiment_category
Neutral          201
Negative          38
Positive          28
Very Negative     11
Name: count, dtype: int64

Sentiment scores for GOOG
sentiment_category
Neutral          840
Negative         169
Positive         140
Very Negative     50
Name: count, dtype: int64

Sentiment scores for NVDA
sentiment_category
Neutral          2393
Negative          369
Positive          281
Very Negative     103
Name: count, dtype: int64

Sentiment scores for TSLA
sentiment_category
Neutral          1418
Negative          221
Positive          172
Very Negative      64
Name: count, dtype: int64



In [62]:
# Run the date-range inspection for every ticker in stock_names,
# each under its own heading and followed by a blank line
for stock_name in stock_names:
    print(f"Date range analysis for {stock_name}")
    utils.inspect_sentiment_dates(stock_name=stock_name, sentiment_data=sentiment_data)
    print()

Date range analysis for AAPL
Earliest Date: 2020-03-09 00:00:00
Latest Date: 2020-06-10 11:33:26-04:00

Date range analysis for AMZN
Earliest Date: 2020-04-27 00:00:00
Latest Date: 2020-06-10 13:18:50-04:00

Date range analysis for GOOG
Earliest Date: 2018-11-13 00:00:00
Latest Date: 2020-06-10 15:25:13-04:00

Date range analysis for NVDA
Earliest Date: 2011-03-03 00:00:00
Latest Date: 2020-06-10 12:37:10-04:00

Date range analysis for TSLA
Earliest Date: 2019-07-01 00:00:00
Latest Date: 2020-06-10 17:02:47-04:00

