In [1]:
import pandas as pd
import numpy as np
from textblob import TextBlob
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import os
import sys
sys.path.append("../scripts/")  # Add the scripts folder to the path

import utils as util
import eda

  from scipy.stats import fisher_exact


In [2]:
# Input locations, relative to the notebook's working directory
all_data_path1 = '../data/raw_analyst_ratings.csv'   # news headlines with publisher/date/stock columns
all_data_path2 = '../data/AAPL_historical_data.csv'  # AAPL price history

# util.read_csv_file hands back a mapping; the loaded table is stored
# under its "data" key, so pull that out straight away.
news_data = util.read_csv_file(all_data_path1).get("data")
stock_data = util.read_csv_file(all_data_path2).get("data")

# Quick sanity check: preview the top rows of both tables
print(news_data.head())
print(stock_data.head())

                                            headline  \
0            Stocks That Hit 52-Week Highs On Friday   
1         Stocks That Hit 52-Week Highs On Wednesday   
2                      71 Biggest Movers From Friday   
3       46 Stocks Moving In Friday's Mid-Day Session   
4  B of A Securities Maintains Neutral on Agilent...   

                                                 url          publisher  \
0  https://www.benzinga.com/news/20/06/16190091/s...  Benzinga Insights   
1  https://www.benzinga.com/news/20/06/16170189/s...  Benzinga Insights   
2  https://www.benzinga.com/news/20/05/16103463/7...         Lisa Levin   
3  https://www.benzinga.com/news/20/05/16095921/4...         Lisa Levin   
4  https://www.benzinga.com/news/20/05/16095304/b...         Vick Meyer   

                        date stock  
0  2020-06-05 10:30:54-04:00     A  
1  2020-06-03 10:45:20-04:00     A  
2  2020-05-26 04:30:07-04:00     A  
3  2020-05-22 12:45:06-04:00     A  
4  2020-05-22 11:38:59-04:0

In [13]:
# Keep only the headlines tagged with the AAPL ticker.
# An explicit copy makes aapl_news an independent frame, so columns added to it
# later are not written to a slice of news_data (avoids SettingWithCopyWarning).
aapl_news = news_data[news_data['stock'] == 'AAPL'].copy()

# Preview the first rows rather than dumping the entire filtered frame
print(aapl_news.head())

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Dividends', 'Stock Splits'],
      dtype='object')


In [None]:
# Convert the date columns to datetime format, then use them as the index so
# the news and price tables can be aligned on date.
# set_index returns a new frame that is rebound to the same name; this avoids
# mutating the object returned by eda.parse_dates in place.
aapl_news = eda.parse_dates(aapl_news, column_name="date").set_index('date')
stock_data = eda.parse_dates(stock_data, column_name="Date").set_index('Date')

# Display the first few rows to confirm both frames are now date-indexed
print(aapl_news.head())
print(stock_data.head())

                                                                    headline  \
date                                                                           
2020-06-05 10:30:54-04:00            Stocks That Hit 52-Week Highs On Friday   
2020-06-03 10:45:20-04:00         Stocks That Hit 52-Week Highs On Wednesday   
2020-05-26 04:30:07-04:00                      71 Biggest Movers From Friday   
2020-05-22 12:45:06-04:00       46 Stocks Moving In Friday's Mid-Day Session   
2020-05-22 11:38:59-04:00  B of A Securities Maintains Neutral on Agilent...   

                                                                         url  \
date                                                                           
2020-06-05 10:30:54-04:00  https://www.benzinga.com/news/20/06/16190091/s...   
2020-06-03 10:45:20-04:00  https://www.benzinga.com/news/20/06/16170189/s...   
2020-05-26 04:30:07-04:00  https://www.benzinga.com/news/20/05/16103463/7...   
2020-05-22 12:45:06-04:00  https://www.

In [4]:
# Function to calculate sentiment score
def get_sentiment(text):
    """Return the TextBlob polarity score of ``text``.

    Parameters
    ----------
    text : str
        Headline (or any free text) to score.

    Returns
    -------
    float
        Polarity between -1 (negative) and 1 (positive). Blank strings score
        0.0. Non-string input (e.g. ``None`` or the NaN pandas uses for a
        missing cell) yields NaN instead of raising, so missing headlines are
        skipped by NaN-aware aggregates rather than counted as neutral.
    """
    # TextBlob rejects non-string input with a TypeError; treat it as missing.
    if not isinstance(text, str):
        return float("nan")
    # Nothing to assess in a blank string: neutral polarity.
    if not text.strip():
        return 0.0
    return TextBlob(text).sentiment.polarity

# Apply sentiment analysis to the AAPL headlines
aapl_news['sentiment'] = aapl_news['headline'].apply(get_sentiment)

# Reset the index to prepare for aggregation.
# The 'date' index was set on aapl_news, so that is the frame whose index must
# be moved back into a column; news_data still carries its default index.
aapl_news = aapl_news.reset_index()

In [None]:
# Calculate daily returns: fractional change in Close relative to the previous row
stock_data['daily_return'] = stock_data['Close'].pct_change()

# The first row has no previous close, so its return is NaN.
# Drop only rows whose return is missing; an unqualified dropna() would also
# discard valid days that happen to have a gap in some unrelated column.
stock_data = stock_data.dropna(subset=['daily_return'])