In [49]:
import pandas as pd
import numpy as np
from textblob import TextBlob
import os
from tqdm import tqdm

tqdm.pandas()

In [50]:
news_path = r"C:\Users\bless\OneDrive\Desktop\week-2\nova-financial-analysis\data\raw\raw_analyst_ratings.csv"
stock_dir = r"C:\Users\bless\OneDrive\Desktop\week-2\nova-financial-analysis\data\stocks"
stock_symbols = ['AAPL', 'AMZN', 'GOOG', 'META', 'MSFT', 'NVDA', 'TSLA']

In [51]:
news_df = pd.read_csv(news_path)

In [52]:
news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce')
news_df = news_df.dropna(subset=['date'])

In [53]:
news_df['date'] = news_df['date'].dt.date
print(news_df.head())

   Unnamed: 0                                           headline  \
0           0            Stocks That Hit 52-Week Highs On Friday   
1           1         Stocks That Hit 52-Week Highs On Wednesday   
2           2                      71 Biggest Movers From Friday   
3           3       46 Stocks Moving In Friday's Mid-Day Session   
4           4  B of A Securities Maintains Neutral on Agilent...   

                                                 url          publisher  \
0  https://www.benzinga.com/news/20/06/16190091/s...  Benzinga Insights   
1  https://www.benzinga.com/news/20/06/16170189/s...  Benzinga Insights   
2  https://www.benzinga.com/news/20/05/16103463/7...         Lisa Levin   
3  https://www.benzinga.com/news/20/05/16095921/4...         Lisa Levin   
4  https://www.benzinga.com/news/20/05/16095304/b...         Vick Meyer   

         date stock  
0  2020-06-05     A  
1  2020-06-03     A  
2  2020-05-26     A  
3  2020-05-22     A  
4  2020-05-22     A  


In [54]:
def get_sentiment(text):
    return TextBlob(str(text)).sentiment.polarity

In [55]:
news_df['sentiment'] = news_df['headline'].progress_apply(get_sentiment)

daily_sentiment = news_df.groupby(['date', 'stock']).agg({'sentiment': 'mean'}).reset_index()

print(daily_sentiment.head())

100%|██████████| 55987/55987 [00:16<00:00, 3355.20it/s]

         date stock  sentiment
0  2011-04-27   DGP   0.000000
1  2011-04-28   DEJ   0.000000
2  2011-04-28   ESR   0.136364
3  2011-04-29   AIA  -0.166667
4  2011-04-29   GDL   0.500000





In [56]:
def load_stock_data(stock_symbol):
    path = os.path.join(stock_dir, f"{stock_symbol}_historical_data.csv")
    df = pd.read_csv(path)


    df['Date'] = pd.to_datetime(df['Date']).dt.date


    df = df.sort_values('Date')
    df['daily_return_next'] = df['Close'].pct_change().shift(-1)  
    return df[['Date', 'daily_return_next']]


In [57]:
results = {}

for stock in stock_symbols:
    print(f"\nProcessing {stock}...")
    stock_df = load_stock_data(stock)
 
    stock_sentiment = daily_sentiment[daily_sentiment['stock'] == stock].copy()

    merged_df = pd.merge(stock_sentiment, stock_df, left_on='date', right_on='Date', how='inner')

    merged_df.dropna(subset=['sentiment', 'daily_return_next'], inplace=True)
    print(f"Merged rows for {stock}: {len(merged_df)}")

    if len(merged_df) > 0:
        corr = merged_df['sentiment'].corr(merged_df['daily_return_next'])
    else:
        corr = np.nan

    results[stock] = corr
    print(f"Correlation for {stock}: {corr}")






Processing AAPL...
Merged rows for AAPL: 2
Correlation for AAPL: -1.0

Processing AMZN...
Merged rows for AMZN: 2
Correlation for AMZN: -0.9999999999999998

Processing GOOG...
Merged rows for GOOG: 5
Correlation for GOOG: 0.10930744944084761

Processing META...
Merged rows for META: 0
Correlation for META: nan

Processing MSFT...
Merged rows for MSFT: 0
Correlation for MSFT: nan

Processing NVDA...
Merged rows for NVDA: 4
Correlation for NVDA: 0.36111448918472894

Processing TSLA...
Merged rows for TSLA: 1
Correlation for TSLA: nan


  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


In [58]:
print("\n=== Summary of Correlations ===")
for stock, corr in results.items():
    print(f"{stock}: {corr}")


=== Summary of Correlations ===
AAPL: -1.0
AMZN: -0.9999999999999998
GOOG: 0.10930744944084761
META: nan
MSFT: nan
NVDA: 0.36111448918472894
TSLA: nan
