# Financial News Sentiment Analysis

This notebook analyzes the correlation between financial news sentiment and stock price movements.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import sys
import os

# Add src directory to path
sys.path.append(os.path.abspath('..'))

from src.sentiment_analysis import NewsSentimentAnalyzer
from src.stock_data_loader import load_stock_data

# Set plot style
plt.style.use('default')
sns.set_palette('husl')

## Load Data

First, we'll load both the news data and stock data.

In [2]:
# Load news data
news_data = pd.read_csv('../data/raw_analyst_ratings.csv')
print(f"News data shape: {news_data.shape}")
news_data.head()

News data shape: (1407328, 6)


Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54-04:00,A
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20-04:00,A
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07-04:00,A
3,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06-04:00,A
4,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59-04:00,A


In [3]:
# Load stock data for all symbols
symbols = ['AAPL', 'AMZN', 'GOOG', 'META', 'MSFT', 'NVDA', 'TSLA']
stock_data = load_stock_data(symbols, data_dir='../data')

for symbol in symbols:
    if symbol in stock_data:
        print(f"{symbol} data range: {stock_data[symbol].index.min()} to {stock_data[symbol].index.max()}")
    else:
        print(f"No data found for {symbol}")

AAPL data range: 0 to 10997
AMZN data range: 0 to 6845
GOOG data range: 0 to 5019
META data range: 0 to 2925
MSFT data range: 0 to 9671
NVDA data range: 0 to 6420
TSLA data range: 0 to 3544


## Initialize Sentiment Analyzer and Calculate Sentiment Scores

We'll create an instance of NewsSentimentAnalyzer and calculate sentiment scores for all headlines.

In [4]:
# Initialize analyzer
analyzer = NewsSentimentAnalyzer(news_data, stock_data)

# Calculate sentiment scores
news_with_sentiment = analyzer.calculate_sentiment()

# Display sentiment distribution
plt.figure(figsize=(10, 6))
news_with_sentiment['sentiment_category'].value_counts().plot(kind='bar')
plt.title('Distribution of News Sentiment Categories')
plt.xlabel('Sentiment Category')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

ValueError: time data "2020-05-22 00:00:00" doesn't match format "%Y-%m-%d %H:%M:%S%z", at position 10. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

## Analyze Each Stock

For each stock, we'll:
1. Generate a sentiment summary
2. Analyze correlation between sentiment and returns
3. Plot sentiment analysis results

In [5]:
for symbol in symbols:
    if symbol not in stock_data:
        print(f"Skipping {symbol} - no stock data available")
        continue
        
    print(f"\n{'='*80}\nAnalyzing {symbol}\n{'='*80}")
    
    try:
        # Get sentiment summary
        summary = analyzer.get_sentiment_summary(symbol)
        print("\nSentiment Summary:")
        print(f"Overall Sentiment: {summary['overall_sentiment']:.2f}")
        print(f"Total Articles: {summary['total_articles']}")
        print("\nSentiment Distribution:")
        print(summary['sentiment_distribution'])
        
        # Analyze correlation
        correlation = analyzer.analyze_correlation(symbol)
        print("\nCorrelation Analysis:")
        print(f"Overall Correlation: {correlation['overall_correlation']:.3f}")
        print(f"Best Lag: {correlation['best_lag']} days")
        print(f"Best Lag Correlation: {correlation['best_lag_correlation']:.3f}")
        
        # Plot sentiment analysis
        analyzer.plot_sentiment_analysis(symbol)
        
        # Plot time lag correlation
        analyzer.plot_time_lag_correlation(symbol)
        
    except Exception as e:
        print(f"Error analyzing {symbol}: {str(e)}")


Analyzing AAPL
Error analyzing AAPL: name 'analyzer' is not defined

Analyzing AMZN
Error analyzing AMZN: name 'analyzer' is not defined

Analyzing GOOG
Error analyzing GOOG: name 'analyzer' is not defined

Analyzing META
Error analyzing META: name 'analyzer' is not defined

Analyzing MSFT
Error analyzing MSFT: name 'analyzer' is not defined

Analyzing NVDA
Error analyzing NVDA: name 'analyzer' is not defined

Analyzing TSLA
Error analyzing TSLA: name 'analyzer' is not defined


## Cross-Stock Analysis

Compare sentiment and correlation metrics across all stocks.

In [7]:
# Collect metrics for all stocks
metrics = []
for symbol in symbols:
    if symbol not in stock_data:
        continue
        
    try:
        summary = analyzer.get_sentiment_summary(symbol)
        correlation = analyzer.analyze_correlation(symbol)
        
        metrics.append({
            'Symbol': symbol,
            'Overall Sentiment': summary['overall_sentiment'],
            'Total Articles': summary['total_articles'],
            'Positive %': summary['sentiment_distribution']['positive'],
            'Neutral %': summary['sentiment_distribution']['neutral'],
            'Negative %': summary['sentiment_distribution']['negative'],
            'Correlation': correlation['overall_correlation'],
            'Best Lag': correlation['best_lag'],
            'Best Lag Correlation': correlation['best_lag_correlation']
        })
    except Exception as e:
        print(f"Error collecting metrics for {symbol}: {str(e)}")

if metrics:
    metrics_df = pd.DataFrame(metrics)
    metrics_df.set_index('Symbol', inplace=True)

    # Plot comparison metrics
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # Overall sentiment
    metrics_df['Overall Sentiment'].plot(kind='bar', ax=axes[0,0])
    axes[0,0].set_title('Overall Sentiment by Stock')
    axes[0,0].set_ylabel('Sentiment Score')

    # News distribution
    metrics_df[['Positive %', 'Neutral %', 'Negative %']].plot(kind='bar', ax=axes[0,1])
    axes[0,1].set_title('Sentiment Distribution by Stock')
    axes[0,1].set_ylabel('Percentage')

    # Correlation
    metrics_df['Correlation'].plot(kind='bar', ax=axes[1,0])
    axes[1,0].set_title('Sentiment-Return Correlation by Stock')
    axes[1,0].set_ylabel('Correlation Coefficient')

    # Best lag correlation
    metrics_df['Best Lag Correlation'].plot(kind='bar', ax=axes[1,1])
    axes[1,1].set_title('Best Lag Correlation by Stock')
    axes[1,1].set_ylabel('Correlation Coefficient')

    plt.tight_layout()
    plt.show()

    # Display metrics table
    display(metrics_df.round(3))
else:
    print("No metrics collected - check if sentiment analysis completed successfully")

Error collecting metrics for AAPL: name 'analyzer' is not defined
Error collecting metrics for AMZN: name 'analyzer' is not defined
Error collecting metrics for GOOG: name 'analyzer' is not defined
Error collecting metrics for META: name 'analyzer' is not defined
Error collecting metrics for MSFT: name 'analyzer' is not defined
Error collecting metrics for NVDA: name 'analyzer' is not defined
Error collecting metrics for TSLA: name 'analyzer' is not defined
No metrics collected - check if sentiment analysis completed successfully
