# Task 3: Correlation Analysis Between News Sentiment and Stock Movements
# Comprehensive Integration of Financial News and Stock Data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# NLP and Sentiment Analysis
import nltk
from textblob import TextBlob
from nltk.sentiment import SentimentIntensityAnalyzer
import re

# Statistical Analysis
from scipy.stats import pearsonr, spearmanr
from scipy import stats
import statsmodels.api as sm

# Download required NLTK data
# (the VADER lexicon is the resource SentimentIntensityAnalyzer loads; quiet=True hides the download log)
nltk.download('vader_lexicon', quiet=True)

# Set up plotting style
# ('seaborn-v0_8' is the style-sheet name used by recent matplotlib releases; "husl" is a seaborn colour palette)
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Banner confirming the import/setup cell ran to completion
print("=== CORRELATION ANALYSIS: NEWS SENTIMENT & STOCK MOVEMENTS ===")
print("All libraries imported successfully!")

=== CORRELATION ANALYSIS: NEWS SENTIMENT & STOCK MOVEMENTS ===
All libraries imported successfully!


2: Enhanced Sentiment Analysis for Financial Headlines

In [3]:

class FinancialSentimentAnalyzer:
    """
    Comprehensive sentiment analyzer tailored for financial news.

    Blends three signals into a single polarity score in [-1, 1]:

    * TextBlob polarity,
    * VADER compound score,
    * a small fixed nudge driven by finance-specific vocabulary.

    Attributes:
        sia: the VADER ``SentimentIntensityAnalyzer`` instance.
        positive_financial_terms: set of bullish lexicon words.
        negative_financial_terms: set of bearish lexicon words.

    The lexicon regexes are compiled once in ``__init__`` from the two term
    sets; if the sets are modified afterwards, rebuild ``_positive_pattern`` /
    ``_negative_pattern`` with ``_compile_term_pattern``.
    """

    # Inflectional endings tolerated after a lexicon term, so that "gains",
    # "surged" or "selling" still count while "against" or "submission" do not.
    _TERM_SUFFIX_PATTERN = r'(?:s|es|d|ed|ing)?'

    def __init__(self):
        self.sia = SentimentIntensityAnalyzer()
        # Financial lexicon enhancements
        self.positive_financial_terms = {
            'bullish', 'surge', 'rally', 'gain', 'profit', 'growth', 'beat', 'outperform',
            'upgrade', 'buy', 'strong', 'positive', 'optimistic', 'recovery'
        }
        self.negative_financial_terms = {
            'bearish', 'plunge', 'drop', 'loss', 'decline', 'miss', 'underperform',
            'downgrade', 'sell', 'weak', 'negative', 'pessimistic', 'crash', 'slump'
        }
        # Compile once: analyze_sentiment is applied to every headline.
        self._positive_pattern = self._compile_term_pattern(self.positive_financial_terms)
        self._negative_pattern = self._compile_term_pattern(self.negative_financial_terms)

    @classmethod
    def _compile_term_pattern(cls, terms):
        """Build a whole-word regex for ``terms``; return None for an empty collection."""
        if not terms:
            return None
        alternation = '|'.join(re.escape(term) for term in sorted(terms))
        return re.compile(r'\b(' + alternation + r')' + cls._TERM_SUFFIX_PATTERN + r'\b')

    @staticmethod
    def _count_distinct_terms(pattern, text):
        """Count how many different lexicon terms occur in ``text`` as whole words."""
        if pattern is None:
            return 0
        return len({match.group(1) for match in pattern.finditer(text)})

    def analyze_sentiment(self, text):
        """
        Perform comprehensive sentiment analysis using multiple methods.

        Args:
            text: headline text; non-string values are converted with ``str``.

        Returns:
            float in [-1.0, 1.0]; 0.0 for missing (None/NaN) or blank input.
        """
        if not isinstance(text, str):
            # pd.isna on a list/array returns an array, so only test scalars.
            if pd.api.types.is_scalar(text) and pd.isna(text):
                return 0.0
            text = str(text)

        text = text.lower()
        if not text.strip():
            return 0.0

        # Method 1: TextBlob sentiment
        try:
            textblob_score = TextBlob(text).sentiment.polarity
        except Exception:
            # Best effort: a TextBlob failure must not abort a bulk .apply().
            textblob_score = 0.0

        # Method 2: VADER sentiment (specifically trained for social media/text)
        vader_score = self.sia.polarity_scores(text)['compound']

        # Method 3: Financial term boosting.
        # Terms are matched on word boundaries; plain substring tests would
        # count 'gain' in "against" or 'miss' in "submission".
        positive_count = self._count_distinct_terms(self._positive_pattern, text)
        negative_count = self._count_distinct_terms(self._negative_pattern, text)

        if positive_count > negative_count:
            financial_boost = 0.1
        elif negative_count > positive_count:
            financial_boost = -0.1
        else:
            financial_boost = 0.0

        # Combined score (weighted average).
        # NOTE(review): the boost is +/-0.1 and is then weighted by 0.1, so it
        # moves the score by only +/-0.01 - confirm that this is intended.
        combined_score = (textblob_score * 0.4 + vader_score * 0.5 + financial_boost * 0.1)

        # Normalize to [-1, 1]
        return max(-1.0, min(1.0, combined_score))

    def get_sentiment_label(self, score):
        """Convert sentiment score to categorical label ('positive', 'negative' or 'neutral')."""
        if score > 0.1:
            return 'positive'
        elif score < -0.1:
            return 'negative'
        else:
            return 'neutral'

# Initialize sentiment analyzer
sentiment_analyzer = FinancialSentimentAnalyzer()

print("Financial Sentiment Analyzer initialized successfully!")

Financial Sentiment Analyzer initialized successfully!


3: Load and Prepare Datasets

In [5]:
# Load and Prepare Datasets for Correlation Analysis

def load_and_prepare_datasets(news_path='../data/raw_analyst_ratings.csv',
                              stock_dir='../data/yfinancedata',
                              stock_symbols=None):
    """
    Load both news and stock datasets and prepare for correlation analysis.

    Args:
        news_path: CSV file containing the news headlines.
        stock_dir: directory holding one ``<SYMBOL>.csv`` price file per stock.
        stock_symbols: iterable of ticker symbols to load; defaults to
            AAPL, AMZN, GOOG, META, MSFT and NVDA.

    Returns:
        ``(news_df, stock_data)`` where ``stock_data`` maps each symbol that
        was found to its price DataFrame (first CSV column as parsed index).
        ``(None, None)`` when the news file does not exist. Symbols whose
        file is missing are reported and skipped.
    """
    if stock_symbols is None:
        stock_symbols = ['AAPL', 'AMZN', 'GOOG', 'META', 'MSFT', 'NVDA']

    # Load news data (from Task 1)
    try:
        news_df = pd.read_csv(news_path)
    except FileNotFoundError:
        # Report the path that was actually attempted.
        print(f"‚úó News data file not found. Please ensure '{news_path}' exists.")
        return None, None

    # Drop index columns left behind by an earlier to_csv() ("Unnamed: 0", ...).
    unnamed_cols = news_df.columns[news_df.columns.str.contains('Unnamed', case=False)]
    news_df = news_df.drop(columns=unnamed_cols)
    print(f"‚úì News data loaded: {len(news_df):,} articles")

    # Load stock data (from Task 2)
    stock_data = {}
    for symbol in stock_symbols:
        try:
            stock_df = pd.read_csv(f'{stock_dir}/{symbol}.csv', index_col=0, parse_dates=True)
        except FileNotFoundError:
            print(f"‚úó {symbol} data file not found")
            continue
        stock_data[symbol] = stock_df
        print(f"‚úì {symbol} data loaded: {len(stock_df)} trading days")

    return news_df, stock_data

# Section banner for the loading step.
print("\n" + "="*60, "LOADING AND PREPARING DATASETS", "="*60, sep="\n")

# Read the news frame and the per-symbol price frames from disk.
news_df, stock_data = load_and_prepare_datasets()

if news_df is None or not stock_data:
    # Either the news file was missing or no price file could be read.
    print("‚ùå Failed to load datasets. Please check file paths and availability.")
else:
    # Headline figures for what was loaded.
    print(
        f"\nüìä Dataset Summary:",
        f"   ‚Ä¢ News Articles: {len(news_df):,}",
        f"   ‚Ä¢ Stocks Loaded: {len(stock_data)}",
        f"   ‚Ä¢ News Date Range: {news_df['date'].min()} to {news_df['date'].max()}",
        sep="\n",
    )

    # Display sample of news data
    print(f"\nSample News Headlines:")
    for i, headline in zip(range(5), news_df['headline'].head(5)):
        print(f"   {i+1}. {headline}")


LOADING AND PREPARING DATASETS
‚úì News data loaded: 1,407,328 articles
‚úì AAPL data loaded: 3774 trading days
‚úì AMZN data loaded: 3774 trading days
‚úì GOOG data loaded: 3774 trading days
‚úì META data loaded: 2923 trading days
‚úì MSFT data loaded: 3774 trading days
‚úì NVDA data loaded: 3774 trading days

üìä Dataset Summary:
   ‚Ä¢ News Articles: 1,407,328
   ‚Ä¢ Stocks Loaded: 6
   ‚Ä¢ News Date Range: 2009-02-14 00:00:00 to 2020-06-11 17:12:35-04:00

Sample News Headlines:
   1. Stocks That Hit 52-Week Highs On Friday
   2. Stocks That Hit 52-Week Highs On Wednesday
   3. 71 Biggest Movers From Friday
   4. 46 Stocks Moving In Friday's Mid-Day Session
   5. B of A Securities Maintains Neutral on Agilent Technologies, Raises Price Target to $88


4: Date Alignment and Data Integration

In [12]:
# Date Alignment and Data Integration

def align_and_integrate_data(news_df, stock_data):
    """
    Align news and stock data by dates and integrate for analysis.

    Args:
        news_df: DataFrame with at least 'date', 'headline' and 'stock' columns.
        stock_data: dict mapping symbol -> price DataFrame with a date-like
            index and a 'Close' column.

    Returns:
        dict mapping symbol -> DataFrame indexed by 'date_normalized' that
        holds the stock columns, 'daily_return', 'daily_return_abs', the
        per-day sentiment aggregates ('sentiment_mean', 'sentiment_count',
        'sentiment_std', 'article_count') and the forward returns
        'returns_tomorrow' / 'returns_next_week'. Only days present in both
        the news and the price data are kept; symbols without any
        overlapping day are omitted.

    Side effects:
        The caller's ``news_df['date']`` column is converted in place to
        timezone-aware UTC datetimes (behaviour retained from the original
        implementation). All other derived columns live on an internal copy.

    Notes:
        Sentiment is scored with the module-level ``sentiment_analyzer``.
        'returns_next_week' is the single-day return five trading rows
        ahead, not a cumulative five-day return.
    """
    # Convert news date to datetime and normalize
    print("Processing news data dates...")
    news_df['date'] = pd.to_datetime(
        news_df['date'],
        format='mixed',   # allow mixed formats
        utc=True,         # output in UTC
        errors='coerce'   # convert problematic dates to NaT instead of raising error
    )

    # Remove any rows with invalid dates; copy so that the new columns below
    # are written to an independent frame rather than to a slice of the input.
    news_df = news_df.dropna(subset=['date']).copy()
    # NOTE(review): the calendar day is taken in UTC, so an article stamped
    # late in the evening with a negative UTC offset lands on the following
    # day - confirm this is the intended alignment with the trading calendar.
    news_df['date_normalized'] = news_df['date'].dt.tz_localize(None).dt.normalize()
    print(f"‚úì News data processed: {len(news_df):,} articles with valid dates")

    # Perform sentiment analysis on all headlines
    print("Performing sentiment analysis on news headlines...")
    news_df['sentiment_score'] = news_df['headline'].apply(sentiment_analyzer.analyze_sentiment)
    news_df['sentiment_label'] = news_df['sentiment_score'].apply(sentiment_analyzer.get_sentiment_label)
    print("‚úì Sentiment analysis completed")

    forward_cols = ['returns_tomorrow', 'returns_next_week']

    # Create integrated dataset for each stock
    integrated_data = {}

    for symbol, stock_df in stock_data.items():
        print(f"\nIntegrating data for {symbol}...")

        # Ensure stock data has a sorted date index and a tz-naive calendar-day column
        stock_df = stock_df.copy()
        stock_df.index = pd.to_datetime(stock_df.index)
        stock_df = stock_df.sort_index()  # pct_change/shift below rely on chronological order
        trading_days = stock_df.index
        if trading_days.tz is not None:
            trading_days = trading_days.tz_localize(None)  # keep the local calendar day
        stock_df['date_normalized'] = trading_days.normalize()

        # Calculate daily returns
        stock_df['daily_return'] = stock_df['Close'].pct_change() * 100
        stock_df['daily_return_abs'] = stock_df['daily_return'].abs()

        # Forward returns are taken on the FULL trading calendar, before the
        # merge: shifting after the inner join would pick the next day that
        # happens to have news instead of the next trading day.
        stock_df['returns_tomorrow'] = stock_df['daily_return'].shift(-1)
        stock_df['returns_next_week'] = stock_df['daily_return'].shift(-5)

        # Filter news for this specific stock
        # First try exact symbol match
        stock_news = news_df[news_df['stock'] == symbol]

        if len(stock_news) == 0:
            print(f"   ‚ö† No direct news found for {symbol}, using all financial news...")
            # If no direct stock match, use all financial news
            stock_news = news_df
        else:
            print(f"   ‚úì Found {len(stock_news):,} articles specifically for {symbol}")

        if len(stock_news) == 0:
            print(f"   ‚úó No news data available for integration with {symbol}")
            continue

        # Aggregate daily sentiment
        daily_sentiment = (
            stock_news.groupby('date_normalized')
            .agg(
                sentiment_mean=('sentiment_score', 'mean'),
                sentiment_count=('sentiment_score', 'count'),
                sentiment_std=('sentiment_score', 'std'),
                article_count=('headline', 'count'),
            )
            .round(4)
            .reset_index()
        )

        # Reset stock_df index for merging
        stock_df_reset = stock_df.reset_index()

        print(f"   ‚Ä¢ Stock data dates: {stock_df_reset['date_normalized'].dtype}")
        print(f"   ‚Ä¢ Sentiment data dates: {daily_sentiment['date_normalized'].dtype}")

        # Merge with stock data using pd.merge
        merged_data = pd.merge(
            stock_df_reset,
            daily_sentiment,
            on='date_normalized',
            how='inner'  # Only keep dates that exist in both datasets
        )

        if len(merged_data) == 0:
            print(f"   ‚ö† No overlapping dates found between news and stock data for {symbol}")
            print(f"   ‚Ä¢ Stock date range: {stock_df_reset['date_normalized'].min()} to {stock_df_reset['date_normalized'].max()}")
            print(f"   ‚Ä¢ News date range: {daily_sentiment['date_normalized'].min()} to {daily_sentiment['date_normalized'].max()}")
            continue

        # Keep the forward-return columns last, as in the original layout
        leading_cols = [col for col in merged_data.columns if col not in forward_cols]
        merged_data = merged_data[leading_cols + forward_cols]

        # Set date as index for consistency
        merged_data = merged_data.set_index('date_normalized')

        integrated_data[symbol] = merged_data
        print(f"   ‚úì Integrated {len(merged_data)} days of data")
        print(f"   ‚úì News coverage: {merged_data['article_count'].sum():,} articles")
        print(f"   ‚úì Date range: {merged_data.index.min()} to {merged_data.index.max()}")

    return integrated_data

print("\n" + "="*60)
print("DATE ALIGNMENT AND DATA INTEGRATION")
print("="*60)

# Only attempt integration when both inputs were loaded; otherwise fall
# through to the diagnostics below instead of failing on a None frame.
if news_df is not None and stock_data:
    integrated_data = align_and_integrate_data(news_df, stock_data)
else:
    integrated_data = {}

if integrated_data:
    print(f"\n‚úÖ SUCCESSFULLY INTEGRATED DATA FOR {len(integrated_data)} STOCKS")

    # Display detailed summary for each stock
    for symbol, data in integrated_data.items():
        print(f"\nüìä {symbol} Integration Summary:")
        print(f"   ‚Ä¢ Integrated Days: {len(data):,}")
        print(f"   ‚Ä¢ Date Range: {data.index.min()} to {data.index.max()}")
        print(f"   ‚Ä¢ Total Articles: {data['article_count'].sum():,}")
        print(f"   ‚Ä¢ Average Articles/Day: {data['article_count'].mean():.1f}")
        print(f"   ‚Ä¢ Average Sentiment: {data['sentiment_mean'].mean():.3f}")
        print(f"   ‚Ä¢ Average Daily Return: {data['daily_return'].mean():.3f}%")

        # Check data quality
        missing_sentiment = data['sentiment_mean'].isna().sum()
        missing_returns = data['daily_return'].isna().sum()
        if missing_sentiment > 0 or missing_returns > 0:
            print(f"   ‚Ä¢ Data Quality: {missing_sentiment} missing sentiment, {missing_returns} missing returns")

    # Display sample data from first stock
    first_symbol = next(iter(integrated_data))
    sample_data = integrated_data[first_symbol]

    print(f"\n" + "="*50)
    print(f"SAMPLE INTEGRATED DATA FOR {first_symbol}")
    print("="*50)
    print(sample_data[['Close', 'daily_return', 'sentiment_mean', 'article_count']].head(10))

else:
    print("‚ùå No data integration successful. Checking data compatibility...")

    # Debug information
    # align_and_integrate_data adds 'date_normalized' only to the frame it
    # builds after dropping invalid dates, never to this global news_df, so
    # the calendar days are derived here from the 'date' column.
    news_dates = None
    if news_df is not None:
        news_dates = pd.to_datetime(
            news_df['date'], format='mixed', utc=True, errors='coerce'
        ).dropna().dt.date

        print(f"\nüì∞ News Data Info:")
        print(f"   ‚Ä¢ Total articles: {len(news_df):,}")
        print(f"   ‚Ä¢ Unique stocks in news: {news_df['stock'].nunique()}")
        print(f"   ‚Ä¢ Sample stocks: {news_df['stock'].value_counts().head(10).index.tolist()}")
        print(f"   ‚Ä¢ News date range: {news_dates.min()} to {news_dates.max()}")
        print(f"   ‚Ä¢ News date type: {news_dates.dtype}")

    # Calendar days per stock, computed once and reused by both reports below
    stock_dates = {}
    if stock_data:
        print(f"\nüìà Stock Data Info:")
        for symbol, data in stock_data.items():
            stock_dates[symbol] = pd.Series(pd.to_datetime(data.index).date)
            print(f"   ‚Ä¢ {symbol}: {len(data)} days, {stock_dates[symbol].min()} to {stock_dates[symbol].max()}")
            print(f"   ‚Ä¢ {symbol} date type: {stock_dates[symbol].dtype}")

    # Check for overlapping date ranges (needs at least one valid news date)
    if news_dates is not None and len(news_dates) > 0 and stock_data:
        print(f"\nüîç Checking for date range overlaps...")
        news_min_date = news_dates.min()
        news_max_date = news_dates.max()

        for symbol, symbol_dates in stock_dates.items():
            if len(symbol_dates) == 0:
                continue  # no price rows, nothing to compare
            stock_min_date = symbol_dates.min()
            stock_max_date = symbol_dates.max()

            overlap_start = max(news_min_date, stock_min_date)
            overlap_end = min(news_max_date, stock_max_date)

            if overlap_start <= overlap_end:
                print(f"   ‚Ä¢ {symbol}: OVERLAP FOUND - {overlap_start} to {overlap_end}")
            else:
                print(f"   ‚Ä¢ {symbol}: NO OVERLAP - News: {news_min_date}-{news_max_date}, Stock: {stock_min_date}-{stock_max_date}")


DATE ALIGNMENT AND DATA INTEGRATION
Processing news data dates...
‚úì News data processed: 1,407,328 articles with valid dates
Performing sentiment analysis on news headlines...
‚úì Sentiment analysis completed

Integrating data for AAPL...
   ‚úì Found 441 articles specifically for AAPL
   ‚Ä¢ Stock data dates: datetime64[ns]
   ‚Ä¢ Sentiment data dates: datetime64[ns]
   ‚úì Integrated 61 days of data
   ‚úì News coverage: 415 articles
   ‚úì Date range: 2020-03-09 00:00:00 to 2020-06-10 00:00:00

Integrating data for AMZN...
   ‚úì Found 278 articles specifically for AMZN
   ‚Ä¢ Stock data dates: datetime64[ns]
   ‚Ä¢ Sentiment data dates: datetime64[ns]
   ‚úì Integrated 28 days of data
   ‚úì News coverage: 265 articles
   ‚úì Date range: 2020-04-27 00:00:00 to 2020-06-10 00:00:00

Integrating data for GOOG...
   ‚úì Found 1,199 articles specifically for GOOG
   ‚Ä¢ Stock data dates: datetime64[ns]
   ‚Ä¢ Sentiment data dates: datetime64[ns]
   ‚úì Integrated 352 days of data
   