# =============================================================================
# TASK 3: CORRELATION BETWEEN NEWS AND STOCK MOVEMENT
# =============================================================================

In [2]:
# =============================================================================
# TASK 3: CORRELATION BETWEEN NEWS AND STOCK MOVEMENT
# =============================================================================

# Cosmetic banner marking the start of the Task 3 notebook section.
print(" INITIALIZING TASK 3: News Sentiment & Stock Correlation Analysis")
print("=" * 70)

# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# NOTE: this silences every warning category for the rest of the session,
# including pandas deprecation and chained-assignment warnings, so problems
# those warnings would normally surface will go unnoticed.
warnings.filterwarnings('ignore')

# Import sentiment analysis libraries
from textblob import TextBlob
# os is used by load_stock_data to build the per-symbol CSV path.
import os

print(" All core libraries imported successfully!")

 INITIALIZING TASK 3: News Sentiment & Stock Correlation Analysis
 All core libraries imported successfully!


DEFINE CUSTOM FUNCTIONS

In [None]:
# =============================================================================
# DEFINE CUSTOM FUNCTIONS 
# =============================================================================

print(" Defining custom data loading and sentiment analysis functions...")

def load_news_data(filepath='../data/raw_analyst_ratings.csv'):
    """
    Read the financial news CSV and parse its timestamps.

    Args:
        filepath: Location of the news CSV. The file must contain a
            'date' column; a missing column raises KeyError.

    Returns:
        pandas.DataFrame: Every column from the CSV, with 'date' replaced
        by timezone-aware UTC timestamps.
    """
    news = pd.read_csv(filepath)

    # format='mixed' lets pandas infer the format of each value on its own
    # (the column mixes timestamp styles); utc=True converts everything to
    # UTC so the resulting column has a single timezone.
    parsed_dates = pd.to_datetime(news['date'], format='mixed', utc=True)
    news['date'] = parsed_dates

    return news

def load_stock_data(symbol, data_dir='../data/Data/'):
    """
    Read the price history CSV for one ticker symbol.

    Args:
        symbol: Ticker used as the file stem; '<symbol>.csv' is read.
        data_dir: Directory that holds the per-symbol CSV files.

    Returns:
        pandas.DataFrame: Every column from the CSV, with 'Date' parsed
        by pandas.to_datetime (no timezone conversion is requested).
    """
    csv_name = f'{symbol}.csv'
    prices = pd.read_csv(os.path.join(data_dir, csv_name))

    # Replace the raw date strings with parsed timestamps in place.
    prices['Date'] = pd.to_datetime(prices['Date'])
    return prices

def analyze_sentiment(text) -> float:
    """
    Analyze sentiment of a financial headline using TextBlob.

    Args:
        text: Headline text. Any value that is not a ``str`` (None, NaN,
            numbers, lists, ...) is treated as a missing headline.

    Returns:
        float: Polarity score between -1 (negative) and 1 (positive);
        0.0 when ``text`` is not a string.
    """
    # Missing values (None / float NaN / pd.NA) are never str instances, so
    # this single check covers them. A separate pd.isna() test on a value
    # already known to be a str could never be true.
    if not isinstance(text, str):
        return 0.0

    return TextBlob(text).sentiment.polarity

def get_sentiment_label(score, *, threshold=0.1) -> str:
    """
    Convert a sentiment polarity score to a categorical label.

    Args:
        score: Polarity score, e.g. the value returned by
            analyze_sentiment.
        threshold: Half-width of the neutral band around zero. Defaults
            to 0.1, the value that was previously hard-coded.

    Returns:
        str: 'positive' when ``score > threshold``, 'negative' when
        ``score < -threshold``, otherwise 'neutral'. Both boundary values
        and NaN fall through to 'neutral' because the comparisons are
        strict.

    Raises:
        ValueError: If ``threshold`` is negative, which would make the
            positive and negative ranges overlap.
    """
    if threshold < 0:
        raise ValueError(f"threshold must be non-negative, got {threshold!r}")

    if score > threshold:
        return 'positive'
    if score < -threshold:
        return 'negative'
    return 'neutral'

print(" Custom functions defined successfully!")

 Defining custom data loading and sentiment analysis functions...
 Custom functions defined successfully!


In [5]:
# =============================================================================
# STEP 2: LOAD AND PREPARE DATASETS
# =============================================================================

print("\n STEP 2: LOADING AND PREPARING DATASETS")
print("-" * 50)

# Load news data
# load_news_data() is called with its default path
# ('../data/raw_analyst_ratings.csv'); it returns the 'date' column parsed
# as timezone-aware UTC timestamps.
print(" Loading news data...")
news_df = load_news_data()
print(f"    News data loaded: {len(news_df):,} articles")

# Load stock data (using AAPL as primary example)
# Reads 'AAPL.csv' from load_stock_data's default data_dir
# ('../data/Data/'). Its 'Date' column is parsed without utc=True, so the
# two frames do not share a timezone convention at this point.
print(" Loading stock data...")
stock_df = load_stock_data('AAPL')
print(f"    Stock data loaded: {len(stock_df):,} trading days")

# Display basic info
# news_df still holds articles for every ticker here; nothing has been
# filtered to AAPL yet, so the news shape and date range are file-wide.
print(f"\n DATASET OVERVIEW:")
print(f"   • News data shape: {news_df.shape}")
print(f"   • Stock data shape: {stock_df.shape}")
print(f"   • News date range: {news_df['date'].min()} to {news_df['date'].max()}")
print(f"   • Stock date range: {stock_df['Date'].min()} to {stock_df['Date'].max()}")

# Show sample data
# These selections require the columns 'headline', 'date', 'stock' in the
# news CSV and 'Date', 'Open', 'Close', 'Volume' in the stock CSV; a missing
# column raises KeyError.
print(f"\n NEWS DATA SAMPLE:")
print(news_df[['headline', 'date', 'stock']].head(3))
print(f"\n STOCK DATA SAMPLE:")
print(stock_df[['Date', 'Open', 'Close', 'Volume']].head(3))


 STEP 2: LOADING AND PREPARING DATASETS
--------------------------------------------------
 Loading news data...
    News data loaded: 1,407,328 articles
 Loading stock data...
    Stock data loaded: 3,774 trading days

 DATASET OVERVIEW:
   • News data shape: (1407328, 6)
   • Stock data shape: (3774, 6)
   • News date range: 2009-02-14 00:00:00+00:00 to 2020-06-11 21:12:35+00:00
   • Stock date range: 2009-01-02 00:00:00 to 2023-12-29 00:00:00

 NEWS DATA SAMPLE:
                                     headline                      date stock
0     Stocks That Hit 52-Week Highs On Friday 2020-06-05 14:30:54+00:00     A
1  Stocks That Hit 52-Week Highs On Wednesday 2020-06-03 14:45:20+00:00     A
2               71 Biggest Movers From Friday 2020-05-26 08:30:07+00:00     A

 STOCK DATA SAMPLE:
        Date      Open     Close      Volume
0 2009-01-02  2.575630  2.721686   746015200
1 2009-01-05  2.794266  2.836553  1181608400
2 2009-01-06  2.877641  2.789767  1289310400


In [6]:
# =============================================================================
# STEP 3: DATE ALIGNMENT AND NORMALIZATION
# =============================================================================

print("\n STEP 3: DATE ALIGNMENT AND NORMALIZATION")
print("-" * 55)

# Convert dates to consistent format and extract date-only
print(" Converting and aligning dates...")

# News data date processing.
# load_news_data parses 'date' as timezone-aware UTC; tz_localize(None) drops
# the timezone while keeping the UTC wall-clock time, so 'date_only' is the
# UTC calendar date of each article.
# NOTE(review): an article published late in the US evening lands on the
# next UTC day. If 'date_only' is meant to match the exchange's trading day,
# converting to the exchange timezone first may be intended - confirm.
news_df['date_normalized'] = pd.to_datetime(news_df['date']).dt.tz_localize(None)
news_df['date_only'] = news_df['date_normalized'].dt.date

# Stock data date processing
stock_df['date_only'] = pd.to_datetime(stock_df['Date']).dt.date

# These ranges are reported before the AAPL filter, so the news range covers
# every ticker in the file.
print(f"    News dates normalized: {news_df['date_only'].min()} to {news_df['date_only'].max()}")
print(f"    Stock dates normalized: {stock_df['date_only'].min()} to {stock_df['date_only'].max()}")

# Filter news for AAPL only to match our stock data.
# .copy() makes the filtered frame independent of the full news frame, so
# columns added to it later are plain assignments rather than writes to a
# selection of another frame (the warning pandas would emit for that is
# silenced globally in this notebook).
# NOTE: news_df is rebound here, so re-running this cell reports the already
# filtered count as the "initial" count.
initial_news_count = len(news_df)
news_df = news_df[news_df['stock'] == 'AAPL'].copy()
filtered_news_count = len(news_df)

print(f"\n FILTERING FOR AAPL NEWS:")
print(f"   • Initial news articles: {initial_news_count:,}")
print(f"   • AAPL-specific articles: {filtered_news_count:,}")
print(f"   • Filtered out: {initial_news_count - filtered_news_count:,} articles")

# Check date overlap
news_dates = set(news_df['date_only'])
stock_dates = set(stock_df['date_only'])
overlap_dates = news_dates.intersection(stock_dates)

# Guard the ratio so an empty stock frame reports 0.0% instead of raising
# ZeroDivisionError.
coverage_pct = len(overlap_dates) / len(stock_dates) * 100 if stock_dates else 0.0

print(f"\n DATE OVERLAP ANALYSIS:")
print(f"   • Unique news dates: {len(news_dates)}")
print(f"   • Unique stock dates: {len(stock_dates)}")
print(f"   • Overlapping dates: {len(overlap_dates)}")
print(f"   • Coverage: {coverage_pct:.1f}% of trading days have news")

# Show date range comparison.
# min()/max() raise ValueError on an empty set (e.g. no articles matched the
# ticker filter), so fall back to a placeholder in that case.
news_range = f"{min(news_dates)} to {max(news_dates)}" if news_dates else "n/a (no articles)"
stock_range = f"{min(stock_dates)} to {max(stock_dates)}" if stock_dates else "n/a (no trading days)"

print(f"\n DATE RANGE COMPARISON:")
print(f"   News range:  {news_range}")
print(f"   Stock range: {stock_range}")


 STEP 3: DATE ALIGNMENT AND NORMALIZATION
-------------------------------------------------------
 Converting and aligning dates...
    News dates normalized: 2009-02-14 to 2020-06-11
    Stock dates normalized: 2009-01-02 to 2023-12-29

 FILTERING FOR AAPL NEWS:
   • Initial news articles: 1,407,328
   • AAPL-specific articles: 441
   • Filtered out: 1,406,887 articles

 DATE OVERLAP ANALYSIS:
   • Unique news dates: 80
   • Unique stock dates: 3774
   • Overlapping dates: 61
   • Coverage: 1.6% of trading days have news

 DATE RANGE COMPARISON:
   News range:  2020-03-09 to 2020-06-10
   Stock range: 2009-01-02 to 2023-12-29
