In [1]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr

# The project's modules live in ../src relative to this notebook.
sys.path.append('../src')
from task3_correlation_analysis import CorrelationAnalyzer

In [2]:

# Plot appearance for the whole notebook: the seaborn-flavoured matplotlib
# style sheet is applied first, then seaborn's "husl" colour palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Announce which task this notebook covers.
notebook_title = "Task 3: Correlation Analysis between News Sentiment and Stock Returns"
print(notebook_title)

Task 3: Correlation Analysis between News Sentiment and Stock Returns


In [None]:
# Load the raw analyst-ratings news file and show its basic structure.
# The path is relative to the notebook's working directory, in line with the
# '../src', '../data/processed' and '../results' paths used by the other
# cells, so the notebook no longer depends on one machine's C:/Users layout.
print("Loading news data...")
news_df = pd.read_csv('../data/raw_analyst_ratings.csv')
print("News data columns:", news_df.columns.tolist())
print("News data shape:", news_df.shape)
print("\nFirst few rows of news data:")
display(news_df.head())

Loading and preparing data...


In [None]:
# Load the NVDA price history and show its basic structure.
# The path is relative to the notebook's working directory, in line with the
# '../src', '../data/processed' and '../results' paths used by the other
# cells, so the notebook no longer depends on one machine's C:/Users layout.
print("Loading stock data...")
stock_df = pd.read_csv('../data/NVDA.csv')
print("Stock data columns:", stock_df.columns.tolist())
print("Stock data shape:", stock_df.shape)
print("\nFirst few rows of stock data:")
display(stock_df.head())

In [None]:
# Quick data-quality pass over both raw frames: per-column null counts,
# followed by the smallest/largest value of each date column.
print("Preprocessing data...")

# Null counts, one frame at a time (news first, then stock).
for null_heading, inspected_frame in (
    ("Missing values in news data:", news_df),
    ("\nMissing values in stock data:", stock_df),
):
    print(null_heading)
    print(inspected_frame.isnull().sum())

# min()/max() are taken on the columns exactly as loaded; no date parsing
# happens in this cell.
news_dates_raw = news_df['date']
print(f"\nNews date range: {news_dates_raw.min()} to {news_dates_raw.max()}")
stock_dates_raw = stock_df['Date']
print(f"Stock date range: {stock_dates_raw.min()} to {stock_dates_raw.max()}")

Loaded news data with 1407328 records
Loaded stock data with 3774 records
Error loading data: 'publication_date'


KeyError: 'publication_date'

In [None]:
# Build the correlation analyzer.
# Paths are relative to the notebook's working directory (like the '../src',
# '../data/processed' and '../results' paths used elsewhere in this notebook)
# instead of an absolute path that only exists on one Windows machine.
analyzer = CorrelationAnalyzer(
    news_data_path='../data/raw_analyst_ratings.csv',
    stock_data_path='../data/NVDA.csv'
)

# Hand the already-loaded frames to the analyzer. The news file calls its
# timestamp column 'date'; the later cells read it as 'publication_date',
# so it is renamed here (rename returns a new frame, news_df is unchanged).
analyzer.news_data = news_df.rename(columns={'date': 'publication_date'})
analyzer.stock_data = stock_df

print("Data loaded successfully!")
print(f"News records: {len(analyzer.news_data)}")
print(f"Stock records: {len(analyzer.stock_data)}")

['Date', 'Close', 'High', 'Low', 'Open', 'Volume']


In [None]:
# Step 1: let the analyzer normalize the date columns of both frames, then
# report the resulting date span of each one.
print("Normalizing dates...")
analyzer._normalize_dates()

news_dates = analyzer.news_data['publication_date']
print(f"News date range after normalization: {news_dates.min()} to {news_dates.max()}")

stock_dates = analyzer.stock_data['Date']
print(f"Stock date range after normalization: {stock_dates.min()} to {stock_dates.max()}")

In [None]:
# Step 2: score the headlines and summarise the resulting 'avg_sentiment'
# column of the frame returned by the analyzer.
print("Performing sentiment analysis...")
sentiment_data = analyzer.analyze_sentiment(text_column='headline')

print("Sentiment analysis results:")
display(sentiment_data.head(10))

daily_sentiment = sentiment_data['avg_sentiment']
print("\nSentiment statistics:")
# Each reducer is called only when its line is printed, in the listed order.
for caption, reducer in (
    ("Average sentiment", daily_sentiment.mean),
    ("Sentiment std", daily_sentiment.std),
    ("Min sentiment", daily_sentiment.min),
    ("Max sentiment", daily_sentiment.max),
):
    print(f"{caption}: {reducer():.3f}")

In [None]:
# Sentiment overview figure: histogram of the daily averages on the left,
# the same values plotted against their date on the right.
fig, (hist_ax, trend_ax) = plt.subplots(1, 2, figsize=(12, 4))

hist_ax.hist(sentiment_data['avg_sentiment'], bins=30, alpha=0.7, edgecolor='black')
hist_ax.set_xlabel('Average Daily Sentiment')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution of Daily Average Sentiment')
hist_ax.grid(True, alpha=0.3)

trend_ax.plot(
    sentiment_data['publication_date'],
    sentiment_data['avg_sentiment'],
    marker='o',
    linewidth=1,
    markersize=2,
)
trend_ax.set_xlabel('Date')
trend_ax.set_ylabel('Average Sentiment')
trend_ax.set_title('Sentiment Over Time')
# Tilt the date labels so they do not overlap.
trend_ax.tick_params(axis='x', labelrotation=45)
trend_ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Step 3: have the analyzer derive daily returns from the 'Close' column and
# summarise the 'daily_return' column of the frame it returns.
print("Computing daily returns...")
returns_data = analyzer.compute_daily_returns(price_column='Close')

print("Returns data sample:")
display(returns_data.head(10))

daily_returns = returns_data['daily_return']
print("\nReturns statistics:")
# Each reducer is called only when its line is printed, in the listed order.
for caption, reducer in (
    ("Average daily return", daily_returns.mean),
    ("Returns std", daily_returns.std),
    ("Min return", daily_returns.min),
    ("Max return", daily_returns.max),
):
    print(f"{caption}: {reducer():.3f}%")

In [None]:
# Returns overview figure: daily returns against their date on the left,
# histogram of the same values on the right.
fig, (trend_ax, hist_ax) = plt.subplots(1, 2, figsize=(12, 4))

trend_ax.plot(returns_data['Date'], returns_data['daily_return'], color='green', linewidth=1)
trend_ax.set_xlabel('Date')
trend_ax.set_ylabel('Daily Return (%)')
trend_ax.set_title('NVDA Daily Returns Over Time')
# Tilt the date labels so they do not overlap.
trend_ax.tick_params(axis='x', labelrotation=45)
trend_ax.grid(True, alpha=0.3)

hist_ax.hist(
    returns_data['daily_return'],
    bins=30,
    alpha=0.7,
    edgecolor='black',
    color='green',
)
hist_ax.set_xlabel('Daily Return (%)')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution of Daily Returns')
hist_ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Step 4: ask the analyzer for the combined sentiment/returns frame and
# report how many rows it has and which dates it spans.
print("Merging sentiment and returns data...")
merged_data = analyzer.merge_sentiment_returns()

print("Merged data sample:")
display(merged_data.head(10))

print("\nMerged dataset statistics:")
print(f"Total matching days: {len(merged_data)}")
matched_dates = merged_data['publication_date']
print(f"Date range: {matched_dates.min()} to {matched_dates.max()}")

In [None]:
# Step 5: compute the correlation metrics and list every entry of the
# returned mapping; floats are shown with four decimals, anything else as-is.
print("Calculating correlation coefficients...")
correlation_results = analyzer.calculate_correlation(merged_data)

print("Correlation Results:")
for metric, result in correlation_results.items():
    shown = f"{result:.4f}" if isinstance(result, float) else result
    print(f"{metric}: {shown}")

In [None]:
# Step 6: delegate the correlation figure to the analyzer, passing the path
# the image should be saved under.
print("Generating correlation visualizations...")
figure_path = '../results/task3_correlation_analysis.png'
analyzer.visualize_correlation(merged_data, save_path=figure_path)

In [None]:
# Additional analysis: sentiment vs. returns for the ten most prolific publishers.
#
# Notes on correctness:
#   * Daily returns are computed on the full, date-sorted price series BEFORE
#     joining with a publisher's news days. Calling pct_change() after the join
#     would measure the price change between two consecutive *news* days, which
#     can be several trading days apart and is therefore not a daily return.
#   * The cell needs a per-article 'sentiment' column on analyzer.news_data
#     (presumably added by analyze_sentiment - TODO confirm). If a required
#     column is missing, the cell says so instead of raising a KeyError.
#   * If no publisher has enough matching days, an empty table is reported
#     instead of failing inside sort_values('correlation').
news_articles = analyzer.news_data
required_columns = {'publisher', 'sentiment', 'publication_date'}
missing_columns = required_columns - set(news_articles.columns)

publisher_correlations = {}
publisher_corr_df = pd.DataFrame(
    columns=['correlation', 'p_value', 'articles', 'days_with_data']
)

if missing_columns:
    print(f"Skipping publisher analysis; missing columns: {sorted(missing_columns)}")
else:
    print("Analyzing sentiment by publisher...")

    # A publisher must have more than this many matching days to be included.
    min_days_required = 5

    # Day-over-day percentage change of Close on the complete price history.
    stock_returns = (
        analyzer.stock_data[['Date', 'Close']]
        .sort_values('Date')
        .assign(daily_return=lambda prices: prices['Close'].pct_change() * 100)
        .dropna(subset=['daily_return'])
    )

    # Get top publishers by article count
    top_publishers = news_articles['publisher'].value_counts().head(10).index.tolist()

    for publisher in top_publishers:
        publisher_news = news_articles[news_articles['publisher'] == publisher]
        # Average sentiment of this publisher's articles per publication date.
        publisher_sentiment = (
            publisher_news.groupby('publication_date')['sentiment'].mean().reset_index()
        )

        # Keep only dates that have both a sentiment value and a daily return.
        publisher_merged = pd.merge(
            publisher_sentiment,
            stock_returns,
            left_on='publication_date',
            right_on='Date',
            how='inner'
        ).dropna(subset=['sentiment', 'daily_return'])

        if len(publisher_merged) > min_days_required:
            corr, p_value = pearsonr(
                publisher_merged['sentiment'], publisher_merged['daily_return']
            )
            publisher_correlations[publisher] = {
                'correlation': corr,
                'p_value': p_value,
                'articles': len(publisher_news),
                'days_with_data': len(publisher_merged)
            }

    # Display publisher correlations
    if publisher_correlations:
        publisher_corr_df = (
            pd.DataFrame(publisher_correlations)
            .T
            .sort_values('correlation', ascending=False)
        )
        print("\nPublisher-specific correlations:")
        display(publisher_corr_df)
    else:
        print(
            f"\nNo publisher had more than {min_days_required} days with both "
            "sentiment and returns; nothing to report."
        )

In [None]:
# Section 6: recompute daily returns (this time with the analyzer's default
# price column), print summary statistics, show a sample, and plot both the
# distribution and the time series. Note that this rebinds `returns_data`.
print("Computing Daily Returns and Merging Data...")

returns_data = analyzer.compute_daily_returns()
daily_returns = returns_data['daily_return']

print("\nReturns Statistics:")
# Each reducer is called only when its line is printed, in the listed order.
for caption, reducer in (
    ("Average Daily Return", daily_returns.mean),
    ("Return Std Dev", daily_returns.std),
    ("Maximum Daily Return", daily_returns.max),
    ("Minimum Daily Return", daily_returns.min),
):
    print(f"{caption}: {reducer():.4f}%")

display(returns_data.head(10))

# Left panel: histogram; right panel: returns against their date.
fig, (hist_ax, trend_ax) = plt.subplots(1, 2, figsize=(12, 5))

hist_ax.hist(returns_data['daily_return'], bins=30, alpha=0.7, color='blue', edgecolor='black')
hist_ax.set_title('Distribution of Daily Returns')
hist_ax.set_xlabel('Daily Return (%)')
hist_ax.set_ylabel('Frequency')
hist_ax.grid(True, alpha=0.3)

trend_ax.plot(returns_data['Date'], returns_data['daily_return'], alpha=0.7)
trend_ax.set_title('Daily Returns Over Time')
trend_ax.set_xlabel('Date')
trend_ax.set_ylabel('Daily Return (%)')
trend_ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Step 7: the report itself is produced entirely by the analyzer; this cell
# only announces it and triggers the call.
progress_note = "Generating comprehensive report..."
print(progress_note)
analyzer.generate_report()

In [None]:
# Save the merged dataset and the correlation results for future use.
# DataFrame.to_csv does not create missing parent folders, so the target
# directories are created first; on a fresh checkout without
# ../data/processed or ../results the writes would otherwise fail.
print("Saving results...")

merged_csv_path = Path('../data/processed/sentiment_returns_merged.csv')
merged_csv_path.parent.mkdir(parents=True, exist_ok=True)
merged_data.to_csv(merged_csv_path, index=False)
print("Merged data saved to '../data/processed/sentiment_returns_merged.csv'")

# Save correlation results (one row, one column per metric)
correlation_csv_path = Path('../results/task3_correlation_results.csv')
correlation_csv_path.parent.mkdir(parents=True, exist_ok=True)
correlation_df = pd.DataFrame([analyzer.correlation_results])
correlation_df.to_csv(correlation_csv_path, index=False)
print("Correlation results saved to '../results/task3_correlation_results.csv'")

In [None]:
# Advanced analysis: rolling correlation between sentiment and returns.
#
# The correlation is computed with pandas' rolling window instead of a manual
# loop. This yields a Series that is always aligned with merged_sorted (NaN
# for the first window_size - 1 rows), so the cell also works when there are
# fewer rows than the window, where a hand-padded list would have the wrong
# length and the column assignment would raise.
print("Calculating rolling correlation...")

# Sort by date so that each window covers consecutive observations.
merged_sorted = merged_data.sort_values('publication_date').reset_index(drop=True)

# Window length in ROWS of the merged data, i.e. days that have both a
# sentiment value and a return; these need not be consecutive calendar days.
window_size = 30
rolling_corr = (
    merged_sorted['avg_sentiment']
    .rolling(window=window_size)
    .corr(merged_sorted['daily_return'])
    # A window with zero variance has no defined correlation; treat as missing.
    .replace([np.inf, -np.inf], np.nan)
)

# Add to dataframe
merged_sorted['rolling_correlation'] = rolling_corr

# Plot rolling correlation
plt.figure(figsize=(12, 6))
plt.plot(merged_sorted['publication_date'], merged_sorted['rolling_correlation'],
         linewidth=2, color='purple', alpha=0.8)
plt.axhline(y=0, color='red', linestyle='--', alpha=0.5)
plt.xlabel('Date')
plt.ylabel(f'Rolling Correlation ({window_size}-day window)')
plt.title('Rolling Correlation between News Sentiment and Stock Returns')
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Series.mean() skips NaN and returns NaN (without a warning) if no window
# could be evaluated.
print(f"Average rolling correlation: {rolling_corr.mean():.4f}")

In [None]:
# Summary and insights.
# Prints a plain-text recap of the data volumes, sentiment and return
# statistics, and the same-day correlation stored on the analyzer.
# The section icons and bullets are written as real Unicode characters; they
# had been stored as mis-decoded byte sequences and printed as gibberish.
banner = "=" * 70
print("\n" + banner)
print("TASK 3 COMPLETION SUMMARY")
print(banner)

print("\n📊 DATA OVERVIEW:")
print(f"   • News articles analyzed: {len(analyzer.news_data):,}")
print(f"   • Stock trading days: {len(analyzer.stock_data):,}")
print(f"   • Matching days for analysis: {len(merged_data):,}")

print("\n🔍 SENTIMENT ANALYSIS:")
print(f"   • Average daily sentiment: {sentiment_data['avg_sentiment'].mean():.3f}")
print(f"   • Sentiment volatility (std): {sentiment_data['avg_sentiment'].std():.3f}")

print("\n📈 STOCK PERFORMANCE:")
print(f"   • Average daily return: {returns_data['daily_return'].mean():.3f}%")
print(f"   • Return volatility (std): {returns_data['daily_return'].std():.3f}%")

print("\n📊 CORRELATION RESULTS:")
same_day_corr = analyzer.correlation_results['daily_return_correlation']
same_day_p_value = analyzer.correlation_results['daily_return_p_value']
is_significant = same_day_p_value < 0.05  # conventional 5% significance level

# Verbal label for |r|, using the cut-offs 0.1 / 0.3 / 0.5.
corr_strength = abs(same_day_corr)
if corr_strength < 0.1:
    strength = "negligible"
elif corr_strength < 0.3:
    strength = "weak"
elif corr_strength < 0.5:
    strength = "moderate"
else:
    strength = "strong"

print(f"   • Correlation strength: {strength} ({same_day_corr:.4f})")
print(f"   • Statistical significance: {'YES' if is_significant else 'NO'}")

print("\n💡 KEY INSIGHTS:")
if is_significant:
    if same_day_corr > 0:
        print("   • Positive news sentiment tends to correlate with higher stock returns")
    else:
        print("   • Positive news sentiment tends to correlate with lower stock returns")
else:
    print("   • No statistically significant relationship found between sentiment and returns")

print("\n🎯 RECOMMENDATIONS:")
print("   • Consider incorporating sentiment analysis in trading strategies")
print("   • Monitor specific publisher sentiment for more targeted insights")
print("   • Expand analysis to include other technical indicators")
print("   • Consider lagged effects for predictive modeling")

print("\n" + banner)
print("Task 3: Correlation Analysis - COMPLETED SUCCESSFULLY!")
print(banner)

In [None]:
# Section 9: regenerate the analyzer's correlation figure, then prepare a
# date-sorted copy of the merged data carrying a rolling correlation column
# for the custom plots.
print("Generating Comprehensive Visualizations...")
analyzer.visualize_correlation(merged_data, save_path='../results/task3_correlation_analysis.png')

print("\nGenerating Additional Custom Visualizations...")

# 1. Rolling correlation over time, using a window of 30 rows.
window_size = 30
merged_data_sorted = merged_data.sort_values('publication_date').copy()
sentiment_windows = merged_data_sorted['avg_sentiment'].rolling(window=window_size)
merged_data_sorted['rolling_corr'] = sentiment_windows.corr(merged_data_sorted['daily_return'])



In [None]:
# Two stacked panels: the rolling correlation on top, the raw sentiment and
# return series (drawn on one shared y-axis) underneath.
fig, (corr_ax, series_ax) = plt.subplots(2, 1, figsize=(12, 8))
plot_dates = merged_data_sorted['publication_date']

corr_ax.plot(plot_dates, merged_data_sorted['rolling_corr'], color='purple', linewidth=2)
# Zero reference line.
corr_ax.axhline(y=0, color='red', linestyle='--', alpha=0.5)
corr_ax.set_title(f'{window_size}-Day Rolling Correlation Between Sentiment and Returns')
corr_ax.set_ylabel('Rolling Correlation')
corr_ax.grid(True, alpha=0.3)

series_ax.plot(plot_dates, merged_data_sorted['avg_sentiment'], label='Sentiment', alpha=0.7)
series_ax.plot(plot_dates, merged_data_sorted['daily_return'], label='Returns', alpha=0.7)
series_ax.set_title('Sentiment and Returns Over Time')
series_ax.set_xlabel('Date')
series_ax.set_ylabel('Values')
series_ax.legend()
series_ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# 2. Sentiment vs Returns by sentiment categories
# Bucket the daily average sentiment into three bands:
#   Negative [-1, -0.1], Neutral (-0.1, 0.1], Positive (0.1, 1].
# include_lowest=True closes the first interval on the left; without it a
# day whose average sentiment is exactly -1 falls outside every bin and is
# labelled NaN (and then disappears from the box plot).
# NOTE(review): assumes sentiment scores lie in [-1, 1]; values outside
# that range are still labelled NaN - confirm against the scorer used.
merged_data['sentiment_category'] = pd.cut(
    merged_data['avg_sentiment'],
    bins=[-1, -0.1, 0.1, 1],
    labels=['Negative', 'Neutral', 'Positive'],
    include_lowest=True,
)

# Distribution of daily returns within each sentiment band.
plt.figure(figsize=(10, 6))
sns.boxplot(data=merged_data, x='sentiment_category', y='daily_return')
plt.title('Daily Returns by Sentiment Category')
plt.xlabel('Sentiment Category')
plt.ylabel('Daily Return (%)')
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# 3. Scatter of daily return against daily sentiment with seaborn's fitted
# regression line; the same-day correlation is quoted in the title.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    data=merged_data,
    x='avg_sentiment',
    y='daily_return',
    scatter_kws={'alpha': 0.5},
    line_kws={'color': 'red'},
    ax=ax,
)
same_day_r = correlation_results["daily_return_correlation"]
ax.set_title(f'Sentiment vs Daily Returns (Correlation: {same_day_r:.3f})')
ax.set_xlabel('Average Daily Sentiment')
ax.set_ylabel('Daily Return (%)')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Section 10: Advanced Analysis - Lagged Effects
# Correlates today's sentiment with the return observed `lag` rows later
# (lag 0 = same day).
#
# Notes on correctness:
#   * The data is sorted by date first; shift() works on row order, so on an
#     unsorted frame "future" returns would be paired with arbitrary days.
#   * Only the two columns being correlated are NaN-filtered. A blanket
#     dropna() would also discard rows whose NaN sits in an unrelated column
#     (e.g. an unassigned 'sentiment_category'), silently shrinking the sample.
#   * Lag 0 goes through the same filtering as the other lags.
#   * pearsonr needs at least two observations; fewer yields NaN results.
#   * A lag counts ROWS of the merged data (days that have both sentiment and
#     a return), which need not be consecutive calendar days.
print("Analyzing Lagged Effects...")

lags = range(0, 6)
min_observations = 2  # minimum sample size accepted by pearsonr

lag_base = (
    merged_data[['publication_date', 'avg_sentiment', 'daily_return']]
    .sort_values('publication_date')
    .reset_index(drop=True)
)

lag_correlations = []
for lag in lags:
    # shift(-lag) pulls the return from `lag` rows ahead onto the current row.
    paired = pd.DataFrame({
        'avg_sentiment': lag_base['avg_sentiment'],
        'future_return': lag_base['daily_return'].shift(-lag),
    }).dropna()

    if len(paired) >= min_observations:
        corr, p_val = pearsonr(paired['avg_sentiment'], paired['future_return'])
    else:
        corr, p_val = (np.nan, np.nan)

    lag_correlations.append({
        'lag_days': lag,
        'correlation': corr,
        'p_value': p_val,
        # Plain bool so the column is boolean; an undefined p-value is never significant.
        'significant': bool(p_val < 0.05) if not np.isnan(p_val) else False
    })


In [None]:
# Tabulate the per-lag results and draw them as a bar chart: one bar per lag,
# red when the lag was flagged significant and blue otherwise, each bar
# labelled with its coefficient.
lag_results = pd.DataFrame(lag_correlations)

print("Lagged Correlation Analysis:")
display(lag_results)

bar_colors = ['red' if sig else 'blue' for sig in lag_results['significant']]

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(lag_results['lag_days'], lag_results['correlation'], color=bar_colors, alpha=0.7)

# Zero reference line.
ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)

ax.set_title('Correlation Between Sentiment and Future Returns (Lagged Analysis)')
ax.set_xlabel('Lag (Days)')
ax.set_ylabel('Correlation Coefficient')
ax.set_xticks(lags)
ax.grid(True, alpha=0.3)

# Value labels: just above positive bars, just below all others.
for bar, corr in zip(bars, lag_results['correlation']):
    bar_height = bar.get_height()
    points_up = bar_height > 0
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        bar_height + (0.01 if points_up else -0.01),
        f'{corr:.3f}',
        ha='center',
        va='bottom' if points_up else 'top',
    )

fig.tight_layout()
plt.show()

In [None]:

# Section 11: Save Results and Generate Final Report
# Writes the merged data, correlation metrics and lag table to CSV, then
# prints a plain-text report of the same-day (and, if available, next-day)
# correlation.
#
# Notes on correctness:
#   * Output folders are created first; DataFrame.to_csv does not create them.
#   * The negative-correlation branch now states what a negative coefficient
#     means (positive sentiment alongside negative returns); the previous
#     wording described a positive relationship.
#   * The lagged metrics are optional: they are read with .get() so a result
#     mapping without them skips that section instead of raising KeyError.
#   * Status icons are written as real Unicode characters; they had been
#     stored as mis-decoded byte sequences and printed as gibberish.
print("Saving Results and Generating Final Report...")

for output_dir in ('../data/processed', '../results'):
    Path(output_dir).mkdir(parents=True, exist_ok=True)

# Save the merged dataset
merged_data.to_csv('../data/processed/sentiment_returns_merged.csv', index=False)
print("Saved merged dataset to: ../data/processed/sentiment_returns_merged.csv")

# Save correlation results (one row, one column per metric)
correlation_df = pd.DataFrame([correlation_results])
correlation_df.to_csv('../results/task3_correlation_results.csv', index=False)
print("Saved correlation results to: ../results/task3_correlation_results.csv")

# Save lag analysis results
lag_results.to_csv('../results/task3_lag_analysis.csv', index=False)
print("Saved lag analysis results to: ../results/task3_lag_analysis.csv")

# Generate final comprehensive report
report_banner = "=" * 60
print("\n" + report_banner)
print("TASK 3: FINAL CORRELATION ANALYSIS REPORT")
print(report_banner)

# to_datetime is a no-op for datetime columns and makes strftime safe if the
# column holds plain date objects or strings.
analysis_dates = pd.to_datetime(merged_data['publication_date'])

print("\nDATA OVERVIEW:")
print(f"- News articles analyzed: {len(analyzer.news_data)}")
print(f"- Trading days analyzed: {len(analyzer.stock_data)}")
print(f"- Matching days with both sentiment and returns: {len(merged_data)}")
print(f"- Analysis period: {analysis_dates.min().strftime('%Y-%m-%d')} to {analysis_dates.max().strftime('%Y-%m-%d')}")

same_day_corr = correlation_results['daily_return_correlation']
same_day_p_value = correlation_results['daily_return_p_value']
same_day_significant = same_day_p_value < 0.05  # conventional 5% level

print("\nKEY FINDINGS:")
print(f"1. Primary Correlation (Same Day): {same_day_corr:.4f}")
print(f"   Statistical Significance: {'Yes' if same_day_significant else 'No'}")

lagged_corr = correlation_results.get('lagged_correlation', np.nan)
lagged_p_value = correlation_results.get('lagged_p_value', np.nan)
if pd.notna(lagged_corr):
    print(f"2. Predictive Correlation (Next Day): {lagged_corr:.4f}")
    # A missing p-value compares False and is reported as not significant.
    print(f"   Statistical Significance: {'Yes' if pd.notna(lagged_p_value) and lagged_p_value < 0.05 else 'No'}")

print("\nINTERPRETATION:")
if abs(same_day_corr) > 0.3 and same_day_significant:
    print("✅ STRONG EVIDENCE of relationship between news sentiment and stock returns")
elif abs(same_day_corr) > 0.1 and same_day_significant:
    print("⚠️ MODERATE EVIDENCE of relationship between news sentiment and stock returns")
else:
    print("❕ WEAK or NO EVIDENCE of relationship between news sentiment and stock returns")

print("\nRECOMMENDATIONS:")
if same_day_significant:
    if same_day_corr > 0:
        print("- Positive news sentiment tends to correlate with positive stock returns")
        print("- Consider incorporating sentiment analysis in trading strategies")
    else:
        print("- Positive news sentiment tends to correlate with negative stock returns")
        print("- Sentiment could be used as a contrarian indicator")
else:
    print("- No statistically significant relationship found")
    print("- News sentiment may not be a reliable indicator for this dataset/time period")

print("\nFILES GENERATED:")
print("1. ../data/processed/sentiment_returns_merged.csv - Merged dataset")
print("2. ../results/task3_correlation_results.csv - Correlation coefficients")
print("3. ../results/task3_lag_analysis.csv - Lagged analysis results")
print("4. ../results/task3_correlation_analysis.png - Comprehensive visualizations")

print("\nTask 3 Correlation Analysis Completed Successfully! 🎉")