In [1]:
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from textblob import TextBlob
from scipy.stats import pearsonr

print("Correlation Analysis - Implementing Assessment Criteria")

Correlation Analysis - Implementing Assessment Criteria


In [2]:
# Load and prepare data for correlation analysis
print("Loading news and stock data...")

# Load news data with correct columns
news_df = pd.read_csv(r'C:\Users\admin\finaniacal-week1\data\raw_analyst_ratings.csv')
print(f"News data loaded: {len(news_df)} records")
print("News columns:", news_df.columns.tolist())

# Load stock data with correct columns  
stock_df = pd.read_csv(r'C:\Users\admin\finaniacal-week1\data\NVDA.csv')
print(f"Stock data loaded: {len(stock_df)} records")
print("Stock columns:", stock_df.columns.tolist())

# Display sample data
print("\nNews data sample:")
display(news_df[['headline', 'date', 'publisher']].head())

print("\nStock data sample:")
display(stock_df[['Date', 'Close']].head())

Loading news and stock data...
News data loaded: 1407328 records
News columns: ['Unnamed: 0', 'headline', 'url', 'publisher', 'date', 'stock']
Stock data loaded: 3774 records
Stock columns: ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']

News data sample:


Unnamed: 0,headline,date,publisher
0,Stocks That Hit 52-Week Highs On Friday,2020-06-05 10:30:54-04:00,Benzinga Insights
1,Stocks That Hit 52-Week Highs On Wednesday,2020-06-03 10:45:20-04:00,Benzinga Insights
2,71 Biggest Movers From Friday,2020-05-26 04:30:07-04:00,Lisa Levin
3,46 Stocks Moving In Friday's Mid-Day Session,2020-05-22 12:45:06-04:00,Lisa Levin
4,B of A Securities Maintains Neutral on Agilent...,2020-05-22 11:38:59-04:00,Vick Meyer



Stock data sample:


Unnamed: 0,Date,Close
0,2009-01-02,0.199652
1,2009-01-05,0.203319
2,2009-01-06,0.210196
3,2009-01-07,0.197589
4,2009-01-08,0.192546


In [3]:
print("Normalizing dates between news and stock data...")

# Force to string
news_df['date'] = news_df['date'].astype(str)

# Convert to datetime
news_df['date_normalized'] = pd.to_datetime(
    news_df['date'],
    errors='coerce'
)

# Normalize
news_df['date_normalized'] = news_df['date_normalized'].dt.floor("D")


# STOCK DATA 
stock_df['Date'] = stock_df['Date'].astype(str)

stock_df['date_normalized'] = pd.to_datetime(
    stock_df['Date'],
    errors='coerce'
)

stock_df['date_normalized'] = stock_df['date_normalized'].dt.floor("D")


print("Date normalization completed:")
print(f"News date range: {news_df['date_normalized'].min()} to {news_df['date_normalized'].max()}")
print(f"Stock date range: {stock_df['date_normalized'].min()} to {stock_df['date_normalized'].max()}")


Normalizing dates between news and stock data...
Date normalization completed:
News date range: 2011-04-27 00:00:00-04:00 to 2020-06-11 00:00:00-04:00
Stock date range: 2009-01-02 00:00:00 to 2023-12-29 00:00:00


In [None]:
#Performing sentiment analysis on news headlines
print("Performing sentiment analysis on news headlines...")

def calculate_sentiment(text):
    """Calculate sentiment polarity using TextBlob"""
    try:
        return TextBlob(str(text)).sentiment.polarity
    except:
        return 0.0

# Apply sentiment analysis to each headline
news_df['sentiment'] = news_df['headline'].apply(calculate_sentiment)

print("Sentiment analysis completed")
print(f"Sentiment statistics:")
print(f"  Average sentiment: {news_df['sentiment'].mean():.4f}")
print(f"  Min sentiment: {news_df['sentiment'].min():.4f}")
print(f"  Max sentiment: {news_df['sentiment'].max():.4f}")
print(f"  Std sentiment: {news_df['sentiment'].std():.4f}")

# Display headlines with sentiment scores
print("\nSample headlines with sentiment scores:")
sample_news = news_df[['headline', 'sentiment']].head(10)
display(sample_news)

Performing sentiment analysis on news headlines...


In [None]:
# Calculate daily average sentiment
print("Calculating daily average sentiment...")

daily_sentiment = news_df.groupby('date_normalized')['sentiment'].agg([
    ('avg_sentiment', 'mean'),
    ('article_count', 'count')
]).reset_index()

print(f"Daily sentiment calculated for {len(daily_sentiment)} days")
print("\nDaily sentiment sample:")
display(daily_sentiment.head(10))

In [None]:
#Computing daily returns
print("Computing daily returns for stock data...")

# Sort stock data by date
stock_df = stock_df.sort_values('date_normalized')

# Calculate daily percentage returns
stock_df['daily_return'] = stock_df['Close'].pct_change() * 100

# Remove first row with NaN return
stock_returns = stock_df.dropna(subset=['daily_return'])

print("Daily returns computation completed")
print(f"Returns calculated for {len(stock_returns)} trading days")
print(f"Returns statistics:")
print(f"  Average daily return: {stock_returns['daily_return'].mean():.4f}%")
print(f"  Min return: {stock_returns['daily_return'].min():.4f}%")
print(f"  Max return: {stock_returns['daily_return'].max():.4f}%")
print(f"  Std return: {stock_returns['daily_return'].std():.4f}%")

print("\nStock returns sample:")
display(stock_returns[['date_normalized', 'Close', 'daily_return']].head(10))

In [None]:
# Merge sentiment and returns data for correlation analysis
print("Merging sentiment and returns data...")

# Inner join on normalized dates
merged_data = pd.merge(
    daily_sentiment,
    stock_returns[['date_normalized', 'daily_return', 'Close']],
    on='date_normalized',
    how='inner'
)

print(f"Merged dataset created: {len(merged_data)} matching days")
print(f"Date range in merged data: {merged_data['date_normalized'].min()} to {merged_data['date_normalized'].max()}")

print("\nMerged data sample:")
display(merged_data.head(10))

In [None]:
#Calculating Pearson correlation coefficient
print("Calculating Pearson correlation coefficient...")

# Calculate Pearson correlation between average sentiment and daily returns
correlation, p_value = pearsonr(
    merged_data['avg_sentiment'], 
    merged_data['daily_return']
)

print("PEARSON CORRELATION RESULTS:")
print(f"Correlation Coefficient: {correlation:.4f}")
print(f"P-value: {p_value:.4f}")
print(f"Sample Size: {len(merged_data)} days")

# Interpret correlation strength
corr_strength = abs(correlation)
if corr_strength < 0.1:
    strength = "negligible"
elif corr_strength < 0.3:
    strength = "weak"
elif corr_strength < 0.5:
    strength = "moderate"
else:
    strength = "strong"

print(f"Correlation Strength: {strength}")

# Interpret statistical significance
if p_value < 0.05:
    print("Statistical Significance: SIGNIFICANT (p < 0.05)")
else:
    print("Statistical Significance: NOT SIGNIFICANT (p >= 0.05)")

In [None]:
# Create visualization to show correlation
print("Creating correlation visualization...")

plt.figure(figsize=(10, 6))

# Scatter plot
plt.scatter(merged_data['avg_sentiment'], merged_data['daily_return'], 
           alpha=0.6, s=50, color='blue')

# Add trend line
z = np.polyfit(merged_data['avg_sentiment'], merged_data['daily_return'], 1)
p = np.poly1d(z)
plt.plot(merged_data['avg_sentiment'], p(merged_data['avg_sentiment']), 
         "r--", linewidth=2, alpha=0.8, label='Trend line')

# Plot formatting
plt.xlabel('Average Daily Sentiment', fontsize=12)
plt.ylabel('Daily Returns (%)', fontsize=12)
plt.title(f'Correlation between News Sentiment and Stock Returns\nPearson r = {correlation:.4f} (p = {p_value:.4f})', 
          fontsize=14, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.legend()

# Add correlation annotation
plt.annotate(f'r = {correlation:.4f}\np = {p_value:.4f}\nn = {len(merged_data)}', 
             xy=(0.05, 0.95), xycoords='axes fraction',
             bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8),
             fontsize=10, ha='left', va='top')

plt.tight_layout()
plt.show()

print("Correlation visualization completed")

In [None]:
# Save results and visualization
print("Saving results...")

# Save merged data
merged_data.to_csv('../data/correlation_results.csv', index=False)
print("‚úì Merged data saved to: ../data/correlation_results.csv")

# Save visualization
plt.savefig('../results/correlation_analysis.png', dpi=300, bbox_inches='tight')
print("‚úì Visualization saved to: ../results/correlation_analysis.png")

# Save correlation results summary
correlation_summary = pd.DataFrame({
    'metric': ['pearson_correlation', 'p_value', 'sample_size', 'date_range_start', 'date_range_end'],
    'value': [correlation, p_value, len(merged_data), 
              merged_data['date_normalized'].min(), 
              merged_data['date_normalized'].max()]
})
correlation_summary.to_csv('../results/correlation_summary.csv', index=False)
print("‚úì Correlation summary saved to: ../results/correlation_summary.csv")

In [None]:
# Completion Report
print("\n" + "="*70)
print("CORRELATION ANALYSIS - COMPLETION REPORT")
print("="*70)

print("\n‚úÖ CRITERIA IMPLEMENTATION STATUS:")
print("‚úì Normalizing dates between news and stock data - COMPLETED")
print("‚úì Performing sentiment analysis on news headlines - COMPLETED") 
print("‚úì Computing daily returns - COMPLETED")
print("‚úì Calculating Pearson correlation coefficient - COMPLETED")

print(f"\nüìä ANALYSIS RESULTS:")
print(f"   ‚Ä¢ News articles analyzed: {len(news_df):,}")
print(f"   ‚Ä¢ Stock trading days: {len(stock_df):,}")
print(f"   ‚Ä¢ Matching days for correlation: {len(merged_data):,}")
print(f"   ‚Ä¢ Pearson Correlation Coefficient: {correlation:.4f}")
print(f"   ‚Ä¢ P-value: {p_value:.4f}")

print(f"\nüìà SENTIMENT ANALYSIS:")
print(f"   ‚Ä¢ Average headline sentiment: {news_df['sentiment'].mean():.4f}")
print(f"   ‚Ä¢ Daily average sentiment range: {daily_sentiment['avg_sentiment'].min():.4f} to {daily_sentiment['avg_sentiment'].max():.4f}")

print(f"\nüíπ STOCK RETURNS:")
print(f"   ‚Ä¢ Average daily return: {stock_returns['daily_return'].mean():.4f}%")
print(f"   ‚Ä¢ Returns range: {stock_returns['daily_return'].min():.4f}% to {stock_returns['daily_return'].max():.4f}%")

print(f"\nüîç CORRELATION INTERPRETATION:")
if p_value < 0.05:
    if correlation > 0:
        print("   ‚Ä¢ Statistically significant POSITIVE correlation found")
        print("   ‚Ä¢ Higher news sentiment tends to associate with higher stock returns")
    else:
        print("   ‚Ä¢ Statistically significant NEGATIVE correlation found")
        print("   ‚Ä¢ Higher news sentiment tends to associate with lower stock returns")
else:
    print("   ‚Ä¢ No statistically significant correlation found")
    print("   ‚Ä¢ News sentiment and stock returns show no clear relationship")

print(f"\nüíæ OUTPUT FILES GENERATED:")
print("   ‚úì ../data/correlation_results.csv")
print("   ‚úì ../results/correlation_analysis.png") 
print("   ‚úì ../results/correlation_summary.csv")

print("\n" + "="*70)
print("SUCCESSFULLY COMPLETED")
print("="*70)