In [None]:
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
from scripts.news_sentiment import calculate_sentiment_scores
from scripts.correlation_analysis import align_datasets, calculate_pearson_correlation

# Read the two inputs from disk: the raw analyst-rating news and the
# processed stock data.
news_data = pd.read_csv("../data/raw_analyst_ratings/raw_analyst_ratings.csv")
stock_data = pd.read_csv("../data/processed_stocks_data.csv")

# Parse the news 'date' column. errors='coerce' turns any value that cannot
# be parsed into NaT instead of raising.
parsed_dates = pd.to_datetime(news_data['date'], errors='coerce')

# Drop timezone information so the column holds tz-naive timestamps.
# NOTE(review): if the column mixes UTC offsets, to_datetime (without
# utc=True) may not return a datetime64 Series and `.dt` would fail --
# confirm against the actual data.
news_data['date'] = parsed_dates.dt.tz_localize(None)

# Report parse failures so NaT rows do not go unnoticed; the rows themselves
# are left in place.
unparsed_mask = news_data['date'].isna()
if unparsed_mask.any():
    print("Warning: Some date values could not be parsed and were set to NaT.")

# Step 1: derive sentiment scores from the news data via the project helper.
daily_sentiment = calculate_sentiment_scores(news_data)

# Step 2: combine the stock data with the sentiment scores, then compute the
# Pearson correlation on the combined frame (both are project helpers).
aligned_data = align_datasets(stock_data, daily_sentiment)
correlation = calculate_pearson_correlation(aligned_data)

# Step 3: report the coefficient rounded to two decimal places.
correlation_summary = f"Correlation between stock returns and sentiment scores: {correlation:.2f}"
print(correlation_summary)

# Scatter plot: average sentiment on the x-axis against daily return on the
# y-axis, drawn through an explicit Figure/Axes pair.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
scatter_ax.scatter(
    aligned_data['average_sentiment'],
    aligned_data['Daily_Return'],
    alpha=0.6,  # partial transparency so overlapping points stay visible
)
scatter_ax.set_title("Correlation Between Sentiment and Stock Returns")
scatter_ax.set_xlabel("Average Sentiment Score")
scatter_ax.set_ylabel("Daily Return (%)")
scatter_ax.grid(True)
plt.show()

# Line chart: daily return and average sentiment plotted against 'Date' on
# one shared set of axes.
overlay_fig, overlay_ax = plt.subplots(figsize=(12, 6))
dates = aligned_data['Date']
overlay_ax.plot(dates, aligned_data['Daily_Return'], label="Daily Return", color='blue')
overlay_ax.plot(dates, aligned_data['average_sentiment'], label="Average Sentiment", color='orange')
overlay_ax.set_title("Time Series of Sentiment and Stock Returns")
overlay_ax.set_xlabel("Date")
overlay_ax.set_ylabel("Value")
overlay_ax.legend()
overlay_ax.grid(True)
plt.show()
