In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from textblob import TextBlob

# Print current working directory for debugging: the relative paths used
# below are resolved against it.
print("Current Working Directory:", os.getcwd())


# --- 1. Load and Prepare Data ---
def _load_csv(path, label):
    """Read the CSV at *path* into a DataFrame and print its shape.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.
        label: Lower-case dataset name used in the progress and error
            messages (e.g. ``"news"``).

    Returns:
        The DataFrame produced by ``pd.read_csv(path)``.

    Raises:
        FileNotFoundError: If *path* does not exist. The message names the
            dataset and the path; the original error is kept as the cause.
    """
    try:
        frame = pd.read_csv(path)
    except FileNotFoundError as err:
        # Chain explicitly so the traceback shows the underlying OS error
        # as the direct cause of this more descriptive one.
        raise FileNotFoundError(
            f"{label.capitalize()} data file not found: {path}"
        ) from err
    print(f"Loaded {label} data successfully. Shape:", frame.shape)
    return frame


# Load processed news data from Task 1
news_file = '../data/processed_analyst_ratings.csv'
news_data = _load_csv(news_file, 'news')

# Load processed stock data from Task 2
stock_file = '../data/processed_AAPL_historical_data.csv'
stock_data = _load_csv(stock_file, 'stock')

# Ensure date columns are in datetime format (the news frame uses the
# lower-case 'date' column, the stock frame uses 'Date').
news_data['date'] = pd.to_datetime(news_data['date'])
stock_data['Date'] = pd.to_datetime(stock_data['Date'])

# --- 2. Data Cleaning and Alignment ---
# Collapse the row-level sentiment scores to one mean value per calendar
# day. The helper column 'date_only' is added to news_data in the process.
news_data['date_only'] = news_data['date'].dt.date
daily_sentiment = (
    news_data
    .groupby('date_only', as_index=False)['sentiment']
    .mean()
    .rename(columns={'date_only': 'Date', 'sentiment': 'Avg_Sentiment'})
)
# .dt.date yields plain date objects; convert the renamed key back to
# datetime so it has the same type as stock_data['Date'].
daily_sentiment['Date'] = pd.to_datetime(daily_sentiment['Date'])

# Inner join: keep only the dates present in both frames.
merged_data = daily_sentiment.merge(stock_data, how='inner', on='Date')
print("\nMerged data shape:", merged_data.shape)
print("Merged data columns:", merged_data.columns)

# Report the per-column null counts, then drop every row containing a null.
print("\nMissing values in merged data:")
null_counts = merged_data.isna().sum()
print(null_counts)
merged_data = merged_data.dropna(axis=0, how='any')

# --- 3. Correlation Analysis ---
# Pearson correlation (the pandas default, spelled out here) between the
# daily mean sentiment and the Daily_Return column.
correlation = merged_data['Avg_Sentiment'].corr(
    merged_data['Daily_Return'],
    method='pearson',
)
print(
    "\nPearson Correlation between Avg_Sentiment and Daily_Return: "
    f"{correlation:.4f}"
)

# --- 4. Visualize the Data ---
# Figures are written under 'reports', resolved against the current
# working directory; create the directory if it is not there yet.
os.makedirs('reports', exist_ok=True)

# Scatter plot of Sentiment vs Daily Return
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
scatter_ax.scatter(
    merged_data['Avg_Sentiment'],
    merged_data['Daily_Return'],
    alpha=0.5,
)
scatter_ax.set_title('Sentiment vs Daily Return for AAPL')
scatter_ax.set_xlabel('Average Daily Sentiment')
# NOTE(review): the label states percent; confirm Daily_Return is stored
# in percent rather than as a fraction.
scatter_ax.set_ylabel('Daily Return (%)')
scatter_fig.savefig('reports/sentiment_vs_daily_return.png')
plt.close(scatter_fig)  # release the figure once it is on disk

# Heatmap of correlations
heatmap_columns = [
    'Avg_Sentiment', 'Daily_Return', 'Close',
    'SMA_20', 'SMA_50', 'EMA_20', 'RSI', 'MACD',
]
correlation_matrix = merged_data[heatmap_columns].corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
# Pin the colour scale to [-1, 1], the range a correlation can take.
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Heatmap for AAPL Data')
heatmap_fig.savefig('reports/correlation_heatmap.png')
plt.close(heatmap_fig)

# --- 5. Save Processed Data ---
# NOTE(review): the inputs were read from '../data', but this writes to
# 'data' under the current working directory — confirm that is intended.
output_file = 'data/merged_sentiment_stock_data.csv'
os.makedirs(os.path.dirname(output_file), exist_ok=True)
merged_data.to_csv(output_file, index=False)

# Closing summary of what this task produced.
summary_lines = [
    "\nTask 3 completed:",
    "- Merged news sentiment and stock data.",
    f"- Correlation between sentiment and daily return: {correlation:.4f}",
    "- Visualizations saved to 'reports/'.",
    f"- Merged data saved to '{output_file}'.",
]
print("\n".join(summary_lines))

Current Working Directory: c:\Users\Simbo\Desktop\week1-challenge\notebooks
Loaded news data successfully. Shape: (1407328, 11)
Loaded stock data successfully. Shape: (10998, 17)

Merged data shape: (2757, 18)
Merged data columns: Index(['Date', 'Avg_Sentiment', 'Open', 'High', 'Low', 'Close', 'Adj Close',
       'Volume', 'Dividends', 'Stock Splits', 'SMA_20', 'SMA_50', 'EMA_20',
       'RSI', 'MACD', 'MACD_Signal', 'MACD_Hist', 'Daily_Return'],
      dtype='object')

Missing values in merged data:
Date             0
Avg_Sentiment    0
Open             0
High             0
Low              0
Close            0
Adj Close        0
Volume           0
Dividends        0
Stock Splits     0
SMA_20           0
SMA_50           0
EMA_20           0
RSI              0
MACD             0
MACD_Signal      0
MACD_Hist        0
Daily_Return     0
dtype: int64

Pearson Correlation between Avg_Sentiment and Daily_Return: 0.0455

Task 3 completed:
- Merged news sentiment and stock data.
- Correlation