In [None]:
# Task 3: Sentiment Analysis and Stock Correlation
import os, sys
import pandas as pd
import matplotlib.pyplot as plt

# Step 1: Locate the project root, make it the working directory and put it on sys.path.
# Resolution order:
#   1. the PROJECT_ROOT environment variable, when set;
#   2. the nearest directory at or above the current one that contains a `src/` folder;
#   3. the original hardcoded location (legacy fallback for the author's machine).
LEGACY_PROJECT_ROOT = "C:/Users/It's Blue/news-sentiment-stock-prediction-new"


def find_project_root(start):
    """Return the closest directory at or above `start` that contains `src/`, or None."""
    current = os.path.abspath(start)
    while True:
        if os.path.isdir(os.path.join(current, "src")):
            return current
        parent = os.path.dirname(current)
        if parent == current:  # reached the filesystem root without finding src/
            return None
        current = parent


project_root = (
    os.environ.get("PROJECT_ROOT")
    or find_project_root(os.getcwd())
    or LEGACY_PROJECT_ROOT
)
os.chdir(project_root)
# Guarded so that re-running the cell does not pile up duplicate sys.path entries.
if os.getcwd() not in sys.path:
    sys.path.insert(0, os.getcwd())

print("Current working directory:", os.getcwd())
print("Python path includes:", [p for p in sys.path if "news-sentiment-stock-prediction-new" in p])

# Step 2: Now import the other modules
from src.news_analysis import NewsData
from src.stock_analysis import StockData
from src.sentiment_analysis import SentimentAnalysis

print("All modules imported successfully!")


In [None]:
# Dataset locations, relative to the working directory (the project root once the
# first cell has run). The stock file lives under Datas/processed, matching the
# paths the loading cells further down actually read from.
news_path = "Datas/newsData/raw_analyst_ratings.csv"
stock_path = "Datas/processed/AAPL_processed.csv"

print(news_path)
print(stock_path)

In [None]:
# ===== Cell 1: ensure src is importable =====
import sys, os

# The project root is the current directory when it already contains `src/`;
# otherwise the parent directory is used (the notebook-in-a-subfolder layout).
# Adding the parent unconditionally would put the wrong directory on sys.path
# whenever the working directory is already the project root.
_cwd = os.getcwd()
if os.path.isdir(os.path.join(_cwd, "src")):
    project_root = _cwd
else:
    project_root = os.path.abspath(os.path.join(_cwd, ".."))
if project_root not in sys.path:
    sys.path.insert(0, project_root)
print("Project root added to sys.path:", project_root)


In [None]:
# ===== Cell 2: imports & nltk download =====
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

# sentiment
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Look for the VADER lexicon locally first so re-runs work offline; only download
# when it is missing, and report a failed download instead of ignoring it.
try:
    nltk.data.find("sentiment/vader_lexicon.zip")
except LookupError:
    if not nltk.download('vader_lexicon', quiet=True):
        print("WARNING: could not download 'vader_lexicon'; it is required by SentimentIntensityAnalyzer.")

print("imports OK")


In [None]:
# ===== Cell 3: import src classes; fallback to inline minimal classes if import fails =====
try:
    from src.news_analysis import NewsData
    from src.stock_analysis import StockData
    from src.sentiment_analysis import SentimentAnalysis  # optional if you added it
    print("Imported classes from src/")
except Exception as e:
    print("Could not import from src/ — using inline fallbacks. Error:", e)

    # Minimal NewsData fallback
    class NewsData:
        """Load a news CSV and expose it as `self.df` with a parsed `date` column.

        Parameters
        ----------
        path : path of the CSV file to read.
        date_col : name of the column holding the publication date; it is parsed,
            rows with unparseable dates are dropped, and it is renamed to "date".
        text_col : name of the headline/text column; used as the default column
            for `clean_text`. Accepted so the loading cell can pass `text_col=`.

        Raises
        ------
        KeyError : if `date_col` is not a column of the CSV.
        """

        def __init__(self, path, date_col="date", text_col="headline"):
            self.text_col = text_col
            self.df = pd.read_csv(path)
            if date_col not in self.df.columns:
                raise KeyError(f"{date_col} not found in news CSV")
            self.df[date_col] = pd.to_datetime(self.df[date_col], errors='coerce')
            self.df = self.df.dropna(subset=[date_col]).reset_index(drop=True)
            self.df.rename(columns={date_col: "date"}, inplace=True)

        def clean_text(self, column=None):
            """Strip non-alphanumeric characters, lowercase and trim `column` in place.

            `column` defaults to the `text_col` given at construction. Returns `self.df`.
            """
            if column is None:
                column = self.text_col
            self.df[column] = self.df[column].astype(str).str.replace(r"[^A-Za-z0-9\s]", "", regex=True).str.lower().str.strip()
            return self.df

    # Minimal StockData fallback
    class StockData:
        """Load a stock CSV and expose it as `self.df` with a parsed `Date` column.

        Rows whose date cannot be parsed are dropped. Raises KeyError if
        `date_col` is not a column of the CSV.
        """

        def __init__(self, path, date_col="Date"):
            self.df = pd.read_csv(path)
            if date_col not in self.df.columns:
                raise KeyError(f"{date_col} not found in stock CSV")
            self.df[date_col] = pd.to_datetime(self.df[date_col], errors='coerce')
            self.df = self.df.dropna(subset=[date_col]).reset_index(drop=True)
            self.df.rename(columns={date_col: "Date"}, inplace=True)

    # Minimal SentimentAnalysis fallback (same behavior as suggested)
    class SentimentAnalysis:
        """Score headlines with VADER and relate daily sentiment to daily stock returns."""

        def __init__(self, news_df, stock_df):
            # Work on copies so the caller's frames are never modified.
            self.news_df = news_df.copy()
            self.stock_df = stock_df.copy()
            self.sid = SentimentIntensityAnalyzer()

        def compute_sentiment(self):
            """Return one row per calendar day with `date`, `avg_sentiment`, `median_sentiment`.

            Each headline gets the VADER compound score; scores are then aggregated
            per day (not per timestamp) so they can be joined to daily stock rows.
            """
            self.news_df['date'] = pd.to_datetime(self.news_df['date'], errors='coerce')
            self.news_df.dropna(subset=['date'], inplace=True)
            self.news_df['headline'] = self.news_df['headline'].astype(str)
            self.news_df['sentiment'] = self.news_df['headline'].apply(lambda x: self.sid.polarity_scores(x)['compound'])

            day = self.news_df['date']
            if not pd.api.types.is_datetime64_any_dtype(day):
                # Mixed UTC offsets can parse to object dtype; go through UTC to get a datetime column.
                day = pd.to_datetime(day, errors='coerce', utc=True)
            if day.dt.tz is not None:
                day = day.dt.tz_localize(None)
            # Midnight-normalised day key, so every headline of a day lands in one group.
            day = day.dt.normalize().rename('date')

            daily = (
                self.news_df.groupby(day)['sentiment']
                .agg(avg_sentiment='mean', median_sentiment='median')
                .reset_index()
            )
            return daily

        def compute_daily_returns(self):
            """Add `daily_return` (percentage change of `Close`) and return `Date` + `daily_return`."""
            self.stock_df['daily_return'] = self.stock_df['Close'].pct_change()
            return self.stock_df[['Date','daily_return']]

        def merge_sentiment_stock(self, daily_sentiment, daily_returns):
            """Left-join sentiment onto returns, forward-fill sentiment, drop unusable rows.

            Only rows lacking a return or a (forward-filled) sentiment are dropped.
            Dropping on every column would also discard all days without news,
            because the right-hand `date` key is missing on those rows.
            """
            merged = pd.merge(daily_returns, daily_sentiment, left_on='Date', right_on='date', how='left')
            sentiment_cols = [c for c in ('avg_sentiment', 'median_sentiment') if c in merged.columns]
            merged[sentiment_cols] = merged[sentiment_cols].ffill()
            merged = merged.dropna(subset=['daily_return', 'avg_sentiment']).reset_index(drop=True)
            return merged

        def correlation(self, merged_df):
            """Return the correlation between `avg_sentiment` and `daily_return`."""
            return merged_df['avg_sentiment'].corr(merged_df['daily_return'])


In [None]:
# ===== Cell 4: Load datasets =====
# Build the news and stock loader objects from their CSV files.
news = NewsData(
    "Datas/newsData/raw_analyst_ratings.csv",
    date_col="date",
    text_col="headline",
)
stock = StockData("Datas/processed/AAPL_processed.csv")

# Independent copies, so later manipulation leaves the loaders' frames untouched.
news_df, stock_df = news.df.copy(), stock.df.copy()

# Quick preview of both frames (shown as a tuple).
(news_df.head(), stock_df.head())


In [None]:
# ===== Cell 4: Clean news headlines =====
# Use the NewsData class to clean text
news.clean_text(column="headline")

# Quick check: a bare expression in the middle of a cell is not displayed,
# so the sample is printed explicitly.
print("Sample cleaned headlines:")
print(news.df["headline"].head(5))

# Check the columns of news dataframe
print(news.df.columns)
# Peek at the first few rows (last expression, rendered by the notebook)
news.df.head()


In [None]:
# ===== Cell 5: Compute daily sentiment =====
# A 'timestamp' column, when present, is renamed to 'date' so SentimentAnalysis works.
timestamp_to_date = {"timestamp": "date"}
news.df.rename(columns=timestamp_to_date, inplace=True)

# The analyzer is given both the news frame and the stock frame.
sentiment_analyzer = SentimentAnalysis(news.df, stock.df)

# Average sentiment per day.
daily_sentiment = sentiment_analyzer.compute_sentiment()

print("Sample daily sentiment:")
daily_sentiment.head()


In [None]:
# ===== Cell 5: Merge Sentiment with Stock Returns =====
# Daily return = percentage change of the closing price.
stock_df['daily_return'] = stock_df['Close'].pct_change()

# Align the sentiment key with the stock key, then left-join so every stock row is kept.
sentiment_by_date = daily_sentiment.rename(columns={"date": "Date"})
merged_df = pd.merge(stock_df, sentiment_by_date, on='Date', how='left')

# Carry the last known sentiment forward over rows without a value,
# for whichever of the two sentiment columns exist.
for sentiment_col in ('avg_sentiment', 'median_sentiment'):
    if sentiment_col in merged_df.columns:
        merged_df[sentiment_col] = merged_df[sentiment_col].ffill()

print("Merged dataframe rows:", len(merged_df))
merged_df.head(10)


In [None]:
#===== Cell 6: Explore datasets =====

import pandas as pd


def _preview(frame):
    """Print the column dtypes of `frame`, then its first 5 rows."""
    print(frame.dtypes)
    print(frame.head(5))


# Re-read the raw stock CSV (this replaces the earlier stock_df) and inspect it.
stock_path = "Datas/processed/AAPL_processed.csv"
stock_df = pd.read_csv(stock_path)
_preview(stock_df)

# Same for the raw news CSV (this replaces the earlier news_df).
news_path = "Datas/newsData/raw_analyst_ratings.csv"
news_df = pd.read_csv(news_path)
_preview(news_df)



In [None]:
# Parse the news 'date' column (unparseable values become NaT), then drop the timezone.
parsed_news_dates = pd.to_datetime(news_df['date'], errors='coerce')
news_df['date'] = parsed_news_dates.dt.tz_localize(None)

# Quick check
preview_cols = ['headline', 'date']
print(news_df[preview_cols].head(3))


In [None]:
# Parse the stock 'Date' column; unparseable values become NaT.
parsed_stock_dates = pd.to_datetime(stock_df['Date'], errors='coerce')
stock_df['Date'] = parsed_stock_dates

# Daily return = percentage change of the closing price from the previous row.
close_prices = stock_df['Close']
stock_df['daily_return'] = close_prices.pct_change()

# Quick check
preview_cols = ['Date', 'Close', 'daily_return']
print(stock_df[preview_cols].head(3))


In [None]:
# Prefer the project implementation, but keep whatever SentimentAnalysis class is
# already defined (for example an inline fallback) when src/ cannot be imported.
try:
    from src.sentiment_analysis import SentimentAnalysis
except Exception as e:
    print("Could not import SentimentAnalysis from src/; keeping the existing definition. Error:", e)

# Initialize sentiment analyzer
sentiment_analyzer = SentimentAnalysis(news_df, stock_df)

# Compute daily sentiment
daily_sentiment = sentiment_analyzer.compute_sentiment()
print("Sample daily sentiment:")
print(daily_sentiment.head(5))


In [None]:
# Merge stock returns with daily sentiment (left join keeps every stock row)
merged_df = pd.merge(
    stock_df,
    daily_sentiment,
    left_on='Date',
    right_on='date',
    how='left'
)

# Forward-fill missing sentiment, but only for the sentiment columns that exist:
# selecting a missing column (e.g. no 'median_sentiment') would raise a KeyError.
sentiment_cols = [c for c in ('avg_sentiment', 'median_sentiment') if c in merged_df.columns]
if sentiment_cols:
    merged_df[sentiment_cols] = merged_df[sentiment_cols].ffill()

# Check merged output
print("Merged dataframe (sentiment + daily returns):")
print(merged_df.head(5))


In [None]:
# Persist the merged frame (without the index), then report its size and value ranges.
output_csv = "Datas/processed/merged_sentiment_stock.csv"
merged_df.to_csv(output_csv, index=False)
print("Merged dataframe saved!")
print("Number of merged rows:", len(merged_df))
# One line per column: minimum then maximum.
for range_col in ('daily_return', 'avg_sentiment'):
    print(merged_df[range_col].min(), merged_df[range_col].max())



In [None]:
# ===== Cell 7: Rolling correlation =====
window = 30  # number of rows in each rolling window
rolling_returns = merged_df['daily_return'].rolling(window)
merged_df['rolling_corr'] = rolling_returns.corr(merged_df['avg_sentiment'])

# Quick preview: first rows where a correlation value is available
print(f"Sample {window}-day rolling correlation:")
rolling_preview = merged_df[['Date', 'rolling_corr']].dropna()
print(rolling_preview.head(5))


In [None]:
# Make sure the 'Date' key is a timezone-naive datetime.
# (The original cell ran this exact conversion twice; the second copy was labelled
# as handling the sentiment date but touched 'Date' again, so it is removed.)
merged_df['Date'] = pd.to_datetime(merged_df['Date'], errors='coerce').dt.tz_localize(None)

# The sentiment-side 'date' key duplicates 'Date' after the merge; drop it if present.
merged_df = merged_df.drop(columns=['date'], errors='ignore')

# Forward-fill sentiment for missing days
merged_df['avg_sentiment'] = merged_df['avg_sentiment'].ffill()
# Compute daily returns if not already
if 'daily_return' not in merged_df.columns:
    merged_df['daily_return'] = merged_df['Close'].pct_change()




In [None]:
import numpy as np

# Synthetic sentiment is a last resort only. Overwriting the computed sentiment
# unconditionally would make every later plot describe random noise, so the
# placeholder is generated only when no real sentiment value exists at all,
# with a fixed seed (reproducible) and a loud warning.
SYNTHETIC_SENTIMENT_SEED = 42
has_real_sentiment = (
    'avg_sentiment' in merged_df.columns
    and merged_df['avg_sentiment'].notna().any()
)
if has_real_sentiment:
    print("Using computed news sentiment.")
else:
    print("WARNING: no sentiment values available; using seeded random placeholder sentiment.")
    rng = np.random.default_rng(SYNTHETIC_SENTIMENT_SEED)
    merged_df['avg_sentiment'] = rng.uniform(-1, 1, size=len(merged_df))
    merged_df['median_sentiment'] = merged_df['avg_sentiment']

# Recompute rolling correlation
window_size = 10
merged_df['rolling_corr'] = merged_df['daily_return'].rolling(window_size).corr(merged_df['avg_sentiment'])

print(merged_df[['Date', 'daily_return', 'avg_sentiment', 'rolling_corr']].head(15))


In [None]:
# ===== Cell 8: Visualization =====
import matplotlib.pyplot as plt


def _draw_series(axis, column, colour, series_label, axis_label):
    """Plot `merged_df[column]` against Date on `axis`, colouring line, y-label and y-ticks alike."""
    axis.plot(merged_df['Date'], merged_df[column], color=colour, label=series_label)
    axis.set_ylabel(axis_label, color=colour)
    axis.tick_params(axis='y', labelcolor=colour)


fig, ax1 = plt.subplots(figsize=(12,6))

# Left y-axis: daily returns
_draw_series(ax1, 'daily_return', 'blue', 'Daily Return', 'Daily Return')

# Right y-axis (shares the x-axis): average daily sentiment
ax2 = ax1.twinx()
_draw_series(ax2, 'avg_sentiment', 'red', 'Avg Sentiment', 'Average Sentiment')

plt.title('Daily Stock Returns vs News Sentiment')
fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Line chart of the rolling correlation over time.
fig_roll, ax_roll = plt.subplots(figsize=(12,6))
ax_roll.plot(
    merged_df['Date'],
    merged_df['rolling_corr'],
    color='purple',
    label='10-day rolling correlation',
)
ax_roll.set_title('Rolling Correlation: Daily Returns vs Avg Sentiment')
ax_roll.set_xlabel('Date')
ax_roll.set_ylabel('Correlation')
ax_roll.legend()
ax_roll.grid(True)
plt.show()


In [None]:
# ===== Cell 9: Line Plot - Stock Price and Avg Sentiment =====
fig_price, ax_price = plt.subplots(figsize=(14,6))
dates = merged_df['Date']

# Closing price
ax_price.plot(dates, merged_df['Close'], color='blue', label='Close Price')

# Sentiment multiplied by the maximum close so both lines share one axis
peak_close = merged_df['Close'].max()
sentiment_scaled = merged_df['avg_sentiment'] * peak_close
ax_price.plot(dates, sentiment_scaled, color='red', alpha=0.6, label='Avg Sentiment (scaled)')

ax_price.set_title('Stock Close Price vs Avg Sentiment')
ax_price.set_xlabel('Date')
ax_price.set_ylabel('Close Price / Scaled Sentiment')
ax_price.legend()
ax_price.grid(True)
plt.show()


In [None]:
# ===== Cell 10: Scatter Plot =====
# One point per row: sentiment on x, daily return on y.
fig_scatter, ax_scatter = plt.subplots(figsize=(10,6))
ax_scatter.scatter(
    merged_df['avg_sentiment'],
    merged_df['daily_return'],
    alpha=0.5,
    color='purple',
)
ax_scatter.set_title('Daily Return vs Avg Sentiment')
ax_scatter.set_xlabel('Avg Sentiment')
ax_scatter.set_ylabel('Daily Return')
ax_scatter.grid(True)
plt.show()


In [None]:
import seaborn as sns

# ===== Cell 11: Heatmap of Rolling Correlation =====
fig_heat, ax_heat = plt.subplots(figsize=(14,6))
# Transposed to a single row, so the observations run along the x-axis.
rolling_corr_row = merged_df[['rolling_corr']].T
sns.heatmap(
    rolling_corr_row,
    cmap='coolwarm',
    cbar_kws={'label': 'Rolling Correlation'},
    ax=ax_heat,
)
ax_heat.set_title('Rolling Correlation: Daily Returns vs Avg Sentiment')
ax_heat.set_xlabel('Time Index')
plt.show()


In [None]:
# ===== Cell 12: Histogram of Daily Avg Sentiment =====
# Missing sentiment values are excluded before binning.
sentiment_values = merged_df['avg_sentiment'].dropna()

fig_hist, ax_hist = plt.subplots(figsize=(10,6))
ax_hist.hist(sentiment_values, bins=30, color='green', alpha=0.7)
ax_hist.set_title('Distribution of Daily Avg Sentiment')
ax_hist.set_xlabel('Avg Sentiment')
ax_hist.set_ylabel('Frequency')
ax_hist.grid(True)
plt.show()


In [None]:
# ===== Cell 13: Boxplot =====
# Bucket rows into four equal-sized sentiment groups, coded 0..3.
quartile_codes = pd.qcut(merged_df['avg_sentiment'], 4, labels=False)
merged_df['sentiment_quartile'] = quartile_codes

# Distribution of daily returns within each sentiment quartile.
fig_box, ax_box = plt.subplots(figsize=(10,6))
sns.boxplot(
    x='sentiment_quartile',
    y='daily_return',
    data=merged_df,
    palette='Set2',
    ax=ax_box,
)
ax_box.set_title('Daily Returns by Sentiment Quartiles')
ax_box.set_xlabel('Sentiment Quartile (0 = Lowest, 3 = Highest)')
ax_box.set_ylabel('Daily Return')
ax_box.grid(True)
plt.show()


In [None]:
# ===== Cell 14: Highlight Positive/Negative Correlation =====
fig_sign, ax_sign = plt.subplots(figsize=(14,6))
dates = merged_df['Date']
rolling_corr = merged_df['rolling_corr']

ax_sign.plot(dates, rolling_corr, color='purple', label='Rolling Correlation')
# Shade the area between zero and the curve: green where positive, red where negative.
ax_sign.fill_between(dates, 0, rolling_corr,
                     where=(rolling_corr > 0), color='green', alpha=0.3, label='Positive Correlation')
ax_sign.fill_between(dates, 0, rolling_corr,
                     where=(rolling_corr < 0), color='red', alpha=0.3, label='Negative Correlation')
ax_sign.set_title('Rolling Correlation Highlighted')
ax_sign.set_xlabel('Date')
ax_sign.set_ylabel('Correlation')
ax_sign.legend()
ax_sign.grid(True)
plt.show()


In [None]:
# ===== Cell 15: Combined Subplots =====
# Two stacked panels sharing the x-axis.
fig, axs = plt.subplots(2, 1, figsize=(14,10), sharex=True)
price_ax, detail_ax = axs
dates = merged_df['Date']

# Top panel: closing price
price_ax.plot(dates, merged_df['Close'], color='blue')
price_ax.set_ylabel('Close Price')
price_ax.set_title('Stock Close Price')

# Bottom panel: average sentiment and rolling correlation on one axis
for column, colour, series_label in (
    ('avg_sentiment', 'red', 'Avg Sentiment'),
    ('rolling_corr', 'purple', 'Rolling Corr'),
):
    detail_ax.plot(dates, merged_df[column], color=colour, label=series_label)
detail_ax.set_ylabel('Sentiment / Rolling Corr')
detail_ax.set_title('Sentiment & Rolling Correlation')
detail_ax.legend()
detail_ax.grid(True)

plt.xlabel('Date')
plt.show()
