# Statistical Analysis: Sentiment vs QQQ Returns

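This notebook examines the relationship between daily post sentiment (VADER, FinBERT, RoBERTa, and logistic-regression scores) and QQQ returns: same-day and lagged (t+1, t+2) correlations, a comparison of market-related versus non-market posts, and scatter and time-series visualizations.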

In [None]:
import pandas as pd

import matplotlib.pyplot as plt


In [None]:

# Daily market data together with the per-model average sentiment columns
# used by the later cells; 'date' is converted to a datetime column on load.
combined_df = pd.read_csv('data/combined_data_with_lr.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# Individual posts, one row per post, with the same datetime conversion.
df_truth = pd.read_csv('data/truth_social_cleaned.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# Quick size check of both inputs before any analysis.
n_market_days = len(combined_df)
n_posts = len(df_truth)
print(f"{n_market_days} days of market data")
print(f"\n{n_posts} posts")

# Bare last expression so the notebook renders a preview of the frame.
combined_df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data/combined_data_with_lr.csv'

In [None]:

# Forward-looking return columns: shift(-k) places the return found k rows
# further down onto the current row.
# NOTE(review): this only means "next trading day" if combined_df is sorted by
# ascending date with one row per day — TODO confirm.
for lag_column, lag in (('next_day_returns', 1), ('two_day_returns', 2)):
    combined_df[lag_column] = combined_df['Returns'].shift(-lag)

# Display label -> daily average sentiment column for each model.
sentiment_columns = {
    'VADER': 'avg_sentiment',
    'FinBERT': 'avg_finbert_sentiment',
    'RoBERTa': 'avg_roberta_sentiment',
    'LR': 'avg_lr_sentiment',
}

# Section heading -> returns column the sentiment is correlated against.
return_horizons = {
    "Same-Day Correlations": 'Returns',
    "Next-Day (t+1) Correlations": 'next_day_returns',
    "Two-Day (t+2) Correlations": 'two_day_returns',
}

# One heading per horizon, then one correlation line per model.
for heading, returns_column in return_horizons.items():
    print(heading)
    for label, sentiment_column in sentiment_columns.items():
        correlation = combined_df[sentiment_column].corr(combined_df[returns_column])
        print(f"{label}: {correlation:.4f}")

In [None]:

# Split posts into market-related and non-market, average VADER sentiment per
# day for each group, and correlate each daily series with same-day returns.
df_with_market = pd.read_csv('data/truth_social_cleaned.csv')
df_with_market['date'] = pd.to_datetime(df_with_market['date'])

# Only derive the flag when the cleaned file does not already provide one.
# The flag is a case-insensitive substring match, so short keywords such as
# 'fed' also match inside longer words (e.g. 'federal').
if 'is_market_related' not in df_with_market.columns:
    market_keywords = ['inflation', 'fed', 'economy', 'china', 'stock market', 'tariff',
                       'trade', 'economic', 'recession', 'gdp', 'unemployment', 'dollar']
    df_with_market['is_market_related'] = df_with_market['cleaned_content'].apply(
        lambda x: any(keyword in str(x).lower() for keyword in market_keywords)
    )

# NOTE(review): the assignment below aligns the two files purely by row index;
# it is only correct if both CSVs hold the same posts in the same order —
# TODO confirm, or join on a shared post identifier instead.
df_with_sentiment = pd.read_csv('data/truth_social_with_lr.csv')
df_with_market['vader_sentiment'] = df_with_sentiment['vader_sentiment']

# Daily mean VADER score for each group of posts.
market_daily = df_with_market[df_with_market['is_market_related'] == True].groupby('date')['vader_sentiment'].mean().reset_index(name='market_sentiment')
nonmarket_daily = df_with_market[df_with_market['is_market_related'] == False].groupby('date')['vader_sentiment'].mean().reset_index(name='nonmarket_sentiment')

# Make the cell safe to re-run: without this, a second execution merges onto a
# frame that already has these columns, pandas renames them with _x/_y
# suffixes, and the fillna lines below fail with a KeyError.
split_columns = ['market_sentiment', 'nonmarket_sentiment']
combined_df = combined_df.drop(columns=split_columns, errors='ignore')

# Left joins keep every market day, including days with no posts in a group.
combined_df = pd.merge(combined_df, market_daily, on='date', how='left')
combined_df = pd.merge(combined_df, nonmarket_daily, on='date', how='left')

# NOTE(review): days with no posts in a group are scored as 0 here, so they
# take part in the correlations as zero-sentiment days rather than being
# excluded as missing — confirm this is the intended treatment.
combined_df['market_sentiment'] = combined_df['market_sentiment'].fillna(0)
combined_df['nonmarket_sentiment'] = combined_df['nonmarket_sentiment'].fillna(0)


print("Market-Related Posts vs Returns")
print(f"Correlation: {combined_df['market_sentiment'].corr(combined_df['Returns']):.4f}")

print("\nNon-Market Posts vs Returns")
print(f"Correlation: {combined_df['nonmarket_sentiment'].corr(combined_df['Returns']):.4f}")

In [None]:

# 2x2 grid: one sentiment-vs-returns scatter per model.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (sentiment column, x-axis label, panel title, extra scatter kwargs), listed
# in row-major panel order. The first panel passes no colour and therefore
# keeps matplotlib's default.
scatter_panels = [
    ('avg_sentiment', 'VADER Sentiment', 'VADER vs Returns', {}),
    ('avg_finbert_sentiment', 'FinBERT Sentiment', 'FinBERT vs Returns', {'color': 'orange'}),
    ('avg_roberta_sentiment', 'RoBERTa Sentiment', 'RoBERTa vs Returns', {'color': 'green'}),
    ('avg_lr_sentiment', 'LR Sentiment', 'Logistic Regression vs Returns', {'color': 'purple'}),
]

# axes.flat walks the grid as [0,0], [0,1], [1,0], [1,1].
for panel_ax, (sentiment_column, x_label, panel_title, scatter_style) in zip(axes.flat, scatter_panels):
    panel_ax.scatter(combined_df[sentiment_column], combined_df['Returns'], alpha=0.5, **scatter_style)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('QQQ Returns (%)')
    panel_ax.set_title(panel_title)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:

# Two stacked panels on a common date axis: model sentiment on top,
# returns underneath.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 8))

timeline = combined_df['date']

# Top panel: one line per sentiment model, in legend order.
sentiment_lines = [
    ('VADER', 'avg_sentiment'),
    ('FinBERT', 'avg_finbert_sentiment'),
    ('RoBERTa', 'avg_roberta_sentiment'),
    ('LR', 'avg_lr_sentiment'),
]
for line_label, sentiment_column in sentiment_lines:
    ax1.plot(timeline, combined_df[sentiment_column], label=line_label, alpha=0.7)
ax1.set_ylabel('Sentiment Score')
ax1.set_title('Sentiment Over Time')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Bottom panel: returns, with a dashed zero line separating gains from losses.
ax2.plot(timeline, combined_df['Returns'], color='red', alpha=0.7)
ax2.axhline(y=0, color='black', linestyle='--', alpha=0.5)
ax2.set_xlabel('Date')
ax2.set_ylabel('QQQ Returns (%)')
ax2.set_title('QQQ Returns Over Time')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Summary Statistics

In [None]:

# Recap of the correlations computed earlier, printed at full precision.
# Relies on the 'next_day_returns', 'market_sentiment' and
# 'nonmarket_sentiment' columns added to combined_df by earlier cells.
model_columns = [
    ('VADER', 'avg_sentiment'),
    ('FinBERT', 'avg_finbert_sentiment'),
    ('RoBERTa', 'avg_roberta_sentiment'),
    ('LR', 'avg_lr_sentiment'),
]
post_group_columns = [
    ('Market-related', 'market_sentiment'),
    ('Non-market', 'nonmarket_sentiment'),
]

same_day_returns = combined_df['Returns']
next_day_returns = combined_df['next_day_returns']

print("Summary")

# Same-day lines are indented by two spaces; the later sections are not.
print("\nSame-Day:")
for label, column in model_columns:
    print(f"  {label}: {combined_df[column].corr(same_day_returns)}")

print("Next-Day:")
for label, column in model_columns:
    print(f"{label}: {combined_df[column].corr(next_day_returns)}")

print("Market vs Non-Market:")
for label, column in post_group_columns:
    print(f"{label}: {combined_df[column].corr(same_day_returns)}")