# Sentiment Analysis Using VADER

This notebook first installs and imports the Python packages required for VADER-based sentiment analysis. Financial news articles are retrieved from NewsAPI using an authenticated API key and then preprocessed to remove noise before sentiment scoring. Historical S&P 500 index data are sourced from Yahoo Finance. The news sentiment is aggregated per day, aligned with the market data by date, and merged so that sentiment can be compared with same-day and next-day market movements.

In [None]:
# Install required packages (run this cell first)
!pip install newsapi-python vaderSentiment yfinance matplotlib pandas numpy nltk requests python-dateutil

In [None]:
# Imports
import pandas as pd
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from datetime import datetime
from dateutil.relativedelta import relativedelta
from newsapi import NewsApiClient
import warnings
warnings.filterwarnings("ignore")

# NLTK setup: download the stop-word corpus, then keep the English list as a
# set so the membership test used during text cleaning is a hash lookup
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

In [None]:
# VADER analyzer: a single shared instance, used by compute_sentiment and safe_vader
analyzer = SentimentIntensityAnalyzer()
def compute_sentiment(text):
    """Return VADER's compound score for ``text``.

    Any value that is not a ``str`` (for example a missing value) is scored
    as 0 and never reaches the analyzer.
    """
    if isinstance(text, str):
        scores = analyzer.polarity_scores(text)
        return scores['compound']
    return 0

# NewsAPI key (replace with your own)
# NOTE(review): this is a placeholder value; avoid committing a real key with the notebook
API_KEY = 'Your API Key'
# Client object used for the article request in the fetch cell
newsapi = NewsApiClient(api_key=API_KEY)

In [None]:
# Boolean search query covering the common spellings of the index name
q = ' "S&P 500" OR S&P500 OR SPX '

# Analysis window: the last calendar month up to now
end_date = datetime.now()
start_date = end_date - relativedelta(months=1)

print(f"Fetching news from {start_date.date()} to {end_date.date()}...")

all_articles = []

# range(1, 2) requests page 1 only, i.e. at most page_size (100) articles.
# NOTE(review): an earlier comment here claimed "5 pages → 500 articles";
# widen the range if more pages are intended and the API plan permits it.
for page in range(1, 2):
    response = newsapi.get_everything(
        q=q,
        from_param=start_date.strftime('%Y-%m-%d'),
        to=end_date.strftime('%Y-%m-%d'),
        language='en',
        sort_by='relevancy',
        page_size=100,
        page=page
    )

    articles = response.get('articles', [])
    if not articles:
        # An empty page means there is nothing further to fetch
        break

    all_articles.extend(articles)

print("Raw articles fetched:", len(all_articles))

# Convert to DataFrame, keeping only articles that have both a title and a description
data = []
for article in all_articles:
    if article and article.get('title') and article.get('description'):
        # publishedAt is ISO-8601 with a trailing 'Z'; swap it for an explicit
        # UTC offset so datetime.fromisoformat accepts it
        pub_date = datetime.fromisoformat(article['publishedAt'].replace('Z', '+00:00')).date()
        # 'source' (or its 'name') may be missing or None; fall back to 'Unknown'
        source_name = (article.get('source') or {}).get('name') or 'Unknown'
        data.append({
            'date': pub_date,
            'text': article['title'] + ' ' + article['description'],
            'source': source_name
        })

# Naming the columns explicitly keeps the frame well-formed even when no
# article survived the filter (otherwise news_df['date'] raises KeyError)
news_df = pd.DataFrame(data, columns=['date', 'text', 'source'])
news_df['date'] = pd.to_datetime(news_df['date']).dt.normalize()

print(f"Final cleaned articles: {len(news_df)}")
news_df.head(500)

In [None]:
# Distinct news sources present in the fetched sample
news_df['source'].unique()

In [None]:
# Number of articles per calendar day, to see how evenly the sample covers the window
articles_per_date = news_df.groupby('date').size()
articles_per_date.head(100)


In [None]:
# Text cleaning
def clean_text(text, stopword_set=None):
    """Normalise an article string into lowercase, letters-only tokens.

    Steps, in order: drop URLs, drop @mentions, delete every character that
    is not an ASCII letter or whitespace (so '#', digits and punctuation
    disappear while hashtag words stay), collapse whitespace, lowercase, and
    remove stop words.

    Args:
        text: Raw article text (a ``str``).
        stopword_set: Optional collection of lowercase words to drop. When
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        The cleaned text as a single space-separated string ('' if nothing
        survives).
    """
    active_stop_words = stop_words if stopword_set is None else stopword_set

    text = re.sub(r'http\S+', '', text)
    # Mentions must be removed before the letters-only filter: once '@' has
    # been deleted the pattern can no longer find them and the handle would
    # leak into the text as an ordinary word.
    text = re.sub(r'@\w+', '', text)
    # Letters and whitespace only; this also strips the '#' of a hashtag
    # while keeping the word itself.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip().lower()
    # NOTE(review): if the stop-word set contains negations ('not', 'no'),
    # they are removed here before VADER scores the text — confirm that
    # losing negation cues is acceptable for this analysis.
    return ' '.join(word for word in text.split() if word not in active_stop_words)

# Clean every article, then score the cleaned text with VADER's compound metric.
# The 'compound' column written here is assigned again later from 'vader_scores'.
news_df['clean_text'] = news_df['text'].apply(clean_text)
news_df['compound'] = news_df['clean_text'].apply(compute_sentiment)
news_df.head(100)

In [None]:
# Step 1: Apply VADER safely
def safe_vader(text):
    """Return VADER's score dict for ``text``, or all-zero scores on failure.

    ``text`` is coerced with ``str()`` first, so non-string values are scored
    by their string form rather than rejected.

    Returns:
        A dict with the keys 'compound', 'neg', 'neu' and 'pos'.
    """
    try:
        return analyzer.polarity_scores(str(text))
    except Exception:
        # Best-effort scoring: one bad row must not abort the whole column.
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        return {'compound': 0, 'neg': 0, 'neu': 0, 'pos': 0}

# Apply it: store the score dict returned by safe_vader for every cleaned article
news_df['vader_scores'] = news_df['clean_text'].apply(safe_vader)

In [None]:
# VADER sentiment analysis: unpack each score dict into one numeric column
# per component (same column order as before: compound, pos, neg, neu)
for _score_key in ('compound', 'pos', 'neg', 'neu'):
    news_df[_score_key] = news_df['vader_scores'].apply(
        lambda scores, key=_score_key: scores[key]
    )

news_df.head(100)

In [None]:
# Simplified and improved behavioral keyword categories
# Each list is passed to count_keywords as the set of terms to look for
# in an article's cleaned (lowercase) text.

# Terms counted into the 'fear' column
fear_words = ['crash', 'downturn', 'recession', 'bear', 'panic', 'selloff', 'decline', 'slump']

# Terms counted into the 'optimism' column
optimism_words = ['boom', 'rally', 'bull', 'surge', 'gain', 'upside', 'outperform',
                  'great', 'strong', 'win', 'record', 'beat', 'exceed']

# Terms counted into the 'uncertainty' column
uncertainty_words = ['uncertain', 'risk', 'volatile', 'maybe', 'perhaps', 'unknown']

# Terms counted into the 'speculation' column
speculation_words = ['might', 'could', 'potential', 'possible']  # keep small and clean

def count_keywords(text, words):
    """Count how many of ``words`` occur in ``text`` at the start of a word.

    Each keyword contributes at most 1, however often it occurs. Matching is
    case-insensitive and anchored at a word boundary, so inflections such as
    'gains' or 'bullish' still count, while accidental mid-word hits ('gain'
    inside 'against', 'risk' inside 'brisk') no longer do.

    Args:
        text: The text to scan (a ``str``).
        words: Iterable of keyword strings.

    Returns:
        The number of distinct keywords found, as an ``int``.
    """
    lowered = text.lower()
    return sum(
        1
        for w in words
        # re.escape keeps keywords literal even if they contain regex characters
        if re.search(r'\b' + re.escape(w.lower()), lowered)
    )

# One count column per behavioural vocabulary, computed from the cleaned text
_keyword_groups = {
    'fear': fear_words,
    'optimism': optimism_words,
    'uncertainty': uncertainty_words,
    'speculation': speculation_words,
}
for _column, _vocabulary in _keyword_groups.items():
    news_df[_column] = news_df['clean_text'].apply(
        lambda x, vocab=_vocabulary: count_keywords(x, vocab)
    )


In [None]:
# Daily aggregation: average the VADER components, total the keyword counts
_daily = news_df.groupby('date')
_mean_cols = ['compound', 'pos', 'neg']
_sum_cols = ['uncertainty', 'fear', 'optimism', 'speculation']
agg_df = _daily.agg(
    {**dict.fromkeys(_mean_cols, 'mean'), **dict.fromkeys(_sum_cols, 'sum')}
).reset_index()

# Polarity: mean positive share minus mean negative share for the day
agg_df['polarity'] = agg_df['pos'] - agg_df['neg']

# Article count per day; both results come from the same groupby, so the
# positional assignment lines up row for row
agg_df['article_count'] = _daily.size().values

# Store plain datetime.date objects in the 'date' column
agg_df['date'] = pd.to_datetime(agg_df['date']).dt.date

agg_df.head(100)


In [None]:
# Fetch S&P500 data for the same window as the news sample
sp500 = yf.download('^GSPC', start=start_date, end=end_date)

# Remove multi-index columns if they exist. This has to happen before any
# column is selected by name: with two-level columns, sp500['Close'] is a
# DataFrame rather than a Series and the steps below misbehave.
if isinstance(sp500.columns, pd.MultiIndex):
    sp500.columns = sp500.columns.droplevel(1)

sp500 = sp500.reset_index()
# Plain datetime.date values so the key matches agg_df['date'] in the merge
sp500['date'] = sp500['Date'].dt.date
# .copy() so the fill below writes to an independent frame, not a slice view
sp500 = sp500[['date', 'Close']].copy()

# Carry the last known close forward over any missing values
sp500['Close'] = sp500['Close'].ffill()
sp500.head(100)

In [None]:
# Inner join on 'date': only dates present in both the daily sentiment frame
# and the S&P 500 frame are kept
final_df = pd.merge(agg_df, sp500, on='date', how='inner')
final_df.head(100)

In [None]:

# Close of the following row (assumes rows are in ascending date order — TODO confirm)
final_df['next_close'] = final_df['Close'].shift(-1)
# Row-over-row percentage change in Close
final_df['return'] = final_df['Close'].pct_change()
# The following row's return, aligned to the current row
final_df['next_day_return'] = final_df['return'].shift(-1)
# Same quantity as next_day_return, computed directly from the two closes
final_df['return_next_day'] = (final_df['next_close'] - final_df['Close']) / final_df['Close']
final_df.head(100)

In [None]:
# Sort chronologically first: the fills below are order-dependent. (This
# statement had been swallowed into the trailing comment of the fill line
# and therefore never ran.)
final_df = final_df.sort_values('date')
# ffill → bfill catches any leading NaNs
# NOTE(review): the shifted return columns are NaN on the first/last row, so
# this copies a neighbouring day's return into those rows — confirm that is
# acceptable for the correlation figures.
final_df = final_df.ffill().bfill()
final_df.reset_index(drop=True, inplace=True)
final_df.head(100)

In [None]:
# Save the merged sentiment + S&P 500 frame to CSV.
# index=True writes the positional row index as the first (unnamed) column;
# the dates themselves are in the 'date' column.
final_df.to_csv("SentimentAnalysis.csv", index=True)

print("SAVED SUCCESSFULLY!")
print("File name: SentimentAnalysis.csv")
print(f"Rows saved: {len(final_df)}")
# Report real dates from the 'date' column (the index is only 0..N-1), and
# do not index into an empty frame
if len(final_df):
    print(f"Date range: {final_df['date'].min()} → {final_df['date'].max()}")
else:
    print("Date range: n/a (no rows)")

# Automatically download it in Google Colab; in any other environment the
# import is unavailable and the file simply stays in the working directory
try:
    from google.colab import files
except ImportError:
    print("google.colab not available; file left in the working directory.")
else:
    files.download("SentimentAnalysis.csv")

In [None]:
# CORRELATIONS
# return_next_day on row t is the move from close t to close t+1, so:
#   .shift(1)  → the move that ended on day t   (same-day return)
#   unshifted  → the move on the following row  (next-day return)
# Shifting by -1 again would pair sentiment with the return two rows ahead.
print("Same-day correlation :", final_df['compound'].corr(final_df['return_next_day'].shift(1)))
print("Next-day correlation:", final_df['compound'].corr(final_df['return_next_day']))

# DIRECTIONAL ACCURACY
# Share of rows where the sign of the sentiment equals the sign of the next-day return
direction_correct = (np.sign(final_df['compound']) == np.sign(final_df['return_next_day'])).mean()
print(f"Directional accuracy: {direction_correct:.1%}")

In [None]:
# Re-establish chronological order, recompute the shifted return and drop
# incomplete rows before plotting
final_df = final_df.sort_values("date").reset_index(drop=True)
final_df['next_day_return'] = final_df['return'].shift(-1)
final_df= final_df.dropna()
# Dual-axis chart: sentiment as a line on the left axis, same-day return as bars on the right
fig, ax1 = plt.subplots(figsize=(12,6))
ax1.plot(final_df['date'], final_df['compound'], color='blue', label='Compound Sentiment')
ax1.set_xlabel('Date')
ax1.set_ylabel('Compound', color='blue')
ax1.tick_params(axis='y', labelcolor='blue')
# Second y-axis sharing the same x-axis, because the two series have different scales
ax2 = ax1.twinx()
ax2.bar(final_df['date'], final_df['return'], color='orange', alpha=0.5, label='Return')
ax2.set_ylabel('Return', color='orange')
ax2.tick_params(axis='y', labelcolor='orange')
plt.title('Sentiment and Return Over Time')
# Figure-level legend so entries from both axes are shown together
fig.legend()
plt.show()

In [None]:
# Dual-axis chart: sentiment as a line on the left axis, next-day return as bars on the right
fig, ax1 = plt.subplots(figsize=(12,6))
ax1.plot(final_df['date'], final_df['compound'], color='blue', label='Compound Sentiment')
ax1.set_xlabel('Date')
ax1.set_ylabel('Compound', color='blue')
ax1.tick_params(axis='y', labelcolor='blue')
# Second y-axis sharing the same x-axis, because the two series have different scales
ax2 = ax1.twinx()
ax2.bar(final_df['date'], final_df['next_day_return'], color='green', alpha=0.5, label='Next Day Return')
ax2.set_ylabel('Next Day Return', color='green')
ax2.tick_params(axis='y', labelcolor='green')
# NOTE(review): the title states a lead relationship; the chart only overlays the two series
plt.title('Sentiment Leads Next Day Return')
# Figure-level legend so entries from both axes are shown together
fig.legend()
plt.show()