

# Sentiment Analysis using FinBERT

In this implementation, all required libraries and dependencies for FinBERT-based sentiment analysis were first installed and imported. Financial news articles were collected using a news API with an authenticated API key and subsequently preprocessed, including text cleaning and tokenization, to ensure compatibility with the FinBERT transformer model. The pretrained FinBERT model was then used to extract sentiment scores from the news articles. In parallel, historical S&P 500 market data were retrieved from Yahoo Finance. Finally, the sentiment outputs and market data were temporally aligned and merged on the same trading day to facilitate sentiment-informed market analysis.

In [None]:
!pip install transformers torch

In [None]:
! pip install newsapi-python

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from dateutil.relativedelta import relativedelta
from newsapi import NewsApiClient
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# NewsAPI key: read from the NEWSAPI_KEY environment variable so the secret
# does not have to be stored in the notebook. Falls back to an empty string
# (the previous placeholder value) when the variable is not set.
import os

API_KEY = os.environ.get('NEWSAPI_KEY', '')
newsapi = NewsApiClient(api_key=API_KEY)


In [None]:
# Load the tokenizer and the sequence-classification model for the
# 'ProsusAI/finbert' checkpoint.
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')
model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')

In [None]:
# Use the GPU when CUDA is available, otherwise the CPU, and move the model there.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [None]:
# Search query: any of the common spellings of the index name.
q = ' "S&P 500" OR S&P500 OR SPX '

# Look back one calendar month from "now".
end_date = datetime.now()
start_date = end_date - relativedelta(months=1)

print(f"Fetching news from {start_date.date()} to {end_date.date()}...")

all_articles = []

# Number of result pages to request and articles per page. With the values
# below a single request is made, i.e. at most 100 articles.
# NOTE(review): presumably kept at one page because of the NewsAPI plan's
# result cap — confirm before raising max_pages.
max_pages = 1
page_size = 100

for page in range(1, max_pages + 1):
    response = newsapi.get_everything(
        q=q,
        from_param=start_date.strftime('%Y-%m-%d'),
        to=end_date.strftime('%Y-%m-%d'),
        language='en',
        sort_by='relevancy',
        page_size=page_size,
        page=page
    )

    articles = response.get('articles', [])
    # An empty page means there is nothing further to fetch.
    if not articles:
        break

    all_articles.extend(articles)

# Build news_df: one row per article that has both a title and a description.
# 'text' is the title and description joined by a space; 'source' is the
# source name, or 'Unknown' when the article carries no usable source.
data = [
    {
        'date': pd.to_datetime(a['publishedAt']).date(),
        'text': a['title'] + " " + a['description'],
        'source': (a.get('source') or {}).get('name') or 'Unknown',
    }
    for a in all_articles
    if a.get('title') and a.get('description')
]

# Naming the columns explicitly keeps the frame well-formed (with a 'date'
# column) even when no article passed the filter above.
news_df = pd.DataFrame(data, columns=['date', 'text', 'source'])
# Store dates as midnight-normalised timestamps.
news_df['date'] = pd.to_datetime(news_df['date']).dt.normalize()

# Load FinBERT ('ProsusAI/finbert') tokenizer and model, choose GPU when
# available, and move the model onto that device.
# NOTE: this repeats the loading done by the earlier cells in this file and
# rebinds the same global names (tokenizer, model, device).
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')
model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def finbert_score(text):
    """Return a single sentiment score for ``text`` using the global model.

    The score is the first class probability minus the second one, taken from
    a softmax over the model's logits. Empty or missing text scores ``0.0``.

    NOTE(review): treating the three classes as (positive, negative, neutral)
    in that order is an assumption of this function — confirm against
    ``model.config.id2label``.
    """
    if not text or pd.isna(text):
        return 0.0

    # Tokenise (truncated to 512 tokens) and move every tensor to the model's device.
    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    batch = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference only: no gradients needed.
    with torch.no_grad():
        output = model(**batch)

    class_probs = torch.nn.functional.softmax(output.logits, dim=-1)[0]
    first, second, _unused = class_probs.cpu().numpy()
    return first - second

# NOTE: the scoring and aggregation statements below are commented out, so
# this part of the cell only prints status messages; no sentiment is
# computed here.
print("Running FinBERT on articles... (takes 1–3 minutes)")
#news_df['compound'] = news_df['text'].apply(finbert_score)

# Aggregate daily sentiment (disabled)
#daily_sentiment = news_df.groupby('date').agg(
 #   compound=('compound', 'mean'),
  #  article_count=('text', 'count')
#).reset_index()

print("FinBERT sentiment ready!")

In [None]:
# Show up to the first 500 rows of the collected articles.
news_df.head(500)

In [None]:
# List the distinct source names present in news_df.
news_df['source'].unique()

In [None]:
# Count how many articles fall on each date, then show up to the first 100 dates.
articles_per_date = news_df.groupby('date').size()
articles_per_date.head(100)


In [None]:
# FINBERT-SAFE TEXT PREPARATION (this is the only correct way)
def prepare_text_for_finbert(text):
    """Return ``text`` with URLs removed and outer whitespace trimmed.

    Everything else (case, punctuation, digits) is left untouched. Empty or
    missing input yields an empty string. Whitespace that surrounded a removed
    URL inside the text is not collapsed.
    """
    import re

    is_missing = not text or pd.isna(text)
    if is_missing:
        return ""

    # Drop http:// and https:// links up to the next whitespace character.
    without_urls = re.sub(r'http[s]?://\S+', '', str(text))
    return without_urls.strip()

# Store the URL-stripped version of each article's text in 'text_ready'.
news_df['text_ready'] = news_df['text'].apply(prepare_text_for_finbert)

# Disabled: score the prepared ('text_ready') text with finbert_score.
#news_df['compound'] = news_df['text_ready'].apply(finbert_score)  # your finbert function

In [None]:
# Show up to the first 100 rows, now including the 'text_ready' column.
news_df.head(100)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the 'yiyanghkust/finbert-tone' model and its tokenizer.
# NOTE: this rebinds the global `tokenizer` that earlier cells in this file
# assigned from 'ProsusAI/finbert'; anything that reads `tokenizer` after
# this cell gets the finbert-tone tokenizer.
finbert = AutoModelForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone')
tokenizer = AutoTokenizer.from_pretrained('yiyanghkust/finbert-tone')
# Use the GPU when CUDA is available, otherwise the CPU, and move the model there.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
finbert.to(device)

In [None]:


def get_finbert_sentiment(text):
    """Score one piece of text with the global ``finbert`` model.

    Args:
        text: The text to classify. Empty or missing values are accepted.

    Returns:
        A dict with float values under the keys ``'pos'``, ``'neg'``,
        ``'neu'`` (class probabilities) and ``'compound'`` (``pos - neg``).
        Empty or missing text yields a fully neutral result with the same
        keys, so every row produces the same columns.

    Raises:
        ValueError: If the model's label names do not include positive,
            negative and neutral.
    """
    if not text or pd.isna(text):
        # Same keys as the normal return value below.
        return {'pos': 0.0, 'neg': 0.0, 'neu': 1.0, 'compound': 0.0}

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = finbert(**inputs)

    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].cpu().numpy()

    # Class order differs between FinBERT checkpoints, so look each
    # probability up by the label name in the model config instead of
    # assuming a fixed (positive, negative, neutral) position.
    by_label = {
        str(finbert.config.id2label[idx]).lower(): float(prob)
        for idx, prob in enumerate(probs)
    }
    missing = {'positive', 'negative', 'neutral'} - set(by_label)
    if missing:
        raise ValueError(
            f"Model labels {sorted(by_label)} lack expected classes {sorted(missing)}"
        )

    positive = by_label['positive']
    negative = by_label['negative']
    neutral = by_label['neutral']

    # Same format as VADER so downstream code keeps working:
    # compound runs from -1 (all negative) to +1 (all positive).
    compound = positive - negative

    return {
        'pos': positive,
        'neg': negative,
        'neu': neutral,
        'compound': compound
    }

# Score every article's original 'text' column (not the URL-stripped copy).
print("Running FinBERT sentiment on articles... (1–3 minutes)")
sentiment_results = [get_finbert_sentiment(article_text) for article_text in news_df['text']]

# One row of scores per article; copy each score column onto news_df under
# the VADER-style names.
sentiment_df = pd.DataFrame(sentiment_results)
for score_col in ('compound', 'pos', 'neg', 'neu'):
    news_df[score_col] = sentiment_df[score_col]

print("FinBERT sentiment complete!")
news_df[['date','text', 'compound', 'pos', 'neg', 'neu']].head(10)

In [None]:
# Daily aggregation: mean of each score column per date, plus the number of
# articles on that date. Group once and reuse the grouping for both.
# (Disabled extras from earlier drafts: per-day sums of 'uncertainty',
# 'fear', 'optimism', 'speculation', and polarity = pos - neg.)
by_date = news_df.groupby('date')

agg_df = by_date[['compound', 'pos', 'neg', 'neu']].mean().reset_index()

# Both results come from the same grouping, so their row order matches.
agg_df['article_count'] = by_date.size().values

# Keep only the calendar date part.
agg_df['date'] = pd.to_datetime(agg_df['date']).dt.date

agg_df.head(100)


In [None]:
# Download one month of daily S&P 500 data and keep the date and closing price.
ticker = "^GSPC"  # S&P 500 symbol
end_date = datetime.now()
start_date = end_date - relativedelta(months=1)

sp500 = yf.download(ticker, start=start_date, end=end_date)

# yf.download can return two-level (field, ticker) columns even for a single
# ticker; keep only the field level so 'Close' and 'Date' select plain columns.
if isinstance(sp500.columns, pd.MultiIndex):
    sp500.columns = sp500.columns.get_level_values(0)

sp500 = sp500.reset_index()[['Date', 'Close']]
sp500 = sp500.rename(columns={'Date': 'date'})
# Midnight-normalised timestamps, the same form news_df uses for its dates.
sp500['date'] = pd.to_datetime(sp500['date']).dt.normalize()

print(f"Fetched {len(sp500)} trading days of S&P 500 data")
sp500.tail()

In [None]:
# Bring both join keys to pandas datetime before merging.
sp500['date'] = pd.to_datetime(sp500['date']).dt.normalize()
agg_df['date'] = pd.to_datetime(agg_df['date'])

# Inner join: only dates present in BOTH frames survive, so dates with news
# but no price row (and vice versa) are dropped.
final_df = agg_df.merge(sp500, how='inner', on='date')

# Forward-fill any gaps in Close. After an inner join every kept row already
# has a price row, so this acts only as a safeguard.
final_df['Close'] = final_df['Close'].ffill()

print("Merge successful! Shape:", final_df.shape)
final_df.head()

In [None]:
# Normalise sp500 to a flat frame with a datetime 'date' column, whatever
# state it is in, then redo the merge.

# Drop any second column level (no-op on already-flat columns).
sp500.columns = sp500.columns.get_level_values(0)

if 'date' not in sp500.columns:
    # Only pull the index into a column when the dates still live there;
    # resetting an already-reset frame would add a spurious 'index' column.
    if 'Date' not in sp500.columns:
        sp500 = sp500.reset_index()
    sp500 = sp500.rename(columns={'Date': 'date'})

# Convert BOTH date columns to the same type (datetime).
sp500['date'] = pd.to_datetime(sp500['date']).dt.normalize()
agg_df['date'] = pd.to_datetime(agg_df['date'])

final_df = pd.merge(agg_df, sp500, on='date', how='inner')
final_df.head(500)

In [None]:
# 1. Show the most recent closing prices as a sanity check.
print("Close price sample:")
print(final_df[['date', 'Close']].tail(8))

# 2. Re-calculate returns from scratch.
final_df = final_df.sort_values('date').reset_index(drop=True)

# Compute returns on the full sp500 price series rather than on the merged
# frame: the merge keeps only dates that also have news, so pct_change() on
# final_df would span several trading days wherever a date is missing.
prices = sp500.sort_values('date').drop_duplicates('date').set_index('date')['Close']
price_returns = prices.pct_change()

final_df['daily_return']    = final_df['date'].map(price_returns)            # that date's return
final_df['next_day_return'] = final_df['date'].map(price_returns.shift(-1))  # next trading day's return

# Rows whose next trading day is not in the price data yet (normally just the
# latest date) cannot be used for next-day analysis.
final_df = final_df.dropna(subset=['next_day_return']).copy()
final_df.head(100)

In [None]:
# Sort chronologically first so the fills below propagate in date order.
final_df = final_df.sort_values('date')
# ffill then bfill: bfill catches any leading NaNs that ffill cannot reach.
# NOTE(review): for return columns a back-fill copies the following row's
# value into the gap — confirm that is acceptable for the analysis.
final_df = final_df.ffill().bfill()
final_df.reset_index(drop=True, inplace=True)
final_df.head(100)

In [None]:
# 4. Correlation of daily sentiment with the same-day and the next-day return.
sentiment = final_df['compound']
same_day_corr = sentiment.corr(final_df['daily_return'])
next_day_corr = sentiment.corr(final_df['next_day_return'])

print("\nCORRECTED CORRELATIONS:")
print(f"Same-day correlation : {same_day_corr:+.3f}")
print(f"Next-day correlation : {next_day_corr:+.3f}")

# 5. Directional accuracy: share of rows where the sign of the sentiment
# equals the sign of the next-day return.
direction_match = np.sign(sentiment) == np.sign(final_df['next_day_return'])
acc = direction_match.mean()
print(f"Next-day directional accuracy: {acc:.1%}")

In [None]:

# Same-day sentiment signal: +1 when compound is above 0.05, -1 when it is
# below -0.05, otherwise 0. The strategy return is the signal times that
# same day's return.
# NOTE(review): sentiment and return come from the same date here, so this
# may not be a tradeable signal — confirm intent.
sentiment_today = final_df['compound']
final_df['signal'] = np.select(
    [sentiment_today > 0.05, sentiment_today < -0.05],
    [1, -1],
    default=0,
)
final_df['strategy_return'] = final_df['signal'] * final_df['daily_return']

final_df.head(100)

In [None]:
# Show up to the first 100 rows of the merged sentiment/price table.
final_df.head(100)

In [None]:
# Graph 1: daily sentiment (line, left axis) against the same-day return
# (bars, right axis).
fig, ax1 = plt.subplots(figsize=(15,8))
ax1.plot(final_df['date'], final_df['compound'], color='#3498db', linewidth=3, label='FinBERT Sentiment')
ax1.set_ylabel('Sentiment Score', color='#3498db', fontsize=13)
ax1.tick_params(axis='y', labelcolor='#3498db')
# Zero line separates net-positive from net-negative sentiment.
ax1.axhline(0, color='gray', linestyle='--', alpha=0.6)

# Second y-axis sharing the same dates, because returns are on a different scale.
ax2 = ax1.twinx()
ax2.bar(final_df['date'], final_df['daily_return'], color='#f1c40f', alpha=0.7, width=0.8, label='Same-Day Return')
ax2.set_ylabel('daily_Return', color='#f1c40f', fontsize=13)
ax2.tick_params(axis='y', labelcolor='#f1c40f')

# Descriptive title: the chart shows the two series; how well they agree is
# what the correlation figures measure, not something the title can assert.
plt.title('FinBERT Sentiment vs Same-Day S&P 500 Return', fontsize=18, fontweight='bold')
# Combine the legend entries of both axes into one legend.
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper left')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

# Graph 2: growth of $1 for buy-and-hold versus the sentiment-signal strategy.
# Index by date so the x-axis shows dates instead of row numbers, and treat a
# missing return as 0% so a single NaN cannot blank out the final figures.
returns_by_date = final_df.set_index('date')[['daily_return', 'strategy_return']].fillna(0)
cumulative = (1 + returns_by_date).cumprod()

plt.figure(figsize=(15,8))
cumulative['daily_return'].plot(color='gray', linewidth=2, label='Buy & Hold S&P 500')
cumulative['strategy_return'].plot(color='#9b59b6', linewidth=4, label='FinBERT Same-Day Strategy')
# Descriptive title: which curve ends higher depends on the data fetched.
plt.title('Cumulative Return: Buy & Hold vs FinBERT Same-Day Signal', fontsize=20, fontweight='bold')
plt.ylabel('Growth of $1')
plt.legend(fontsize=14)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

# Total return over the period for each curve, in percent.
print(f"Buy & Hold return : {(cumulative['daily_return'].iloc[-1]-1)*100:+.2f}%")
print(f"FinBERT Strategy  : {(cumulative['strategy_return'].iloc[-1]-1)*100:+.2f}%")

In [None]:
# Re-sort by date, recompute next_day_return as the following row's
# daily_return, and drop every row that still contains a NaN.
# NOTE: this rebinds final_df, so later cells see the reduced frame.
final_df = (
    final_df.sort_values("date")
    .reset_index(drop=True)
    .assign(next_day_return=lambda frame: frame['daily_return'].shift(-1))
    .dropna()
)

# Sentiment as a line on the left axis, same-day return as bars on the right.
sentiment_color, return_color = 'blue', 'orange'

fig, ax1 = plt.subplots(figsize=(12,6))
ax1.plot(final_df['date'], final_df['compound'], color=sentiment_color, label='Compound Sentiment')
ax1.set_xlabel('Date')
ax1.set_ylabel('Compound', color=sentiment_color)
ax1.tick_params(axis='y', labelcolor=sentiment_color)

ax2 = ax1.twinx()
ax2.bar(final_df['date'], final_df['daily_return'], color=return_color, alpha=0.5, label='Return')
ax2.set_ylabel('Return', color=return_color)
ax2.tick_params(axis='y', labelcolor=return_color)

plt.title('Sentiment and Return Over Time')
fig.legend()
plt.show()

In [None]:
# Sentiment as a line on the left axis, next-day return as bars on the right.
sentiment_color, return_color = 'blue', 'green'

fig, ax1 = plt.subplots(figsize=(12,6))
ax1.plot(final_df['date'], final_df['compound'], color=sentiment_color, label='Compound Sentiment')
ax1.set_xlabel('Date')
ax1.set_ylabel('Compound', color=sentiment_color)
ax1.tick_params(axis='y', labelcolor=sentiment_color)

# Separate y-axis for returns, sharing the date axis.
ax2 = ax1.twinx()
ax2.bar(final_df['date'], final_df['next_day_return'], color=return_color, alpha=0.5, label='Next Day Return')
ax2.set_ylabel('Next Day Return', color=return_color)
ax2.tick_params(axis='y', labelcolor=return_color)

plt.title('Sentiment Leads Next Day Return')
fig.legend()
plt.show()

In [None]:
# Show up to the first 500 rows of the final table.
final_df.head(500)