## Sentiment Analysis

Apply FinBERT and VADER sentiment analysis to the cleaned headlines and merge with stock prices.

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')


NOTEBOOK_DIR = Path.cwd()
ROOT = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == "notebooks" else NOTEBOOK_DIR
DATA_DIR = ROOT / "Data"
MERGED_PATH = DATA_DIR / "merged_news_prices_cleaned.csv"
OUTPUT_PATH = DATA_DIR / "final_sentiment_dataset.csv"

print(f"Loading data from: {MERGED_PATH}")

Loading data from: d:\Financial News Sentiment Analysis\Data\merged_news_prices_cleaned.csv


In [None]:

df = pd.read_csv(MERGED_PATH, parse_dates=["Date"])


print(f"Loaded {len(df)} rows")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nSample data:")
print(df[["Date", "Ticker", "Headlines", "Headlines_clean"]].head())

Loaded 81 rows

Columns: ['Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Headlines', 'Target', 'Headlines_clean']

Sample data:
        Date Ticker                                          Headlines  \
0 2020-06-09   AAPL  Why Apple's Stock Is Trading Higher Today Appl...   
1 2020-06-09   AMZN  'Inside Amazon's plan to test warehouse worker...   
2 2011-05-23    DNO      American Drivers Should Thank European Voters   
3 2011-06-08    DNO                                   The End of OPEC?   
4 2011-07-01    DNO  Is China's Slowdown Bullish for the Global Eco...   

                                     Headlines_clean  
0  apple stock trading higher today apple could a...  
1  inside amazon plan test warehouse worker covid...  
2               american driver thank european voter  
3                                           end opec  
4              china slowdown bullish global economy  


### FinBERT Sentiment Analysis

Load ProsusAI/finbert model and extract softmax probabilities for positive, negative, and neutral sentiment.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.nn.functional import softmax


model_name = "ProsusAI/finbert"
print(f"Loading {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

print(f"Model loaded on {device}")
print(f"Label mapping: {model.config.id2label}")

Loading ProsusAI/finbert...
Model loaded on cpu
Label mapping: {0: 'positive', 1: 'negative', 2: 'neutral'}


In [None]:
def get_finbert_sentiment(text: str, max_length: int = 512) -> dict:
    """
    Get FinBERT sentiment scores for a text.
    Returns dict with positive, negative, neutral probabilities.
    """
    if not text or not isinstance(text, str):
        return {"finbert_pos": 0.0, "finbert_neg": 0.0, "finbert_neutral": 0.0}
    
    
    if len(text.split()) > max_length:
        text = " ".join(text.split()[:max_length])
    
    try:
        
        inputs = tokenizer(text, return_tensors="pt", truncation=True, 
                          max_length=512, padding=True).to(device)
        
        with torch.no_grad():
            outputs = model(**inputs)
            probs = softmax(outputs.logits, dim=1)[0]
        
        
        scores = {}
        for idx, prob in enumerate(probs.cpu().numpy()):
            label = model.config.id2label[idx].lower()
            scores[f"finbert_{label}"] = float(prob)
        
        return scores
    except Exception as e:
        print(f"Error processing text: {str(e)[:100]}")
        return {"finbert_pos": 0.0, "finbert_neg": 0.0, "finbert_neutral": 0.0}

print("FinBERT sentiment function defined")

FinBERT sentiment function defined


In [None]:
# Apply FinBERT to cleaned headlines
print("Applying FinBERT sentiment analysis...")
from tqdm.auto import tqdm

finbert_scores = [get_finbert_sentiment(text) for text in tqdm(df["Headlines_clean"], desc="FinBERT")]
finbert_df = pd.DataFrame(finbert_scores)


df = pd.concat([df, finbert_df], axis=1)

print(f"\nFinBERT scores added:")
print(df[["Headlines_clean", "finbert_positive", "finbert_negative", "finbert_neutral"]].head())

Applying FinBERT sentiment analysis...


FinBERT: 100%|██████████| 81/81 [00:08<00:00,  9.32it/s]


FinBERT scores added:
                                     Headlines_clean  finbert_positive  \
0  apple stock trading higher today apple could a...          0.921895   
1  inside amazon plan test warehouse worker covid...          0.802511   
2               american driver thank european voter          0.111590   
3                                           end opec          0.057174   
4              china slowdown bullish global economy          0.081749   

   finbert_negative  finbert_neutral  
0          0.028169         0.049936  
1          0.141895         0.055594  
2          0.055815         0.832595  
3          0.124650         0.818175  
4          0.269374         0.648877  





### VADER Sentiment Analysis

Apply VADER (Valence Aware Dictionary and sEntiment Reasoner) as a baseline and extract the compound score.

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer


nltk.download('vader_lexicon', quiet=True)


vader = SentimentIntensityAnalyzer()

def get_vader_compound(text: str) -> float:
    """Get VADER compound sentiment score."""
    if not text or not isinstance(text, str):
        return 0.0
    try:
        scores = vader.polarity_scores(text)
        return scores['compound']
    except:
        return 0.0

print("VADER analyzer initialized")

VADER analyzer initialized


In [8]:
# Apply VADER to cleaned headlines
print("Applying VADER sentiment analysis...")
from tqdm.auto import tqdm

vader_scores = [get_vader_compound(text) for text in tqdm(df["Headlines_clean"], desc="VADER")]
df["vader_compound"] = vader_scores

print(f"\nVADER scores added:")
print(df[["Headlines_clean", "vader_compound"]].head())

Applying VADER sentiment analysis...


VADER: 100%|██████████| 81/81 [00:00<00:00, 2180.34it/s]


VADER scores added:
                                     Headlines_clean  vader_compound
0  apple stock trading higher today apple could a...          0.7530
1  inside amazon plan test warehouse worker covid...          0.3252
2               american driver thank european voter          0.3612
3                                           end opec          0.0000
4              china slowdown bullish global economy          0.0000





### Add Technical Indicators

Calculate common technical indicators from price data.

In [None]:

df = df.sort_values(["Ticker", "Date"]).reset_index(drop=True)


def add_technical_indicators(group):
    """Add technical indicators for a single stock."""
    # Simple Moving Averages
    group["SMA_5"] = group["Close"].rolling(window=5, min_periods=1).mean()
    group["SMA_10"] = group["Close"].rolling(window=10, min_periods=1).mean()
    group["SMA_20"] = group["Close"].rolling(window=20, min_periods=1).mean()
    
    # Exponential Moving Average
    group["EMA_12"] = group["Close"].ewm(span=12, adjust=False).mean()
    group["EMA_26"] = group["Close"].ewm(span=26, adjust=False).mean()
    
    # MACD
    group["MACD"] = group["EMA_12"] - group["EMA_26"]
    group["MACD_signal"] = group["MACD"].ewm(span=9, adjust=False).mean()
    
    # RSI (Relative Strength Index)
    delta = group["Close"].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14, min_periods=1).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14, min_periods=1).mean()
    rs = gain / loss
    group["RSI"] = 100 - (100 / (1 + rs))
    
    # Bollinger Bands
    group["BB_middle"] = group["Close"].rolling(window=20, min_periods=1).mean()
    bb_std = group["Close"].rolling(window=20, min_periods=1).std()
    group["BB_upper"] = group["BB_middle"] + (2 * bb_std)
    group["BB_lower"] = group["BB_middle"] - (2 * bb_std)
    
    # Price momentum
    group["Price_change"] = group["Close"].pct_change()
    group["Price_change_5d"] = group["Close"].pct_change(periods=5)
    
    return group

print("Calculating technical indicators per ticker...")
df = df.groupby("Ticker", group_keys=False).apply(add_technical_indicators)

print("\nTechnical indicators added")
print(f"New columns: {[c for c in df.columns if c in ['SMA_5', 'EMA_12', 'MACD', 'RSI', 'Price_change']]}")

Calculating technical indicators per ticker...

Technical indicators added
New columns: ['SMA_5', 'EMA_12', 'MACD', 'RSI', 'Price_change']


### Final Dataset

Prepare the final dataset with all features and save to CSV.

In [None]:

print("=" * 80)
print("FINAL DATASET SUMMARY")
print("=" * 80)
print(f"\nTotal rows: {len(df)}")
print(f"Date range: {df['Date'].min()} to {df['Date'].max()}")

ticker_list = sorted(df['Ticker'].unique()) if 'Ticker' in df.columns else []
print(f"Tickers: {ticker_list}")
print(f"\nColumns ({len(df.columns)}):")
for col in df.columns:
    print(f"  - {col}")

print("\n" + "=" * 80)
print("SAMPLE DATA")
print("=" * 80)
display(df.head())

print("\n" + "=" * 80)
print("SENTIMENT SCORES SUMMARY")
print("=" * 80)
sentiment_cols = [c for c in ["finbert_positive", "finbert_negative", "finbert_neutral", "vader_compound"] if c in df.columns]
print(df[sentiment_cols].describe())

FINAL DATASET SUMMARY

Total rows: 81
Date range: 2011-04-29 00:00:00 to 2020-06-09 00:00:00
Tickers: []

Columns (27):
  - Date
  - Open
  - High
  - Low
  - Close
  - Adj Close
  - Volume
  - Headlines
  - Target
  - Headlines_clean
  - finbert_positive
  - finbert_negative
  - finbert_neutral
  - vader_compound
  - SMA_5
  - SMA_10
  - SMA_20
  - EMA_12
  - EMA_26
  - MACD
  - MACD_signal
  - RSI
  - BB_middle
  - BB_upper
  - BB_lower
  - Price_change
  - Price_change_5d

SAMPLE DATA


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Headlines,Target,Headlines_clean,...,EMA_12,EMA_26,MACD,MACD_signal,RSI,BB_middle,BB_upper,BB_lower,Price_change,Price_change_5d
0,2020-06-09,83.035004,86.402496,83.002502,85.997498,83.889359,147712400,Why Apple's Stock Is Trading Higher Today Appl...,1,apple stock trading higher today apple could a...,...,85.997498,85.997498,0.0,0.0,,85.997498,,,,
1,2020-06-09,126.472,131.321503,126.25,130.042999,130.042999,103520000,'Inside Amazon's plan to test warehouse worker...,1,inside amazon plan test warehouse worker covid...,...,130.042999,130.042999,0.0,0.0,,130.042999,,,,
2,2011-05-23,38.970001,39.09,38.700001,38.779999,38.779999,13400,American Drivers Should Thank European Voters,0,american driver thank european voter,...,38.779999,38.779999,0.0,0.0,,38.779999,,,,
3,2011-06-08,37.889999,37.889999,37.040001,37.389999,37.389999,38900,The End of OPEC?,1,end opec,...,38.566153,38.677036,-0.110883,-0.022177,0.0,38.084999,40.050755,36.119243,-0.035843,
4,2011-07-01,39.889999,40.16,39.459999,39.650002,39.650002,9100,Is China's Slowdown Bullish for the Global Eco...,1,china slowdown bullish global economy,...,38.732899,38.749107,-0.016209,-0.020983,61.917841,38.606667,40.886522,36.326811,0.060444,



SENTIMENT SCORES SUMMARY
       finbert_positive  finbert_negative  finbert_neutral  vader_compound
count         81.000000         81.000000        81.000000       81.000000
mean           0.182858          0.157338         0.659804        0.111511
std            0.247465          0.263878         0.328735        0.319905
min            0.011334          0.009634         0.018366       -0.954500
25%            0.045680          0.019426         0.483798        0.000000
50%            0.081042          0.034265         0.829882        0.000000
75%            0.158642          0.153841         0.914034        0.340000
max            0.946143          0.954340         0.940571        0.886000


In [11]:
# Save final dataset
df.to_csv(OUTPUT_PATH, index=False)
print(f"\nFinal dataset saved to: {OUTPUT_PATH}")
print(f"File size: {OUTPUT_PATH.stat().st_size / 1024:.2f} KB")


Final dataset saved to: c:\Users\hp\Desktop\NLP course\Financial News Sentiment Analysis\Data\final_sentiment_dataset.csv
File size: 43.02 KB
