In [1]:
import pandas as pd
import yfinance as yf
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AdamW
from sklearn.preprocessing import StandardScaler


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
df_nvda = pd.read_pickle('data/NVDA_article_sentiments.pkl')
df_appl = pd.read_pickle('data/AAPL_article_sentiments.pkl')
df_amzn = pd.read_pickle('data/AMZN_article_sentiments.pkl')
df_googl = pd.read_pickle('data/GOOGL_article_sentiments.pkl')

In [None]:
df_news = pd.concat([df_nvda, df_appl, df_amzn, df_googl], axis=0)
df_news.reset_index(drop=True, inplace=True)

In [15]:
df_news

Unnamed: 0,ticker,title,body_text,url,neg,neu,pos,compound,publish_datetime
0,NVDA,Deepseek Releases New Math AI Model,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/pro/deepseek-re...,0.076,0.893,0.032,-0.8437,2025-04-30 07:25:00
1,NVDA,Adv Micro Device receives Investment Bank Anal...,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/pro/seaport-glo...,0.076,0.893,0.032,-0.8437,2025-04-30 07:01:00
2,NVDA,Nvidia Corp receives Investment Bank Analyst R...,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/pro/seaport-glo...,0.076,0.893,0.032,-0.8437,2025-04-30 07:01:00
3,NVDA,Taiwan’s ASE: evaluating how it will support N...,By Wen-Yee Lee and Ben Blanchard TAIPEI (Reute...,https://www.investing.com/news/stock-market-ne...,0.032,0.883,0.084,0.9423,2025-04-30 06:56:00
4,NVDA,"Super Micro slumps on forecast cut, analysts d...",By Aditya Soni (Reuters) -Super Micro Computer...,https://www.investing.com/news/stock-market-ne...,0.099,0.779,0.122,0.9432,2025-04-30 06:47:00
...,...,...,...,...,...,...,...,...,...
38896,GOOGL,S&P 500 Falls After Yellen Triggers Rate Hike ...,By Yasin Ebrahim Investing.com – The S&P 500 f...,https://www.investing.com/news/stock-market-ne...,0.054,0.845,0.101,0.9750,2021-05-04 15:57:00
38897,GOOGL,Apple vs. Google: Which FAANG Stock is a Bette...,The COVID-19 pandemic has treated the FAANG st...,https://www.investing.com/news/stock-market-ne...,0.019,0.878,0.104,0.9813,2021-05-04 15:54:00
38898,GOOGL,S&P 500 Slips on Tech Wreck as Yellen Stokes I...,By Yasin Ebrahim Investing.com – The S&P 500 t...,https://www.investing.com/news/stock-market-ne...,0.047,0.865,0.088,0.9480,2021-05-04 14:03:00
38899,GOOGL,Nasdaq ends sharply lower in tech sell-off By ...,By Krystal Hu and Shreyashi Sanyal (Reuters) -...,https://www.investing.com/news/stock-market-ne...,0.046,0.849,0.104,0.9820,2021-05-04 07:13:00


In [62]:

# 1. Hyperparameters & config
TICKERS      = ['NVDA', 'AAPL', 'AMZN', 'GOOGL']
START_DATE  = '2009-07-01'
END_DATE    = '2025-05-03'
TRANSFORMER = 'yiyanghkust/finbert-tone'
MAX_LENGTH   = 256
BATCH_SIZE   = 16
LR           = 2e-5
EPOCHS       = 3
TH_UP        = 0.005
TH_DOWN      = -0.005
TICKER_EMB_DIM = 16    

In [23]:
df_news['date'] = df_news['publish_datetime'].dt.date
# Aggregate per day: concatenate all titles/body_texts
agg_news = df_news.groupby(['ticker','date']).agg({
    'title':     lambda t: ' '.join(t),
    'body_text': lambda b: ' '.join(b)
}).reset_index()

In [35]:
agg_news

Unnamed: 0,ticker,date,title,body_text
0,AAPL,2021-10-28,"Global stocks fall, U.S. dollar climbs on infl...",By Chibuike Oguh NEW YORK (Reuters) -Global eq...
1,AAPL,2021-10-29,Apple objects to links to outside payments ahe...,By Stephen Nellis (Reuters) - Apple Inc (NASDA...
2,AAPL,2021-10-30,Cuomo attorney says sheriff leaked grand jury ...,By Tim Reid LOS ANGELES (Reuters) - An attorne...
3,AAPL,2021-10-31,Top 5 Things to Watch in Markets in the Week A...,by Daniel Shvartsman Despite high-profile earn...
4,AAPL,2021-11-01,Apple cuts iPad production to feed chips to iP...,(Reuters) - Apple Inc (NASDAQ: ) has cut back ...
...,...,...,...,...
5343,NVDA,2025-04-26,Is AI revolution under threat from tariffs? By...,Investing.com -- The breakneck acceleration of...
5344,NVDA,2025-04-27,Nvidia dips on report of AI chip competition f...,Updates to clarify shares fell in 24 hours tra...
5345,NVDA,2025-04-28,"Trump to tout US investments from Nvidia, J&J,...",By David Shepardson WASHINGTON (Reuters) -CEOs...
5346,NVDA,2025-04-29,TSMC breaks ground on third chip facility in A...,Investing.com-- Taiwan Semiconductor Manufactu...


In [52]:
import yfinance as yf
import pandas as pd

price_dfs = []
for tk in TICKERS:
    p = yf.download(tk, start=START_DATE, end=END_DATE)

    if isinstance(p.columns, pd.MultiIndex):
        p.columns = p.columns.get_level_values(0)

    p = p.rename(columns={
        'Open':  'open_t','High': 'high_t',
        'Low':   'low_t', 'Close':'close_t',
        'Volume':'volume_t'
    })

    # Moving averages
    p['MA5_t']        = p['close_t'].rolling(5).mean()
    p['MA10_t']       = p['close_t'].rolling(10).mean()
    # Momentum 5-day
    p['momentum5_t']  = p['close_t'] - p['close_t'].shift(5)
    # ATR14
    hl = p['high_t'] - p['low_t']
    hc = (p['high_t'] - p['close_t'].shift(1)).abs()
    lc = (p['low_t']  - p['close_t'].shift(1)).abs()
    tr = pd.concat([hl,hc,lc], axis=1).max(axis=1)
    p['ATR14_t']      = tr.rolling(14).mean()
    # RSI14
    delta   = p['close_t'].diff()
    gain    = delta.clip(lower=0)
    loss    = -delta.clip(upper=0)
    avg_g   = gain.rolling(14).mean()
    avg_l   = loss.rolling(14).mean()
    rs      = avg_g / avg_l
    p['RSI14_t']      = 100 - (100/(1+rs))
    # MACD histogram
    ema12    = p['close_t'].ewm(span=12,adjust=False).mean()
    ema26    = p['close_t'].ewm(span=26,adjust=False).mean()
    macd     = ema12 - ema26
    signal   = macd.ewm(span=9,adjust=False).mean()
    p['MACD_hist_t']  = macd - signal

    # Target & label
    p['close_t+1']        = p['close_t'].shift(-1)
    p['future_return_1d'] = (p['close_t+1'] - p['close_t'])/p['close_t']
    p['label'] = p['future_return_1d'].apply(
        lambda r: 2 if r>TH_UP else 0 if r<TH_DOWN else 1
    )
    # Drop rows with NaN in required cols
    req = ['open_t','high_t','low_t','close_t','volume_t',
           'MA5_t','MA10_t','momentum5_t','ATR14_t','RSI14_t','MACD_hist_t',
           'future_return_1d','label']
    p = p.dropna(subset=req)
    p = p.reset_index().rename(columns={'Date':'date'})
    p['date']   = p['date'].dt.date
    p['ticker'] = tk
    price_dfs.append(p)

df_price = pd.concat(price_dfs, ignore_index=True)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [53]:
df = pd.merge(agg_news, df_price,
              on=['ticker','date'], how='inner')

In [58]:
import numpy as np
from sklearn.model_selection import train_test_split

df['ticker_idx'] = df['ticker'].astype('category').cat.codes
num_tickers     = df['ticker_idx'].nunique()

# 6. Scale tabular features
tab_cols = ['open_t','high_t','low_t','close_t','volume_t',
            'MA5_t','MA10_t','momentum5_t','ATR14_t','RSI14_t','MACD_hist_t']
scaler = StandardScaler()
df[tab_cols] = scaler.fit_transform(df[tab_cols])

# 7. Split train/test (ตามลำดับเวลา)
df = df.sort_values(['ticker','date']).reset_index(drop=True)
train_df, test_df = train_test_split(df, test_size=0.2, shuffle=False)

In [59]:
class NewsStockDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df.reset_index(drop=True)
        self.tk = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        row = self.df.iloc[i]
        # tokenize news text
        text = row['title'] + ' ' + row['body_text']
        toks = self.tk(text,
                      padding='max_length',
                      truncation=True,
                      max_length=MAX_LENGTH,
                      return_tensors='pt')
        # tabular features
        tab = torch.from_numpy(
            row[tab_cols].values.astype(np.float32)
        )
        # ticker embedding index
        ti = torch.tensor(row['ticker_idx'], dtype=torch.long)
        # label
        lbl = torch.tensor(row['label'], dtype=torch.long)
        return {
            'input_ids':      toks.input_ids.squeeze(0),
            'attention_mask': toks.attention_mask.squeeze(0),
            'tabular':        tab,
            'ticker_idx':     ti,
            'label':          lbl
        }

In [60]:
tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER, use_fast=True)
train_ds  = NewsStockDataset(train_df, tokenizer)
test_ds   = NewsStockDataset(test_df,  tokenizer)
train_dl  = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_dl   = DataLoader(test_ds,  batch_size=BATCH_SIZE)

In [63]:
class MultiModalModel(nn.Module):
    def __init__(self, transformer_name, tab_dim, num_tickers, ticker_emb_dim):
        super().__init__()
        # text encoder
        self.text_enc   = AutoModel.from_pretrained(transformer_name)
        txt_dim         = self.text_enc.config.hidden_size
        # tabular MLP
        self.tab_mlp    = nn.Sequential(
            nn.Linear(tab_dim, 64), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(64, 32),      nn.ReLU(), nn.Dropout(0.2)
        )
        # ticker embedding
        self.ticker_emb = nn.Embedding(num_tickers, ticker_emb_dim)
        # fusion & classifier
        self.classifier = nn.Sequential(
            nn.Linear(txt_dim + 32 + ticker_emb_dim, 128),
            nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(128, 3)
        )

    def forward(self, ids, mask, tab, ti):
        txt = self.text_enc(input_ids=ids, attention_mask=mask)
        h_text = txt.pooler_output             # (B, txt_dim)
        h_tab  = self.tab_mlp(tab)             # (B, 32)
        h_tk   = self.ticker_emb(ti)           # (B, ticker_emb_dim)
        x      = torch.cat([h_text, h_tab, h_tk], dim=1)
        return self.classifier(x)

device   = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model    = MultiModalModel(
    TRANSFORMER, tab_dim=len(tab_cols),
    num_tickers=num_tickers,
    ticker_emb_dim=TICKER_EMB_DIM
).to(device)
optimizer = AdamW(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()



In [65]:
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for b in train_dl:
        optimizer.zero_grad()
        ids  = b['input_ids'].to(device)
        m    = b['attention_mask'].to(device)
        tab  = b['tabular'].to(device)
        ti   = b['ticker_idx'].to(device)
        lbl  = b['label'].to(device)
        logits = model(ids, m, tab, ti)
        loss   = criterion(logits, lbl)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch {epoch+1}/{EPOCHS} — Loss {total_loss/len(train_dl):.4f}')

KeyboardInterrupt: 

In [None]:
model.eval()
correct = total = 0
with torch.no_grad():
    for b in test_dl:
        ids  = b['input_ids'].to(device)
        m    = b['attention_mask'].to(device)
        tab  = b['tabular'].to(device)
        ti   = b['ticker_idx'].to(device)
        lbl  = b['label'].to(device)
        preds = model(ids, m, tab, ti).argmax(dim=1)
        correct += (preds == lbl).sum().item()
        total   += lbl.size(0)
print(f'Test Accuracy: {correct/total:.2%}')

Test Accuracy: 44.14%
