In [2]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import (
    AutoTokenizer, AutoModel, AdamW,
    get_linear_schedule_with_warmup
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
SEED = 42
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
set_seed(SEED)

In [4]:
TICKERS        = ['NVDA','AAPL','AMZN','GOOGL','MSFT','META','TSLA']
START_DATE     = '2009-07-01'
END_DATE       = '2025-05-03'
TRANSFORMER    = 'yiyanghkust/finbert-tone'
MAX_LENGTH     = 256
BATCH_SIZE     = 16
LR             = 2e-5
EPOCHS         = 10
TH_UP, TH_DOWN = 0.005, -0.005
TICKER_EMB_DIM = 16
PATIENCE       = 3
OUTPUT_DIR     = './model_output'
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [5]:
df_nvda  = pd.read_pickle('data/NVDA_article_sentiments.pkl')
df_appl  = pd.read_pickle('data/AAPL_article_sentiments.pkl')
df_amzn  = pd.read_pickle('data/AMZN_article_sentiments.pkl')
df_googl = pd.read_pickle('data/GOOGL_article_sentiments.pkl')
df_msft = pd.read_pickle('data/MSFT_article_sentiments.pkl')
df_meta = pd.read_pickle('data/META_article_sentiments.pkl')
df_tsla = pd.read_pickle('data/TSLA_article_sentiments.pkl')
df_news  = pd.concat([df_nvda, df_appl, df_amzn, df_googl,df_msft,df_meta,df_tsla], axis=0, ignore_index=True)

df_news['date'] = pd.to_datetime(df_news['publish_datetime']).dt.date
# เราจะใช้เฉพาะ title+body_text concatenation แค่ครั้งเดียว
agg_news = df_news.groupby(['ticker','date']).agg({
    'title':     lambda ts: ' '.join(ts),
    'body_text': lambda bs: ' '.join(bs)
}).reset_index()

In [6]:
agg_news

Unnamed: 0,ticker,date,title,body_text
0,AAPL,2021-10-28,"Global stocks fall, U.S. dollar climbs on infl...",By Chibuike Oguh NEW YORK (Reuters) -Global eq...
1,AAPL,2021-10-29,Apple objects to links to outside payments ahe...,By Stephen Nellis (Reuters) - Apple Inc (NASDA...
2,AAPL,2021-10-30,Cuomo attorney says sheriff leaked grand jury ...,By Tim Reid LOS ANGELES (Reuters) - An attorne...
3,AAPL,2021-10-31,Top 5 Things to Watch in Markets in the Week A...,by Daniel Shvartsman Despite high-profile earn...
4,AAPL,2021-11-01,Apple cuts iPad production to feed chips to iP...,(Reuters) - Apple Inc (NASDAQ: ) has cut back ...
...,...,...,...,...
9154,TSLA,2025-04-30,Tesla stock gains after denying CEO search rep...,Investing.com -- Tesla (NASDAQ: ) denied an ov...
9155,TSLA,2025-05-01,Analysis-Tesla without Musk? Board faces uniqu...,"By Rachael Levy, Abhirup Roy, Isla Binnie (Reu..."
9156,TSLA,2025-05-02,"Tesla’s Italy car registrations rise in April,...",Investing.com -- Tesla (NASDAQ: ) has seen an ...
9157,TSLA,2025-05-03,Can Tesla help the U.S. catch up to China in t...,Investing.com -- Tesla (NASDAQ: ) could play a...


In [13]:
import yfinance as yf

price_dfs = []
for tk in TICKERS:
    p = yf.download(tk, start=START_DATE, end=END_DATE, auto_adjust=False)
    # Flatten columns if MultiIndex
    if isinstance(p.columns, pd.MultiIndex):
        p.columns = p.columns.get_level_values(0)
    p = p.rename(columns={
        'Open':'open_t','High':'high_t','Low':'low_t',
        'Close':'close_t','Volume':'volume_t'
    })
    # MA, Momentum, ATR, RSI, MACD_hist
    p['MA5_t']       = p['close_t'].rolling(5).mean()
    p['MA10_t']      = p['close_t'].rolling(10).mean()
    p['momentum5_t'] = p['close_t'] - p['close_t'].shift(5)
    hl = p['high_t'] - p['low_t']
    hc = (p['high_t'] - p['close_t'].shift(1)).abs()
    lc = (p['low_t']  - p['close_t'].shift(1)).abs()
    tr = pd.concat([hl,hc,lc],axis=1).max(axis=1)
    p['ATR14_t']     = tr.rolling(14).mean()
    delta = p['close_t'].diff()
    gain  = delta.clip(lower=0); loss = -delta.clip(upper=0)
    avg_g = gain.rolling(14).mean(); avg_l = loss.rolling(14).mean()
    rs    = avg_g/avg_l
    p['RSI14_t']     = 100 - (100/(1+rs))
    ema12 = p['close_t'].ewm(span=12,adjust=False).mean()
    ema26 = p['close_t'].ewm(span=26,adjust=False).mean()
    macd  = ema12 - ema26
    signal= macd.ewm(span=9,adjust=False).mean()
    p['MACD_hist_t'] = macd - signal
    # target label
    p['close_t+1']        = p['close_t'].shift(-1)
    p['future_return_1d'] = (p['close_t+1'] - p['close_t'])/p['close_t']
    p['label'] = p['future_return_1d'].apply(
        lambda r: 2 if r>TH_UP else 0 if r<TH_DOWN else 1
    )
    req = ['open_t','high_t','low_t','close_t','volume_t',
           'MA5_t','MA10_t','momentum5_t','ATR14_t','RSI14_t','MACD_hist_t',
           'future_return_1d','label']
    p = p.dropna(subset=req)
    p = p.reset_index().rename(columns={'Date':'date'})
    p['date']   = p['date'].dt.date
    p['ticker'] = tk
    price_dfs.append(p)


df_price = pd.concat(price_dfs, ignore_index=True)

[*********************100%***********************]  1 of 1 completed

1 Failed download:
['NVDA']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['AAPL']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['AMZN']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['GOOGL']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['MSFT']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['META']: YFRateLimitError('Too Many Requests. Rate lim

In [8]:
df = pd.merge(agg_news, df_price, on=['ticker','date'], how='inner')

In [9]:
df

Unnamed: 0,ticker,date,title,body_text,Adj Close,close_t,high_t,low_t,open_t,volume_t,MA5_t,MA10_t,momentum5_t,ATR14_t,RSI14_t,MACD_hist_t,close_t+1,future_return_1d,label


In [8]:
df['ticker_idx'] = df['ticker'].astype('category').cat.codes
num_tickers     = df['ticker_idx'].nunique()

# set device (GPU if available, else CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# tabular cols
tab_cols = ['open_t','high_t','low_t','close_t','volume_t',
            'MA5_t','MA10_t','momentum5_t','ATR14_t','RSI14_t','MACD_hist_t']

# scale tabular
scaler = StandardScaler()
df[tab_cols] = scaler.fit_transform(df[tab_cols])

# pre-tokenize ALL text (title+body) เพื่อไม่ tokenize ซ้ำในแต่ละ batch
tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER, use_fast=True)
texts = (df['title'] + ' ' + df['body_text']).tolist()
enc = tokenizer(texts, padding=True, truncation=True,
                max_length=MAX_LENGTH, return_tensors='pt')

df['input_ids']      = enc['input_ids'].tolist()
df['attention_mask']= enc['attention_mask'].tolist()

# train/test split ตามวัน (time-series)
df = df.sort_values(['date','ticker']).reset_index(drop=True)
train_df, test_df = train_test_split(df, test_size=0.2, shuffle=False)

# compute class weights
class_counts = train_df['label'].value_counts().sort_index()
class_weights= torch.tensor(class_counts.sum()/class_counts.values,
                            dtype=torch.float).to(device)

In [9]:
class NewsStockDataset(Dataset):
    def __init__(self, df):
        self.df = df.reset_index(drop=True)
    def __len__(self):
        return len(self.df)
    def __getitem__(self, i):
        row = self.df.iloc[i]
        return {
            'input_ids':      torch.tensor(row['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(row['attention_mask'], dtype=torch.long),
            'tabular':        torch.tensor(row[tab_cols].values, dtype=torch.float),
            'ticker_idx':     torch.tensor(row['ticker_idx'], dtype=torch.long),
            'labels':         torch.tensor(row['label'], dtype=torch.long)
        }

train_ds = NewsStockDataset(train_df)
test_ds  = NewsStockDataset(test_df)

# weighted sampler to combat class imbalance
sample_weights = train_df['label'].map(
    lambda x: class_counts.sum()/class_counts[x]
).values
sampler = WeightedRandomSampler(sample_weights,
                                num_samples=len(sample_weights),
                                replacement=True)

train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE,
                      sampler=sampler, num_workers=2)
test_dl  = DataLoader(test_ds,  batch_size=BATCH_SIZE,
                      shuffle=False, num_workers=2)

In [10]:
class MultiModalModel(nn.Module):
    def __init__(self, transformer_name, tab_dim, num_tickers, ticker_emb_dim):
        super().__init__()
        # text encoder
        self.text_enc   = AutoModel.from_pretrained(transformer_name)
        self.text_enc.gradient_checkpointing_enable()   # ลด memory
        txt_dim         = self.text_enc.config.hidden_size
        # tabular MLP
        self.tab_mlp    = nn.Sequential(
            nn.Linear(tab_dim, 64), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(64, 32),      nn.ReLU(), nn.Dropout(0.2)
        )
        # ticker embedding
        self.ticker_emb = nn.Embedding(num_tickers, ticker_emb_dim)
        # classifier
        self.classifier = nn.Sequential(
            nn.Linear(txt_dim+32+ticker_emb_dim, 128),
            nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(128, 3)
        )
    def forward(self, input_ids, attention_mask, tabular, ticker_idx):
        txt = self.text_enc(input_ids=input_ids,
                            attention_mask=attention_mask)
        h_text = txt.pooler_output
        h_tab  = self.tab_mlp(tabular)
        h_tk   = self.ticker_emb(ticker_idx)
        x      = torch.cat([h_text,h_tab,h_tk], dim=1)
        return self.classifier(x)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model  = MultiModalModel(TRANSFORMER, len(tab_cols),
             num_tickers, TICKER_EMB_DIM).to(device)

# optimizer + scheduler + loss + scaler
optimizer = AdamW(model.parameters(), lr=LR, weight_decay=0.01)
total_steps = EPOCHS * len(train_dl)
scheduler   = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1*total_steps),
    num_training_steps=total_steps
)
criterion = nn.CrossEntropyLoss(weight=class_weights)
scaler    = torch.cuda.amp.GradScaler()

  scaler    = torch.cuda.amp.GradScaler()


In [None]:
best_f1, epochs_no_improve = 0, 0
for epoch in range(1, EPOCHS+1):
    model.train(); total_loss = 0
    for batch in train_dl:
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            logits = model(
                batch['input_ids'].to(device),
                batch['attention_mask'].to(device),
                batch['tabular'].to(device),
                batch['ticker_idx'].to(device)
            )
            loss = criterion(logits, batch['labels'].to(device))
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        total_loss += loss.item()
    # validation
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in test_dl:
            logits = model(
                batch['input_ids'].to(device),
                batch['attention_mask'].to(device),
                batch['tabular'].to(device),
                batch['ticker_idx'].to(device)
            )
            preds = logits.argmax(dim=1).cpu().numpy()
            all_preds.append(preds)
            all_labels.append(batch['labels'].numpy())
    all_preds  = np.concatenate(all_preds)
    all_labels = np.concatenate(all_labels)
    acc = accuracy_score(all_labels, all_preds)
    f1  = f1_score(all_labels, all_preds, average='weighted')
    print(f"Epoch {epoch} — train_loss: {total_loss/len(train_dl):.4f} — val_acc: {acc:.4f} — val_f1: {f1:.4f}")
    # early stopping
    if f1 > best_f1:
        best_f1 = f1
        epochs_no_improve = 0
        torch.save(model.state_dict(), os.path.join(OUTPUT_DIR,'best.pt'))
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= PATIENCE:
            print("Early stopping")
            break

In [None]:
model.load_state_dict(torch.load(os.path.join(OUTPUT_DIR,'best.pt')))
model.eval()