In [1]:
import pandas as pd
import yfinance as yf
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AdamW
from sklearn.preprocessing import StandardScaler


  from .autonotebook import tqdm as notebook_tqdm


In [6]:

# 1. Hyperparameters & config

# --- Data selection -------------------------------------------------------
TICKER = 'NVDA'
START_DATE = '2009-07-01'
END_DATE = '2025-05-03'

# --- Text encoder ---------------------------------------------------------
TRANSFORMER = 'yiyanghkust/finbert-tone'  # model id passed to from_pretrained()
MAX_LENGTH = 256                          # tokenizer max_length (pad / truncate)

# --- Optimisation ---------------------------------------------------------
BATCH_SIZE = 16
LR = 2e-5
EPOCHS = 3

# --- Labelling ------------------------------------------------------------
# NOTE(review): presumably the next-day-return thresholds for Buy / Sell;
# the labelling cell currently hard-codes the same two values — confirm.
TH_UP = 0.005
TH_DOWN = -0.005

In [4]:
# Load the article table from a local pickle. Later cells read its
# 'publish_datetime', 'ticker', 'title' and 'body_text' columns.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
df_news = pd.read_pickle('nvda_article_sentiments.pkl')

In [12]:
# Calendar day of publication (the time-of-day component is discarded).
df_news['date'] = df_news['publish_datetime'].dt.date


def _join_texts(parts):
    """Concatenate the text pieces of one group, separated by single spaces."""
    return ' '.join(parts)


# One row per (ticker, day): all titles / bodies of that day, space-joined.
agg_news = (
    df_news
    .groupby(['ticker', 'date'])
    .agg({'title': _join_texts, 'body_text': _join_texts})
    .reset_index()
)

In [20]:
import yfinance as yf
import pandas as pd

# 1. Download historical data
# Reuse the notebook-level config instead of re-declaring the same literals:
# the merge cell stamps TICKER onto this frame, so a second, independently
# edited ticker/date here could silently pair prices with the wrong news.
ticker     = TICKER
start_date = START_DATE
end_date   = END_DATE

df = yf.download(ticker, start=start_date, end=end_date)

# Fail fast: an empty frame would otherwise flow through every step below
# and surface only as an empty training set.
if df.empty:
    raise ValueError(
        f'No price data returned for {ticker} between {start_date} and {end_date}'
    )

# 1.1 Flatten MultiIndex columns if present (level 0 carries the field names
#     that are renamed in the next step)
if isinstance(df.columns, pd.MultiIndex):
    df.columns = df.columns.get_level_values(0)

# 2. Rename columns for consistency (suffix _t = value observed on day t)
df = df.rename(columns={
    'Close':  'close_t',
    'Volume': 'volume_t',
    'High':   'high_t',
    'Low':    'low_t',
    'Open':   'open_t'
})

# 3. Moving Averages (simple means over the last 5 / 10 rows)
df['MA5_t']  = df['close_t'].rolling(window=5).mean()
df['MA10_t'] = df['close_t'].rolling(window=10).mean()

# 4. Momentum (5-day): close today minus close five rows earlier
df['momentum5_t'] = df['close_t'] - df['close_t'].shift(5)

# 5. ATR14 (14-day Average True Range)
# True range = largest of: high-low, |high - prev close|, |low - prev close|.
high_low        = df['high_t'] - df['low_t']
high_close_prev = (df['high_t'] - df['close_t'].shift(1)).abs()
low_close_prev  = (df['low_t']  - df['close_t'].shift(1)).abs()
true_range      = pd.concat([high_low, high_close_prev, low_close_prev], axis=1).max(axis=1)
df['ATR14_t']   = true_range.rolling(window=14).mean()

# 6. RSI14 (14-day Relative Strength Index)
# Averages are plain rolling means of the up-moves and down-moves.
delta     = df['close_t'].diff()
gain      = delta.where(delta > 0, 0)
loss      = -delta.where(delta < 0, 0)
avg_gain  = gain.rolling(window=14).mean()
avg_loss  = loss.rolling(window=14).mean()
rs        = avg_gain / avg_loss
df['RSI14_t'] = 100 - (100 / (1 + rs))

# 7. MACD Histogram: (EMA12 - EMA26) minus its 9-span EMA signal line
ema12            = df['close_t'].ewm(span=12, adjust=False).mean()
ema26            = df['close_t'].ewm(span=26, adjust=False).mean()
macd             = ema12 - ema26
signal           = macd.ewm(span=9, adjust=False).mean()
df['MACD_hist_t'] = macd - signal

# 8. Compute future_return_1d and label
# close_t+1 is the NEXT row's close, so the last row has no target (NaN).
df['close_t+1']        = df['close_t'].shift(-1)
df['future_return_1d'] = (df['close_t+1'] - df['close_t']) / df['close_t']

# Thresholds come from the config cell so labelling and config cannot diverge.
th_up   = TH_UP
th_down = TH_DOWN
def assign_label(r):
    """Map a next-day return to a class id: 2 = Buy, 0 = Sell, 1 = Hold.

    A NaN return fails both comparisons and falls through to Hold; such rows
    are removed by the dropna below because 'future_return_1d' is required.
    """
    if r > th_up:   return 2  # Buy
    elif r < th_down: return 0  # Sell
    else:           return 1  # Hold
df['label'] = df['future_return_1d'].apply(assign_label)

# 9. Drop any row with NaN in critical columns
# (rolling-window warm-up rows at the start, and the final row without a target)
required_cols = [
    'close_t','volume_t','MA5_t','MA10_t',
    'momentum5_t','ATR14_t','RSI14_t','MACD_hist_t',
    'close_t+1','future_return_1d','label'
]
df = df.dropna(subset=required_cols)


# Preview the final DataFrame
print(df[['close_t','close_t+1','future_return_1d','label']].tail())

[*********************100%***********************]  1 of 1 completed

Price          close_t   close_t+1  future_return_1d  label
Date                                                       
2025-04-25  111.010002  108.730003         -0.020539      0
2025-04-28  108.730003  109.019997          0.002667      1
2025-04-29  109.019997  108.919998         -0.000917      1
2025-04-30  108.919998  111.610001          0.024697      2
2025-05-01  111.610001  114.500000          0.025894      2





In [21]:
# Snapshot the engineered price frame under its own name. `df` is reassigned
# to the merged news+price frame in the merge cell below, so re-running this
# cell after that merge would copy the merged frame instead of the prices.
df_price = df.copy()

In [22]:
# — Prepare df_price before the merge —
# 1) If df_price still has MultiIndex columns (e.g. from a multi-ticker download),
#    flatten them to a single level holding the field names. Level 0 is used so
#    this agrees with the flattening applied right after the download; taking
#    the last level instead would not match the renamed field columns.
if isinstance(df_price.columns, pd.MultiIndex):
    df_price.columns = df_price.columns.get_level_values(0)

# 2) reset_index() moves the index (Date) into a regular column
df_price = df_price.reset_index()

# 3) Rename the 'Date' column to 'date' to match agg_news
df_price = df_price.rename(columns={'Date': 'date'})

# 4) Convert date to datetime.date so the key type matches agg_news['date']
df_price['date'] = pd.to_datetime(df_price['date']).dt.date

# 5) Add the ticker column (single-stock case)
df_price['ticker'] = TICKER

# — Then merge —
# Inner join: keeps only (ticker, date) pairs present in BOTH frames.
# validate= makes pandas raise if either side has duplicate keys, instead of
# silently multiplying rows.
df = pd.merge(
    agg_news,
    df_price,
    on=['ticker','date'],
    how='inner',
    validate='one_to_one'
)


In [25]:
# Inspect the merged news + price frame.
# NOTE(review): the saved output of this cell shows standardized feature values
# (e.g. close_t around -0.87) although the scaling cell appears further down —
# presumably stale output from out-of-order execution; re-run top to bottom
# before relying on it.
df

Unnamed: 0,ticker,date,title,body_text,close_t,high_t,low_t,open_t,volume_t,MA5_t,MA10_t,momentum5_t,ATR14_t,RSI14_t,MACD_hist_t,close_t+1,future_return_1d,label
0,NVDA,2009-08-07,US STOCKS-July jobs data lifts stock indexes m...,"* Retailers, financials advance; AIG's stock j...",-0.872970,-0.872629,-0.872799,-0.871918,2.971420,-0.872278,-0.870829,-0.073002,-0.791594,1.064375,0.018069,0.303986,-0.032823,0
1,NVDA,2011-05-13,"US STOCKS-Wall Street edges down, eyes confide...",* April U.S. CPI rises in line with expectatio...,-0.870563,-0.870024,-0.870313,-0.869081,5.068027,-0.868960,-0.867545,-0.082949,-0.788758,-0.551580,0.016277,0.405773,-0.030668,0
2,NVDA,2011-08-11,US STOCKS-Wall St roars back but selling may r...,* Europe rebounds on upcoming Sarkozy-Merkel m...,-0.873129,-0.872961,-0.873437,-0.872705,1.884704,-0.872631,-0.870667,-0.077165,-0.788913,-0.888866,0.012973,0.295274,-0.039523,0
3,NVDA,2011-08-12,"US STOCKS-Wall St rises on lower volume, mixed...","* Retail sales top view, consumer sentiment wo...",-0.873410,-0.872302,-0.873258,-0.871385,9.005445,-0.872638,-0.870717,-0.077597,-0.787568,-0.970405,0.013653,0.306508,0.038044,2
4,NVDA,2011-08-23,US STOCKS-Hopes for another Fed rescue drive 3...,* Weak data triggers more bets for Fed action ...,-0.873224,-0.873142,-0.873507,-0.872763,1.412184,-0.872833,-0.871089,-0.075921,-0.786803,-0.783904,0.016169,0.298942,-0.014361,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1443,NVDA,2025-04-24,"Asia tech, chipmaking stocks rise on Alphabet ...",Investing.com-- Asian technology and chipmakin...,1.576047,1.531144,1.550546,1.505344,-1.001198,1.463892,1.562778,0.380280,3.175337,-0.109102,0.550101,111.010002,0.043033,2
1444,NVDA,2025-04-25,Why is Strategy Up While Tech is Down? Michael...,"U.Today - Michael Saylor, the CEO and co-found...",1.681747,1.652901,1.612308,1.583001,-0.899450,1.507941,1.570756,2.167824,3.057370,0.448907,1.525581,108.730003,-0.020539,0
1445,NVDA,2025-04-28,"Trump to tout US investments from Nvidia, J&J,...",By David Shepardson WASHINGTON (Reuters) -CEOs...,1.629128,1.617822,1.619145,1.648446,-1.045284,1.562633,1.565654,2.710218,2.715604,0.202234,1.859826,109.019997,0.002667,1
1446,NVDA,2025-04-29,TSMC breaks ground on third chip facility in A...,Investing.com-- Taiwan Semiconductor Manufactu...,1.635821,1.613975,1.652619,1.601897,-1.170626,1.609504,1.561734,2.311675,2.424731,0.287578,2.057643,108.919998,-0.000917,1


In [41]:
import numpy as np
# 7. Scale tabular features
tab_cols = [
    'open_t','high_t','low_t','close_t','volume_t',
    'MA5_t','MA10_t','momentum5_t','ATR14_t','RSI14_t','MACD_hist_t'
]
df[tab_cols] = df[tab_cols].astype(np.float32)

# Fit the scaler on the training rows only, then apply it to every row.
# Fitting on the full frame would let the mean/std of the held-out rows leak
# into the training features. The 0.8 fraction must stay in sync with the
# positional train/test split performed in the DataLoader cell.
n_train_rows = int(len(df) * 0.8)
# Degenerate frames (fewer than 2 rows) have no training slice; fall back to all rows.
fit_rows = df.iloc[:n_train_rows] if n_train_rows > 0 else df

scaler = StandardScaler()
scaler.fit(fit_rows[tab_cols])
df[tab_cols] = scaler.transform(df[tab_cols])

In [48]:
# 8. PyTorch Dataset & DataLoader
class NewsStockDataset(Dataset):
    """Row-wise dataset pairing tokenized news text with tabular features.

    Each item is built from one row of ``df``: the 'title' and 'body_text'
    strings are joined with a space and tokenized, the columns listed in
    ``tabular_cols`` become a float32 vector, and 'label' becomes a long scalar.

    Args:
        df: DataFrame with 'title', 'body_text', 'label' and the tabular columns.
            Its index is reset so items are addressed by position.
        tokenizer: Callable accepting ``(text, padding=..., truncation=...,
            max_length=..., return_tensors='pt')`` and returning an object with
            ``input_ids`` and ``attention_mask`` attributes.
        tabular_cols: Optional list of feature column names. Defaults to the
            notebook-level ``tab_cols``.
        max_length: Optional token length to pad/truncate to. Defaults to the
            notebook-level ``MAX_LENGTH`` (looked up when an item is fetched).
    """

    def __init__(self, df, tokenizer, tabular_cols=None, max_length=None):
        self.df         = df.reset_index(drop=True)
        self.tokenizer  = tokenizer
        # Explicit columns make the dataset usable without the notebook global.
        self.tabular_cols = tab_cols if tabular_cols is None else list(tabular_cols)
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = row['title'] + ' ' + row['body_text']
        max_len = MAX_LENGTH if self.max_length is None else self.max_length
        tokens = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=max_len,
            return_tensors='pt'
        )
        # Tabular part: force the dtype here, because a row taken from a
        # mixed-type frame comes back as an object array.
        vals = np.asarray(row[self.tabular_cols].values, dtype=np.float32)
        tab  = torch.from_numpy(vals)

        # Label
        label = torch.tensor(int(row['label']), dtype=torch.long)
        return {
            # squeeze(0) drops the leading dimension so the DataLoader can
            # stack items into a batch.
            'input_ids':     tokens.input_ids.squeeze(0),
            'attention_mask':tokens.attention_mask.squeeze(0),
            'tabular':       tab,
            'label':         label
        }

In [49]:
# Tokenizer loaded from the same checkpoint id as the text encoder.
tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER, use_fast=True)

# Positional 80/20 split: the first 80% of rows train, the rest test.
# NOTE(review): this is a split "by date" only if df is in date order —
# presumably true after the groupby/merge above; verify.
split = int(len(df)*0.8)
train_df = df.iloc[:split]
test_df  = df.iloc[split:]

train_ds, test_ds = (
    NewsStockDataset(part, tokenizer) for part in (train_df, test_df)
)

# Only the training loader shuffles.
train_dl = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_dl  = DataLoader(dataset=test_ds,  batch_size=BATCH_SIZE)


In [54]:
class MultiModalModel(nn.Module):
    """Three-way classifier over a text embedding fused with tabular features.

    The pretrained encoder's ``pooler_output`` is concatenated with a 32-dim
    embedding of the tabular vector and fed to a small MLP head that emits
    3 logits.

    Args:
        transformer_name: Checkpoint id handed to ``AutoModel.from_pretrained``.
        tab_input_dim: Number of tabular features per sample.
    """

    def __init__(self, transformer_name, tab_input_dim):
        super().__init__()
        self.text_encoder = AutoModel.from_pretrained(transformer_name)
        hidden_size = self.text_encoder.config.hidden_size

        # Tabular branch: tab_input_dim -> 64 -> 32
        tab_layers = [
            nn.Linear(tab_input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
        ]
        self.tab_mlp = nn.Sequential(*tab_layers)

        # Fusion head: (hidden_size + 32) -> 128 -> 3 logits
        head_layers = [
            nn.Linear(hidden_size + 32, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 3),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, tabular):
        """Return class logits for a batch of (token ids, mask, tabular) inputs."""
        encoded = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Text embedding first, then the tabular embedding, joined feature-wise.
        fused = torch.cat((encoded.pooler_output, self.tab_mlp(tabular)), dim=1)
        return self.classifier(fused)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model  = MultiModalModel(TRANSFORMER, len(tab_cols)).to(device)

# Use PyTorch's own AdamW: the AdamW class exported by `transformers` is
# deprecated upstream in favour of torch.optim.AdamW. weight_decay=0.0 and
# eps=1e-6 are passed explicitly to mirror the defaults of the implementation
# this replaces (torch's own defaults are 0.01 and 1e-8).
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=0.0, eps=1e-6)

# Cross-entropy over the 3 class logits returned by the model.
criterion = nn.CrossEntropyLoss()



In [None]:
# 10. Training loop
for epoch in range(EPOCHS):
    model.train()  # training mode (dropout active)
    total_loss = 0
    for batch in train_dl:
        # Move every tensor of the batch onto the model's device.
        on_device = {key: tensor.to(device) for key, tensor in batch.items()}

        optimizer.zero_grad()
        logits = model(
            on_device['input_ids'],
            on_device['attention_mask'],
            on_device['tabular'],
        )
        loss = criterion(logits, on_device['label'])
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    # Mean of the per-batch losses for this epoch.
    print(f'Epoch {epoch+1}/{EPOCHS} — Loss: {total_loss/len(train_dl):.4f}')

In [None]:
# 11. Evaluation
model.eval()  # inference mode (dropout off)
correct, total = 0, 0
with torch.no_grad():  # no gradients needed for scoring
    for batch in test_dl:
        lbl = batch['label'].to(device)
        logits = model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
            batch['tabular'].to(device),
        )
        # Predicted class = index of the largest logit.
        preds = torch.argmax(logits, dim=1)
        correct += int((preds == lbl).sum())
        total   += len(lbl)
print(f'Test Accuracy: {correct/total:.2%}')