In [1]:
from dataset import load_and_process_data, create_stock_dataset
from model import Decoder, combined_rotary_embedding, custom_mask, causal_mask
from bit.bitlinear import replace_with_bitnet_linear
from bit.RMSNorm import RMSNorm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

In [2]:
# Window sizes: `days` is passed as seq_length when the dataset is built,
# `past_days` is passed to the mask builder and used to index targets.
days = 30
past_days = 29

# Hyperparameters
batch_size = 8
steps = 1
lr = 1e-4

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [3]:
# Load the processed data and build the dataset with seq_length=days.
data = load_and_process_data()
stock_dataset = create_stock_dataset(data, seq_length=days)

# Batches are served in dataset order (no shuffling) from pinned host memory.
train_loader = DataLoader(
    stock_dataset,
    batch_size=batch_size,
    shuffle=False,
    pin_memory=True,
)

In [4]:
# Sanity check: print the shape of element 0 of the first item the loader yields.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    print(first_batch[0].shape)

torch.Size([1650, 12])


In [5]:
# Number of batches the loader yields per pass (displayed as the cell output).
len(train_loader)

7155

In [6]:
# Model hyperparameters; unpacked into MyModel below and read again when the
# rotary embedding is built.
args = {
    "d_model": 128,
    "n_head": 4,
    "dim_feedforward": 128,
    "num_layers": 3,
    "dropout": 0.1,
}


class MyModel(nn.Module):
    """Linear projection -> RMSNorm -> Decoder -> scalar head on the last position.

    Args:
        d_model: Width of the projected features; also forwarded to the decoder.
        n_head: Forwarded to the decoder.
        dim_feedforward: Forwarded to the decoder.
        num_layers: Forwarded to the decoder.
        dropout: Forwarded to the decoder.
    """

    def __init__(self, d_model, n_head, dim_feedforward, num_layers, dropout):
        super().__init__()
        # The input projection expects 12 features in the last dimension.
        self.fc1 = nn.Linear(12, d_model)
        self.norm = RMSNorm(d_model)
        # Build the decoder from the constructor arguments rather than the
        # module-level `args` dict, so the instance is consistent with the
        # values it was actually created with.
        self.decoder = Decoder(
            d_model=d_model,
            n_head=n_head,
            dim_feedforward=dim_feedforward,
            num_layers=num_layers,
            dropout=dropout,
        )
        self.lm_head = nn.Linear(d_model, 1)

    def forward(self, x, positions_ids, mask):
        """Return one scalar prediction per batch element.

        Args:
            x: Input whose last dimension has 12 features
                (assumed (batch, seq, 12) -- TODO confirm).
            positions_ids: Passed through to the decoder unchanged.
            mask: Passed through to the decoder unchanged.

        Returns:
            Output of the 1-unit head applied to index -1 along dim 1 of the
            decoder output.
        """
        x = self.fc1(x)
        x = self.norm(x)
        x = self.decoder(x, positions_ids, mask)
        # Only the final position along dim 1 feeds the prediction head.
        x = x[:, -1, :]
        x = self.lm_head(x)
        return x


model = MyModel(**args)

In [7]:
# Rebind `model` to the result of replace_with_bitnet_linear; judging by the
# message it prints, this swaps the nn.Linear layers for BitLinear layers.
model = replace_with_bitnet_linear(model)

成功替换了所有 nn.Linear 层为 BitLinear 层。


In [8]:
# Total sequence length handed to both mask builders.
# NOTE(review): 55 looks like the number of rows per day (30 * 55 = 1650 matches
# the sample shape printed above) and 12 the feature count -- confirm.
seq_len = days * 55
causal = causal_mask(seq_len)
pos = custom_mask(seq_len, 12, past_days, 55)

# NOTE(review): the literal 30 equals `days` today; if it is meant to track
# `days`, it should reference it -- confirm against combined_rotary_embedding.
rotary_dim = args["d_model"] * 2 // args["n_head"]
positions_ids = combined_rotary_embedding(30, 55, rotary_dim, 500, 20000)

In [9]:
# Mean absolute error as the training objective.
criterion = nn.L1Loss()
# Adam over every model parameter, using the learning rate from the config cell.
optimizer = optim.Adam(params=model.parameters(), lr=lr)

In [10]:
# Move the masks and every tensor of the positional tuple onto `device`.
causal = causal.to(device=device)
pos = pos.to(device=device)
positions_ids = tuple(tensor.to(device=device) for tensor in positions_ids)

In [11]:
from accelerate import Accelerator

# Let accelerate choose the device and wrap the training objects.
accelerator = Accelerator()
device = accelerator.device

# prepare() returns its arguments in the order given; rebind all three names.
prepared = accelerator.prepare(model, optimizer, train_loader)
model, optimizer, train_loader = prepared

In [12]:
# Display where the prepared loader and the position mask live, to confirm
# they ended up on the same kind of device.
train_loader.device, pos.device


(device(type='cuda'), device(type='cuda', index=0))

In [None]:
# Training loop; tqdm renders a per-batch progress bar.

model.train()

# Number of target positions per batch; each one gets its own forward/backward
# pass and optimizer update. Hoisted because it is loop-invariant.
targets_per_batch = (days - past_days) * 55

for step in range(steps):
    total_loss = 0  # loss summed over every optimizer update in this step
    batch_count = 0  # number of optimizer updates (one per target position, not per loader batch)

    with tqdm(train_loader, desc=f"Step {step+1}/{steps}", unit="batch") as train_pbar:
        for x in train_pbar:  # tqdm tracks progress through train_loader
            batch_loss = 0  # loss summed over the updates made for this batch
            for i in range(targets_per_batch):
                optimizer.zero_grad()
                # Target: last feature at position past_days * 55 + i; unsqueeze
                # so its shape matches y_pred's trailing dimension of 1.
                y = x[:, past_days * 55 + i, -1].unsqueeze(-1)
                # Apply the i-th mask to the batch (named to avoid shadowing
                # the builtin `input`).
                masked_x = x * pos[i]
                y_pred = model(masked_x, positions_ids, causal)

                loss = criterion(y_pred, y)
                # The model and optimizer were wrapped by accelerator.prepare,
                # so the backward pass must go through the accelerator.
                accelerator.backward(loss)
                optimizer.step()

                # Read the scalar once: each .item() call forces a host sync.
                loss_value = loss.item()
                batch_loss += loss_value
                total_loss += loss_value
                batch_count += 1

            # Mean loss over this batch's updates, guarded like avg_loss below.
            avg_batch_loss = batch_loss / targets_per_batch if targets_per_batch > 0 else 0
            train_pbar.set_postfix(loss=f"{avg_batch_loss:.4f}")  # shown next to the progress bar

    avg_loss = total_loss / batch_count if batch_count > 0 else 0
    tqdm.write(f"Step {step+1}/{steps}, Average Loss: {avg_loss:.4f}")  # summary after each step


Step 1/1:   0%|          | 14/7155 [02:18<19:29:36,  9.83s/batch, loss=5.4365]