In [5]:
from model import build_gpt_model
from sklearn.model_selection import train_test_split
import torch
from dataset import GPTChatDataset
from tokenizer_utils import TokenizerEnVi
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import os

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SEQ_LEN = 256

In [7]:


from convert import load
lines = load()


In [8]:
print(len(lines))

145206


In [9]:
train_lines, test_lines = train_test_split(lines, test_size=0.2, random_state = 42)


In [10]:
print(len(train_lines))

116164


In [11]:
tokenizer = TokenizerEnVi()
tokenizer.build_vocab(train_lines)

pad_id = tokenizer.word2idx["<pad>"]

train_dataset = GPTChatDataset(train_lines, tokenizer, seq_len= SEQ_LEN )
test_dataset = GPTChatDataset(test_lines, tokenizer, seq_len= SEQ_LEN)



In [12]:
from statistics import mean

lengths = [len(tokenizer.tokenize(line)) for line in lines]
print("Trung b√¨nh:", mean(lengths), "| Max:", max(lengths), "| Min:", min(lengths))

Trung b√¨nh: 128.83127418977176 | Max: 1552 | Min: 4


In [13]:
import pickle

with open("saved/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)


In [14]:
def collate_fn(batch):
    decoder_input = torch.stack([item["decoder_input"] for item in batch])
    decoder_mask = torch.stack([item["decoder_mask"] for item in batch])
    label = torch.stack([item["label"] for item in batch])

    return {
        "decoder_input": decoder_input,      # (batch_size, seq_len)
        "decoder_mask": decoder_mask,        # (batch_size, 1, seq_len, seq_len)
        "label": label                       # (batch_size, seq_len)
    }

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [15]:
for word in tokenizer.word2idx:
    print(word)

<pad>
<unk>
<sos>
<eos>
USER
chi_ph√≠
s·ªëng
·ªü
vancouver
l√†
bao_nhi√™u
?
AI
sinh_ho·∫°t
thay_ƒë·ªïi
d·ª±a
tr√™n
m·ªôt_s·ªë
y·∫øu_t·ªë
bao_g·ªìm
nh√†
,
giao_th√¥ng
th·ª±c_ph·∫©m
v√†
c√°c
kh√°c
theo
numbeo
cung_c·∫•p
ch·ªâ_s·ªë
sinh_ho·∫°t_t√≠nh
ƒë·∫øn
th√°ng
nƒÉm
h√†ng
∆∞·ªõc_t√≠nh
cho
m·ªôt
ng∆∞·ªùi
m√†
kh√¥ng
c·∫ßn
thu√™
kho·∫£ng
cad
cƒÉn_h·ªô
ph√≤ng
ng·ªß
trung_t√¢m
th√†nh_ph·ªë
m·ªói
.
ƒëi·ªÅu
quan_tr·ªçng
l∆∞u_√Ω
ƒë√¢y
trung_b√¨nh
th·ª±c_t·∫ø
c√≥_th·ªÉ
l·ªëi
ho√†n_c·∫£nh
c√°_nh√¢n
x√°c_ƒë·ªãnh
c√¢u
tr·∫£_l·ªùi
ƒë√∫ng
t·ª´
l·ª±a_ch·ªçn
b√™n
d∆∞·ªõi
v√≠_d·ª•
v·ªÅ
ƒë·ªôc_quy·ªÅn
a
google
b
apple
c
microsoft
d
twitter
c√≥
trong
ho·∫∑c
thu·∫ßn
t√∫y
h√£y
ƒë∆∞a
ra
ngu·ªìn
d·ªØ_li·ªáu
ƒë∆∞·ª£c
s·ª≠_d·ª•ng
m√¥_h√¨nh_h·ªçc
m√°y
b·ªô
s∆∞u_t·∫≠p
h√¨nh_·∫£nh
ƒë·ªÉ
hu·∫•n_luy·ªán
nh·∫±m
nh·∫≠n_di·ªán
ƒë·ªëi_t∆∞·ª£ng
b·ª©c
·∫£nh
th·ª±c_hi·ªán
khu√¥n_m·∫∑t
ph√°t_hi·ªán
chuy·ªÉn_ƒë·ªông
vƒÉn_b·∫£n
t√†i_li·ªáu
video
√¢m_thanh
c·∫•u_tr√∫c
nh∆∞
gi√°
c·ªï_phi·∫øu
th·ªùi_ti·∫øt
t√¥i
ng·∫°n_ng·ªØ
ng√¥

In [16]:
model = build_gpt_model(len(tokenizer.word2idx), seq_len= SEQ_LEN).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [17]:
n_epochs = 10
best_val_loss = float("inf")
save_path     = "saved/best_model.pth"
os.makedirs("saved", exist_ok=True)
for epoch in range(n_epochs):
    model.train()
    total_train_loss = 0

    for batch in tqdm(train_loader, desc=f"[Epoch {epoch}] Validating"):
        decoder_input = batch["decoder_input"].to(device)
        decoder_mask = batch["decoder_mask"].to(device)
        labels = batch["label"].to(device)

        # Forward
        output = model(decoder_input, decoder_mask)  

        # Loss
        output = output.view(-1, output.shape[-1])
        labels = labels.view(-1)
        loss = criterion(output, labels)

        # Backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)
    
    
    # Evaluation
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in test_loader:
            decoder_input = batch["decoder_input"].to(device)
            decoder_mask = batch["decoder_mask"].to(device)
            labels = batch["label"].to(device)


            output = model(decoder_input, decoder_mask)  

            output = output.view(-1, output.shape[-1])
            labels = labels.view(-1)
            loss = criterion(output, labels)

            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(test_loader)
    
    print(f"[Epoch {epoch+1}] Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

    
    # Save model if best
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), save_path)
        print("Saved best model.")


[Epoch 0] Validating:  15%|‚ñà‚ñç        | 529/3631 [52:30<5:07:56,  5.96s/it]


KeyboardInterrupt: 