In [15]:
# Import Thư viện
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
import numpy as np
from collections import Counter

In [None]:
# Chuẩn bị dữ liệu

def load_data(filepath):
    with open(filepath, 'r') as f:
        data = json.load(f)
    return data

def create_label_map(data):
    unique_labels = set()
    for _, labels in data:
        for label in labels:
            unique_labels.add(label)
    label_map = {label: i for i, label in enumerate(unique_labels)}
    return label_map, list(unique_labels)


def encode_labels(labels, label_map):
    encoded_labels = np.zeros(len(label_map), dtype=int)
    for label in labels:
        encoded_labels[label_map[label]] = 1
    return encoded_labels


def build_vocab(data):
    words = []
    for sentence, _ in data:
        words.extend(sentence.lower().split())
    word_counts = Counter(words)
    vocab = {word: i + 2 for i, word in enumerate(word_counts)}
    vocab['<pad>'] = 0
    vocab['<unk>'] = 1
    return vocab


def encode_sentence(sentence, vocab):
    tokens = sentence.lower().split()
    encoded = [vocab.get(token, vocab['<unk>']) for token in tokens]
    return encoded

class IntentDataset(Dataset):
    def __init__(self, data, vocab, label_map, max_length):
        self.data = data
        self.vocab = vocab
        self.label_map = label_map
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence, labels = self.data[idx]
        encoded_sentence = encode_sentence(sentence, self.vocab)
        padded_sentence = self.pad_sequence(encoded_sentence, self.max_length)
        encoded_labels = encode_labels(labels, self.label_map)
        return {
            'input_ids': torch.tensor(padded_sentence, dtype=torch.long),
            'labels': torch.tensor(encoded_labels, dtype=torch.float)
        }

    def pad_sequence(self, seq, max_length):
        padded = seq[:max_length]
        padding = [self.vocab['<pad>']] * max(0, max_length - len(padded))
        return padded + padding

filepath = 'E://NLP//AS3//Code//all_samples.json'  
data = load_data(filepath)

label_map, all_labels = create_label_map(data)
print(f"Label map: {label_map}")
print(f"All labels: {all_labels}")

vocab = build_vocab(data)
vocab_size = len(vocab)
print(f"Vocab size: {vocab_size}")




Label map: {'attraction-find_attraction': 0, 'bus-find_bus': 1, 'police-find_police': 2, 'train-book_train': 3, 'hotel-book_hotel': 4, 'hotel-find_hotel': 5, 'hospital-find_hospital': 6, 'train-find_train': 7, 'taxi-find_taxi': 8, 'restaurant-book_restaurant': 9, 'restaurant-find_restaurant': 10}
All labels: ['attraction-find_attraction', 'bus-find_bus', 'police-find_police', 'train-book_train', 'hotel-book_hotel', 'hotel-find_hotel', 'hospital-find_hospital', 'train-find_train', 'taxi-find_taxi', 'restaurant-book_restaurant', 'restaurant-find_restaurant']
Vocab size: 7424


In [None]:
from sklearn.model_selection import train_test_split

#chia tập train,test,val

train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

max_length = 38 # câu dài nhất là 38 
train_dataset = IntentDataset(train_data, vocab, label_map, max_length)
val_dataset = IntentDataset(val_data, vocab, label_map, max_length)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [19]:
#Mô hình LSTM

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size, num_layers):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_size) # nhân 2 lần lên vì dùng bidirectional

    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        output = self.fc(hidden)
        return output


embedding_dim = 100
hidden_dim = 128
num_layers = 2 # phân loại câu nên 1 đến 2 layer thôi 
output_size = len(label_map)

model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_size, num_layers)

optimizer = optim.Adam(model.parameters(), lr=0.001)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

LSTMClassifier(
  (embedding): Embedding(7424, 100)
  (lstm): LSTM(100, 128, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=11, bias=True)
)

In [20]:
def evaluate_model(model, dataloader, device, all_labels):
    model.eval()
    all_preds = []
    all_targets = []
    total_loss = 0
    with torch.no_grad():
      for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids)
        # Sử dụng threshold để quyết định nhãn dự đoán. Do một câu có thể có nhiều nhãn nên với mỗi nhãn, xác suất > 0.5 được coi là nhãn đó xuất hiện.
        predictions = (torch.sigmoid(outputs) > 0.5).cpu().numpy() 
        targets = labels.cpu().numpy()
        loss = nn.BCEWithLogitsLoss()(outputs, labels).item()
        total_loss += loss
        all_preds.extend(predictions)
        all_targets.extend(targets)
    
    all_preds = np.array(all_preds)
    all_targets = np.array(all_targets)

    f1 = f1_score(all_targets, all_preds, average='micro')
    avg_loss = total_loss / len(dataloader)

    def get_predicted_labels(predictions, all_labels):
      predicted_labels = []
      for pred in predictions:
        predicted_labels_for_example = []
        for i, label_prob in enumerate(pred):
          if label_prob == 1:
            predicted_labels_for_example.append(all_labels[i])
        predicted_labels.append(predicted_labels_for_example)
      return predicted_labels
    
    predicted_labels = get_predicted_labels(all_preds, all_labels)
    
    print("Sample predictions:")
    for i in range(min(5, len(all_preds))):
        print(f"Predicted Labels: {predicted_labels[i]}")
        print(f"Actual Labels: {get_predicted_labels([all_targets[i]], all_labels)[0]}")

    return f1, avg_loss
    

# Cài early stop dừng lại nếu sau 3 epoch mà kết quả tiến triển không tốt
patience = 3
best_val_loss = float('inf')
trigger_times = 0

epochs = 20  # cài thật lớn epoch và để cho chạy đến khi không cải thiện nữa thì tự dừng nhờ early stop 
criterion = nn.BCEWithLogitsLoss()
for epoch in range(epochs):
    model.train()
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # F1-score
    f1, val_loss = evaluate_model(model, val_dataloader, device, all_labels)
    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {loss.item()}, Validation Loss: {val_loss}, F1: {f1}")

    # Giữ model tốt nhất 
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        trigger_times = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        trigger_times += 1

    if trigger_times >= patience:
        print(f"Early stopping at epoch {epoch + 1}")
        break
# Đánh giá 
model.load_state_dict(torch.load('best_model.pth'))
f1_score_final, loss_final = evaluate_model(model, val_dataloader, device, all_labels)
print(f"Final F1 score: {f1_score_final}")

Sample predictions:
Predicted Labels: ['train-find_train']
Actual Labels: ['train-find_train']
Predicted Labels: ['hotel-find_hotel', 'restaurant-find_restaurant']
Actual Labels: ['hotel-find_hotel', 'restaurant-find_restaurant']
Predicted Labels: []
Actual Labels: ['restaurant-find_restaurant']
Predicted Labels: ['hotel-find_hotel']
Actual Labels: ['hotel-find_hotel']
Predicted Labels: ['train-find_train']
Actual Labels: ['train-find_train']
Epoch 1/20, Training Loss: 0.02808610163629055, Validation Loss: 0.12409220146441301, F1: 0.7713055656413714
Sample predictions:
Predicted Labels: ['train-find_train']
Actual Labels: ['train-find_train']
Predicted Labels: ['hotel-find_hotel']
Actual Labels: ['hotel-find_hotel', 'restaurant-find_restaurant']
Predicted Labels: []
Actual Labels: ['restaurant-find_restaurant']
Predicted Labels: ['hotel-find_hotel']
Actual Labels: ['hotel-find_hotel']
Predicted Labels: ['train-find_train']
Actual Labels: ['train-find_train']
Epoch 2/20, Training Loss: 

  model.load_state_dict(torch.load('best_model.pth'))


Sample predictions:
Predicted Labels: ['train-find_train']
Actual Labels: ['train-find_train']
Predicted Labels: []
Actual Labels: ['hotel-find_hotel', 'restaurant-find_restaurant']
Predicted Labels: ['hotel-book_hotel']
Actual Labels: ['restaurant-find_restaurant']
Predicted Labels: ['hotel-find_hotel']
Actual Labels: ['hotel-find_hotel']
Predicted Labels: ['train-find_train']
Actual Labels: ['train-find_train']
Final F1 score: 0.7944314602174576
