In [1]:
!pip install torchtext==0.16.0

Collecting torchtext==0.16.0
  Downloading torchtext-0.16.0-cp311-cp311-manylinux1_x86_64.whl.metadata (7.5 kB)
Collecting torch==2.1.0 (from torchtext==0.16.0)
  Downloading torch-2.1.0-cp311-cp311-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchdata==0.7.0 (from torchtext==0.16.0)
  Downloading torchdata-0.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.1.0->torchtext==0.16.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.1.0->torchtext==0.16.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.1.0->torchtext==0.16.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.1.0->torcht

In [8]:
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from collections import Counter
from torchtext.vocab import vocab

In [9]:
# Tải dữ liệu cần thiết của nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[

True

In [10]:
# Tiền xử lý dữ liệu
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    return tokens

In [11]:
# Đọc dữ liệu từ file CSV
def load_data(csv_file):
    df = pd.read_csv(csv_file)
    df.dropna(inplace=True)
    questions = df['question'].tolist()
    answers = df['answer'].tolist()
    return questions, answers

In [12]:
# Load dữ liệu từ file CSV
csv_file = "qa.csv"
questions, answers = load_data(csv_file)

In [13]:
# Tiền xử lý câu hỏi
processed_questions = [preprocess_text(q) for q in questions]


In [14]:
# Tạo từ điển vocab
def build_vocab(tokenized_texts):
    word_freq = Counter([word for text in tokenized_texts for word in text])
    word_vocab = vocab(word_freq, specials=["<pad>", "<unk>"])
    word_vocab.set_default_index(word_vocab["<unk>"])
    return word_vocab

In [15]:
word_vocab = build_vocab(processed_questions)

In [19]:
# Chuyển câu hỏi thành vector số
def numericalize(text, vocab):
    return [vocab[word] for word in text]

In [18]:
numerical_questions = [numericalize(q, word_vocab) for q in processed_questions]

In [20]:
# Padding dữ liệu
def pad_sequences(sequences, max_len):
    return [seq + [word_vocab["<pad>"]] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in sequences]


In [21]:
max_len = max(len(q) for q in numerical_questions)
numerical_questions = pad_sequences(numerical_questions, max_len)
numerical_questions = torch.tensor(numerical_questions)


In [22]:
max_len

98

In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleTransformer(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers, max_len, dropout=0.1):
        super(SimpleTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=word_vocab["<pad>"])
        self.positional_encoding = self._generate_positional_encoding(max_len, embed_dim)

        encoder_layer = nn.TransformerEncoderLayer(embed_dim, num_heads, hidden_dim, dropout)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers)

        self.decoder = nn.Linear(embed_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def _generate_positional_encoding(self, max_len, embed_dim):
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-torch.log(torch.tensor(10000.0)) / embed_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        return pe.unsqueeze(0)  # (1, max_len, embed_dim)

    def forward(self, x, src_mask=None):
        x = self.embedding(x) + self.positional_encoding[:, :x.size(1), :]
        x = self.dropout(x)
        x = self.encoder(x, src_mask)
        return self.decoder(x)

    def generate_text(self, x, max_len=10):
        outputs = []
        for _ in range(max_len):
            x = self.forward(x).argmax(dim=-1)
            outputs.append(x)
            if (x == word_vocab["<pad>"]).all():
                break  # Dừng nếu toàn bộ câu là <pad>
        return torch.cat(outputs, dim=1)  # (batch_size, max_len)


In [25]:
# Tạo mô hình
vocab_size = len(word_vocab)
model = SimpleTransformer(vocab_size, embed_dim=64, num_heads=4, hidden_dim=128, num_layers=2, max_len=max_len)



In [16]:
import torch
import torch.nn as nn
import torch.optim as optim

def train_model(model, questions, answers, vocab, epochs=100, lr=0.001, patience=10):
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)  # Thêm weight decay
    loss_fn = nn.CrossEntropyLoss(ignore_index=vocab["<pad>"])
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)

    best_loss = float('inf')
    counter = 0  # Biến đếm cho early stopping

    for epoch in range(epochs):
        total_loss = 0
        model.train()  # Đặt mô hình ở chế độ train

        for q, a in zip(questions, answers):
            q_tensor = torch.tensor(q).unsqueeze(0)  # Thêm batch dimension
            a_tensor = torch.tensor(a).unsqueeze(0)

            optimizer.zero_grad()
            output = model(q_tensor)  # Dự đoán
            loss = loss_fn(output.view(-1, len(vocab)), a_tensor.view(-1))  # Tính loss

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)  # Gradient Clipping
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(questions)
        scheduler.step(avg_loss)  # Giảm learning rate nếu cần

        print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

        # Early Stopping
        if avg_loss < best_loss:
            best_loss = avg_loss
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

# Tiền xử lý câu trả lời thành số
processed_answers = [preprocess_text(a) for a in answers]
numerical_answers = [numericalize(a, word_vocab) for a in processed_answers]
numerical_answers = pad_sequences(numerical_answers, max_len)
numerical_answers = torch.tensor(numerical_answers)

# Huấn luyện mô hình
train_model(model, numerical_questions, numerical_answers, word_vocab, epochs=200)

  q_tensor = torch.tensor(q).unsqueeze(0)  # Thêm batch dimension
  a_tensor = torch.tensor(a).unsqueeze(0)


Epoch 1, Loss: 5.5709
Epoch 2, Loss: 5.0746
Epoch 3, Loss: 4.9828
Epoch 4, Loss: 4.9167
Epoch 5, Loss: 4.8827
Epoch 6, Loss: 4.8495
Epoch 7, Loss: 4.8323
Epoch 8, Loss: 4.8138
Epoch 9, Loss: 4.7990
Epoch 10, Loss: 4.7816
Epoch 11, Loss: 4.7673
Epoch 12, Loss: 4.7550
Epoch 13, Loss: 4.7510
Epoch 14, Loss: 4.7487
Epoch 15, Loss: 4.7472
Epoch 16, Loss: 4.7312
Epoch 17, Loss: 4.7396
Epoch 18, Loss: 4.7285
Epoch 19, Loss: 4.7045
Epoch 20, Loss: 4.7039
Epoch 21, Loss: 4.7030
Epoch 22, Loss: 4.6953
Epoch 23, Loss: 4.6848
Epoch 24, Loss: 4.6941
Epoch 25, Loss: 4.6737
Epoch 26, Loss: 4.7169
Epoch 27, Loss: 4.6850
Epoch 28, Loss: 4.6987
Epoch 29, Loss: 4.7107
Epoch 30, Loss: 4.6857
Epoch 00031: reducing learning rate of group 0 to 5.0000e-04.
Epoch 31, Loss: 4.7581
Epoch 32, Loss: 4.7657
Epoch 33, Loss: 5.0908
Epoch 34, Loss: 5.0413
Epoch 35, Loss: 5.0503
Early stopping at epoch 35


In [17]:
def get_answer(question, top_k=3, threshold=0.5):
    processed_q = preprocess_text(question)
    q_vector = numericalize(processed_q, word_vocab)
    q_vector = pad_sequences([q_vector], max_len)
    q_vector = torch.tensor(q_vector)

    # Tính cosine similarity
    similarities = cosine_similarity(q_vector.numpy(), numerical_questions.numpy())
    top_k_indices = np.argsort(similarities[0])[-top_k:][::-1]

    # Nếu có câu hỏi giống trên ngưỡng threshold, lấy câu trả lời từ dữ liệu
    best_idx = top_k_indices[0]
    if similarities[0][best_idx] >= threshold:
        best_answer = answers[best_idx]
        similar_questions = [questions[idx] for idx in top_k_indices]
    else:
        # Dùng mô hình Transformer để sinh câu trả lời
        with torch.no_grad():
            output = model(q_vector).squeeze(0)  # (max_len, vocab_size)
            predicted_indices = output.argmax(dim=1).tolist()
            best_answer = " ".join([word for word, idx in word_vocab.get_stoi().items() if idx in predicted_indices])

        # Dùng mô hình Transformer để tự sinh 3 câu hỏi tương tự
        with torch.no_grad():
            generated_questions = model.generate_text(q_vector, max_len=10)
            similar_questions = []
            for gen_q in generated_questions:
                text = " ".join([word for word, idx in word_vocab.get_stoi().items() if idx in gen_q.tolist()])
                similar_questions.append(text)

    return best_answer, similar_questions[:3]  # Trả về 3 câu hỏi tự sinh


In [21]:
answer, similar_questions = get_answer("Who can rent a vehicle for self-driving?")
print("Câu trả lời:", answer)
print("3 câu hỏi tương tự tự sinh:", similar_questions)

Câu trả lời: Only individuals with a valid driving license with remaining points can rent a self-drive vehicle.
3 câu hỏi tương tự tự sinh: ['Who can rent a vehicle for self-driving?', 'What should I do if I am involved in an accident?', 'What is an internal road?']


In [19]:
# Lưu mô hình
model_path = "simple_transformer.pth"
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")


Model saved to simple_transformer.pth
