#**Chatbot Training and Response Generation with BART**

In [1]:
from google.colab import drive
drive.mount('/content/drive')
#/content/drive/MyDrive/

Mounted at /content/drive


In [2]:
# !pip install transformers torch pandas

In [3]:
# # !pip install transformers[torch] vncorenlp
# !pip install datasets

In [4]:
# pip install transformers --upgrade

## 1. Importing Libraries and Setting Up the Environment

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BartphoTokenizer, BartForConditionalGeneration, AdamW
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm
import logging
import pandas as pd
from sklearn.decomposition import PCA

In [6]:
# Setup logging
logging.basicConfig(level=logging.INFO)

In [6]:
# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## 2. Loading Pretrained BART Model and Tokenizer

In [7]:
# Load BARTpho tokenizer and model
model_name = 'vinai/bartpho-syllable'

try:
    # Use BartphoTokenizer for BARTpho models
    tokenizer = BartphoTokenizer.from_pretrained(model_name)
    model = BartForConditionalGeneration.from_pretrained(model_name).to(device)
    print("Model and tokenizer loaded successfully.")
except Exception as e:
    print(f"An error occurred while loading the model: {e}")
    raise e

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

dict.txt:   0%|          | 0.00/360k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.83M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/897 [00:00<?, ?B/s]

You are using a model of type mbart to instantiate a model of type bart. This is not supported for all configurations of models and can yield errors.


pytorch_model.bin:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

Model and tokenizer loaded successfully.


## 3. Defining Hyperparameters

In [8]:
# Hyperparameters
batch_size = 8
gradient_accumulation_steps = 4
learning_rate = 5e-5
num_epochs = 6
warmup_steps = 100
max_length = 1024

## 4. Dataset for Demonstration

In [9]:
# Đọc dữ liệu từ file CSV hoặc JSON
df = pd.read_csv('/content/drive/MyDrive/ChatBox/data/processed_medical.csv')  # Thay đổi đường dẫn đến tập dữ liệu của bạn

# Kiểm tra dữ liệu
df.head()




Unnamed: 0,Title,Detailed Content,Reference Link
0,rào_cản tự_nhiên chống lại nhiễm_trùng da,da thường chặn các vi_sinh_vật xâm_nhập trừ kh...,https://www.msdmanuals.com/vi-vn/professional/...
1,rào_cản tự_nhiên chống lại nhiễm_trùng niêm_mạc,nhiều màng_nhầy ngập trong chất tiết có đặc_tí...,https://www.msdmanuals.com/vi-vn/professional/...
2,rào_cản tự_nhiên chống lại nhiễm_trùng đường h...,đường hô_hấp có các hệ_thống lọc đường thở trê...,https://www.msdmanuals.com/vi-vn/professional/...
3,rào_cản tự_nhiên chống lại nhiễm_trùng đường t...,các rào_cản trong đường tiêu_hóa bao_gồm ph ac...,https://www.msdmanuals.com/vi-vn/professional/...
4,rào_cản tự_nhiên chống lại nhiễm_trùng đường s...,các rào_cản về đường sinh_dục tiết_niệu bao_gồ...,https://www.msdmanuals.com/vi-vn/professional/...


In [10]:
df.shape

(11994, 3)

In [11]:
# Data Preparation
titles = df['Title'].tolist()
detailed_contents = df['Detailed Content'].tolist()

In [12]:
# Tokenize toàn bộ titles và detailed_contents
input_encodings = tokenizer(titles, truncation=True, padding=True, return_tensors='pt')
answer_encodings = tokenizer(detailed_contents, truncation=True, padding=True, return_tensors='pt')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [13]:
# Define a function to check word embeddings
def check_word_embeddings(texts, tokenizer):
    missing_words = set()
    all_words = set()

    # Tokenize each text and collect words
    for text in texts:
        tokens = tokenizer.tokenize(text)
        all_words.update(tokens)

    # Check which words are in the tokenizer's vocabulary
    tokenizer_vocab = tokenizer.get_vocab()
    missing_words = [word for word in all_words if word not in tokenizer_vocab]

    return missing_words


In [14]:
# Check missing words
missing_words = check_word_embeddings(detailed_contents, tokenizer)

# Print results
print("Number of missing words:", len(missing_words))
print("Missing words:", missing_words)

Number of missing words: 1540
Missing words: ['▁knapp', '▁rør', '▁claro', 'ønn', '▁intensiv', '▁agam', '▁rys', '▁efficient', '▁kast', '▁veb', '▁altre', '▁eet', '▁sylw', '▁reso', '▁guida', '▁ryg', 'wiesen', '▁cure', '▁qal', '▁recommended', '▁tros', '▁stk', '▁professionals', '▁advice', '▁symptom', '▁suitable', 'therapie', '▁ủy', '▁defini', '▁halos', '▁ganti', '▁depend', '▁dọa', '▁arter', 'vrh', '▁họa', '▁sken', 'aois', '▁reag', '▁initiat', '▁rö', '▁doba', '▁burro', '▁koning', '▁changing', '▁resolvi', '▁juu', '▁modifier', '▁conversation', '▁heit', '▁presi', '▁molecular', '▁babes', '▁evaluation', '▁ej', 'leider', 'ttinen', '▁seeking', '▁gero', '▁finn', '▁adding', '▁inge', 'propor', '▁trauma', '▁sendi', '▁eventually', 'aliment', '▁nada', 'poja', '▁namna', 'idest', '▁rovin', '▁paradigm', '▁risa', '▁malas', 'hög', '▁valuable', '▁svo', '▁physio', '▁olun', '▁lhe', '▁prea', '▁gila', '▁henne', 'ões', '▁observe', '▁frais', 'evalua', '▁senti', '▁precipit', '▁kena', '▁gravida', 'iñeiro', '▁humans', 

In [15]:
# # Split and tokenize the texts
# def split_text(text, max_length=1024):
#     tokens = tokenizer.encode(text, truncation=False)
#     return [tokens[i:i + max_length] for i in range(0, len(tokens), max_length)]

# split_texts = [split_text(content, max_length=max_length) for content in detailed_contents]
# flat_split_texts = [tokenizer.decode(tokens) for sublist in split_texts for tokens in sublist]

# # Tokenize inputs and labels
# input_encodings = tokenizer(titles, truncation=True, padding=True, return_tensors='pt')
# answer_encodings = tokenizer(flat_split_texts, truncation=True, padding=True, return_tensors='pt')


## 5. Defining the Medical Dataset Class

In [16]:
class MedicalDataset(Dataset):
    def __init__(self, input_encodings, answer_encodings):
        self.input_encodings = input_encodings
        self.answer_encodings = answer_encodings

    def __len__(self):
        return len(self.input_encodings['input_ids'])

    def __getitem__(self, idx):
        input_ids = self.input_encodings['input_ids'][idx]
        attention_mask = self.input_encodings['attention_mask'][idx]
        labels = self.answer_encodings['input_ids'][idx]

        # Padding or truncating labels to match input_ids length
        if len(labels) < len(input_ids):
            labels = labels + [0] * (len(input_ids) - len(labels))  # Pad with 0
        elif len(labels) > len(input_ids):
            labels = labels[:len(input_ids)]  # Truncate labels

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

In [17]:
# Split data into train and validation sets
train_size = int(0.8 * len(input_encodings['input_ids']))
val_size = len(input_encodings['input_ids']) - train_size

train_dataset = MedicalDataset(
    {key: val[:train_size] for key, val in input_encodings.items()},
    {key: val[:train_size] for key, val in answer_encodings.items()}
)

val_dataset = MedicalDataset(
    {key: val[train_size:] for key, val in input_encodings.items()},
    {key: val[train_size:] for key, val in answer_encodings.items()}
)

In [18]:
# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)



## 6. Encoding Questions for Cosine Similarity

In [19]:
# def encode_question(question, tokenizer, model):
#     """
#     Encode a question into a vector using the tokenizer and model.

#     Args:
#         question (str): The question to encode.
#         tokenizer: Tokenizer to use for encoding.
#         model: Model to use for encoding.

#     Returns:
#         np.ndarray: The encoded question vector.
#     """
#     # Tokenize the question
#     inputs = tokenizer(question, return_tensors='pt', truncation=True, padding=True).to(device)

#     # Pass inputs through the model to get the logits
#     with torch.no_grad():  # Disable gradient calculation for faster encoding
#         outputs = model(**inputs)

#     # Use 'logits' to get the output embeddings
#     logits = outputs.logits  # Shape: [batch_size, seq_len, vocab_size]

#     # Mean pooling to get sentence embedding
#     sentence_embedding = torch.mean(logits, dim=1).squeeze().detach().cpu().numpy()

#     return sentence_embedding

## 7. Calculating Cosine Similarity

In [20]:
# def calculate_cosine_similarity(question, model_questions, tokenizer, model):
#     question_embedding = encode_question(question, tokenizer, model)
#     question_embeddings = np.array([encode_question(q, tokenizer, model) for q in model_questions])

#     similarities = cosine_similarity([question_embedding], question_embeddings)
#     top_indices = np.argsort(similarities[0])[::-1]  # Sort in descending order

#     return top_indices[:3], similarities[0][top_indices[:3]]

## 8. Generating Answers Using BART

In [21]:
# def generate_answer(question, model, tokenizer, max_length=300, num_return_sequences=3):
#     """
#     Generate a refined answer for the most similar question.

#     Args:
#         question (str): Most similar question.
#         model: Trained BART model.
#         tokenizer: Tokenizer for BART.
#         max_length (int, optional): Maximum length of generated answer. Defaults to 300.
#         num_return_sequences (int, optional): Number of answer sequences to generate. Defaults to 3.

#     Returns:
#         list: List of generated answers.
#     """
#     inputs = tokenizer(question, return_tensors='pt').to(device)

#     outputs = model.generate(
#         inputs['input_ids'],
#         max_length=200,
#         num_return_sequences=num_return_sequences,
#         do_sample=True,
#         top_k=50,
#         top_p=0.9,
#         temperature=1.0,
#         early_stopping=True
#     )

#     return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

## 9. Generating Responses Based on Similarity Scores

In [22]:
# def generate_response(input_text, model, tokenizer, model_questions, model_answers):
#     """
#     Generate a response for the input text using cosine similarity.

#     Args:
#         input_text (str): Input text for generating the response.
#         model: Trained BART model.
#         tokenizer: Tokenizer for BART.
#         model_questions (list): List of questions from the model's training data.
#         model_answers (list): List of answers corresponding to the model's questions.

#     Returns:
#         str: The generated response by the Chatbot.
#     """
#     # Tính toán cosine similarity
#     top_indices, similarities = calculate_cosine_similarity(input_text, model_questions, tokenizer, model)

#     print(f"User Question: {input_text}")
#     print(f"Top Indices: {top_indices}")
#     print(f"Similarities: {similarities}")
#     print(f"Model Questions Length: {len(model_questions)}")
#     print(f"Model Answers Length: {len(model_answers)}")

#     responses = []
#     for idx in top_indices:
#         # Kiểm tra chỉ số để đảm bảo không vượt quá kích thước mảng
#         if 0 <= idx < len(model_questions) and 0 <= idx < len(model_answers):
#             similar_question = model_questions[idx]
#             corresponding_answer = model_answers[idx]
#             print(f"\nSimilar Question: {similar_question} (Similarity: {similarities[idx]:.2f})")
#             print(f"Answer: {corresponding_answer}")

#             # Sinh câu trả lời từ câu hỏi tương tự
#             refined_answers = generate_answer(similar_question, model, tokenizer)
#             responses.append((similar_question, corresponding_answer, refined_answers))
#         else:
#             print(f"Index {idx} is out of bounds for model questions or answers.")

#     return responses

## **10. Training the Model**

In [26]:
# Training Function
def train_bart_chatbot(model, tokenizer, train_loader, val_loader, optimizer, num_epochs=3, gradient_accumulation_steps=4):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0

        epoch_progress_bar = tqdm(enumerate(train_loader), total=len(train_loader),
                                  desc=f"Epoch {epoch + 1}/{num_epochs}", leave=False)

        for step, batch in epoch_progress_bar:
            try:
                inputs = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**inputs)

                loss = outputs.loss
                total_loss += loss.item()

                loss = loss / gradient_accumulation_steps
                loss.backward()

                if (step + 1) % gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()

                epoch_progress_bar.set_postfix({"Loss": total_loss / (step + 1)})

            except IndexError as e:
                print(f"IndexError during training step {step}: {e}")
                raise e

        avg_loss = total_loss / len(train_loader)
        logging.info(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss}")

        # Validation
        validate_bart_chatbot(model, val_loader, device)

In [27]:
def validate_bart_chatbot(model, val_loader, device):
    """
    Validate the Chatbot using the BART model.

    Args:
        model: BART model
        val_loader: DataLoader for validation data
        device: Device to run the model on ('cpu' or 'cuda')
    """
    model.eval()
    total_val_loss = 0
    num_batches = len(val_loader)

    with torch.no_grad():
        # Use tqdm to show progress bar
        progress_bar = tqdm(val_loader, desc="Validation", total=num_batches, leave=False)

        for batch in progress_bar:
            inputs = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**inputs)
            loss = outputs.loss
            total_val_loss += loss.item()

            # Update progress bar with current batch loss
            progress_bar.set_postfix({'Batch Loss': loss.item()})

    avg_val_loss = total_val_loss / num_batches
    # Log the average validation loss
    logging.info(f"Validation Loss: {avg_val_loss:.4f}")

    return avg_val_loss

## 11. Start Training

In [28]:
# Train the model
train_bart_chatbot(model, tokenizer, train_loader, val_loader, optimizer, num_epochs=num_epochs)

  'input_ids': torch.tensor(input_ids, dtype=torch.long),
  'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
  'labels': torch.tensor(labels, dtype=torch.long)


## 12.Load the Fine-Tuned Model

In [29]:
# Save the fine-tuned model
model.save_pretrained('/content/drive/MyDrive/ChatBox/model/fine_tuned_bart')
tokenizer.save_pretrained('/content/drive/MyDrive/ChatBox/model/fine_tuned_bart')

Non-default generation parameters: {'forced_eos_token_id': 2}


('/content/drive/MyDrive/ChatBox/model/fine_tuned_bart/tokenizer_config.json',
 '/content/drive/MyDrive/ChatBox/model/fine_tuned_bart/special_tokens_map.json',
 '/content/drive/MyDrive/ChatBox/model/fine_tuned_bart/sentencepiece.bpe.model',
 '/content/drive/MyDrive/ChatBox/model/fine_tuned_bart/dict.txt',
 '/content/drive/MyDrive/ChatBox/model/fine_tuned_bart/added_tokens.json')

In [None]:

# # Plot the performance
# plot_performance(batch_losses, epoch_losses)

## 13. Example: Generating a Response to a User's Question

In [32]:
def generate_response1(input_text: str) -> str:
    """
    Generate a response for the input text.

    Args:
        input_text (str): Input text for generating the response.
        model: Pre-trained BART model.
        tokenizer: Tokenizer for BART.
        device (str): Device to run the model on ('cpu' or 'cuda'). Defaults to 'cpu'.
        max_length (int): Maximum length of the generated response. Defaults to 1024.

    Returns:
        str: The generated response by the Chatbot.
    """
    # Tokenize input text and move tensors to device
    input_ids = tokenizer.encode(input_text, return_tensors='pt',
                                  truncation=True, padding='max_length')
    input_ids = input_ids.to(device)
    print("Tokenized Input IDs:", input_ids)

    # Generate response using the model
    with torch.no_grad():
        output = model.generate(input_ids,
                        max_length=max_length,
                        do_sample=True,
                        temperature=0.7,  # A value between 0.7 and 1.0 can work well
                        no_repeat_ngram_size=1,
                        early_stopping=True)
        # output = model.generate(input_ids,
        #                 max_length=max_length,
        #                 do_sample=True,  # Allow sampling to introduce randomness
        #                 top_k=50,  # Consider the top 50 words
        #                 no_repeat_ngram_size=2,
        #                 early_stopping=True)
        # output = model.generate(input_ids,
        #                 max_length=max_length,
        #                 do_sample=True,
        #                 top_p=0.92,  # Consider top-p words where p is a cumulative probability
        #                 no_repeat_ngram_size=2,
        #                 early_stopping=True)

    # Decode the generated response
    response = tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return response

In [33]:
# Example of generating a response
user_question = "trẻ em bị lao phổi"
response = generate_response1(user_question)
print(f"Generated Response: {response}")

Tokenized Input IDs: tensor([[   0,  318,  293,   46,  565,   63, 1140,   40,    2]])
Generated Response: h vànng làmạ do thườngđdchứthsu  có được cáctr ở thuốcyiệugể baob sau bệnh với choi nhiễmlcị tăng 1ộthuhântố khôngương triệu hoặc củav sự nguyênphảa n điều một trong bằngồ khi nhữngật chx đếnánầo sửp bị trẻà rối and m trêniện xươngụ thầnơ nếu cơẩ máu ra gây 2 đặcnhệ sinh làmệtquequakin dấu đau này inbi ngườiường viêmốngá nhưngápấ khớpgi giảm phải huyếthế sốổ phổ alý thiếuọ đường víỏ khácọcịnhến loạiíchhó tổnti cần như hơnợ để phẫuư dượng nướcênloếtátắ j đánh dùng daiệm hộikkhiức kiểm thực theosin lâm đầu từ x chỉ phân sới l thận xem of xét kết phát trướcùngử bấtiển cao thông biểu việc nên tếộcăng về 10 khám đôi –ĩ dịch the nhiều c chất tính cấp mô cáchtín thay đối vếtực vào xuất chống mất mắtạch não liên màng nhấtịch 3 b bàn tử tìnhal vùng biến quá tuổiình g chủối hầuco alậnâm áp gặp đó thời10 hạviề chứngạc xác bên bởi kích v 101 fortaymáạoục tại tiền thở tuyếnế thai giải phụthi ti

In [None]:
# # Example user question
# user_question = "dấu hiệu của nhiễm trùng"

# # Generate response
# responses = generate_response(user_question, model, tokenizer, titles, detailed_contents)

# print(responses)

# # # Print generated responses
# # for idx, (similar_question, corresponding_answer, refined_answers) in enumerate(responses):
# #     print(f"\nResponse {idx + 1}:")
# #     print(f"Similar Question: {similar_question}")
# #     print(f"Original Answer: {corresponding_answer}")
# #     print(f"Refined Answer: {refined_answers}")

#Use the Fine-Tuned Model in the Chatbox

In [3]:
# Use AutoTokenizer to ensure compatibility with BARTPho
model = BartForConditionalGeneration.from_pretrained('/content/drive/MyDrive/ChatBox/model/fine_tuned_bart')
tokenizer = BartphoTokenizer.from_pretrained('/content/drive/MyDrive/ChatBox/model/fine_tuned_bart')