#**Chatbot Training and Response Generation with BART**

In [None]:
from google.colab import drive
drive.mount('/content/drive')
#/content/drive/MyDrive/

Mounted at /content/drive


In [None]:
# !pip install transformers torch pandas

In [None]:
# # !pip install transformers[torch] vncorenlp
# !pip install datasets

In [None]:
# pip install transformers --upgrade

## 1. Importing Libraries and Setting Up the Environment

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BartphoTokenizer, BartForConditionalGeneration, AdamW
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm
import logging
import pandas as pd
from sklearn.decomposition import PCA
import tensorflow as tf
from nltk import ngrams
from sklearn.feature_extraction.text import CountVectorizer

#import for cbow
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Setup logging
logging.basicConfig(level=logging.INFO)

In [None]:
# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# device = torch.device("cpu")

## 2. Loading Pretrained BART Model and Tokenizer

In [None]:
# Load BARTpho tokenizer and model
model_name = 'vinai/bartpho-syllable'

try:
    # Use BartphoTokenizer for BARTpho models
    tokenizer = BartphoTokenizer.from_pretrained(model_name)
    model = BartForConditionalGeneration.from_pretrained(model_name).to(device)
    print("Model and tokenizer loaded successfully.")
except Exception as e:
    print(f"An error occurred while loading the model: {e}")
    raise e

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

dict.txt:   0%|          | 0.00/360k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.83M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/897 [00:00<?, ?B/s]

You are using a model of type mbart to instantiate a model of type bart. This is not supported for all configurations of models and can yield errors.


pytorch_model.bin:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

Model and tokenizer loaded successfully.


## 3. Defining Hyperparameters

In [None]:
# Hyperparameters
batch_size = 16
gradient_accumulation_steps = 4
learning_rate = 5e-5
num_epochs = 6
warmup_steps = 100
max_length = 1024

## 4. Dataset for Demonstration

In [None]:
# Đọc dữ liệu từ file CSV hoặc JSON
df = pd.read_csv('/content/drive/MyDrive/ChatBox/data/processed_medical.csv')  # Thay đổi đường dẫn đến tập dữ liệu của bạn

# Kiểm tra dữ liệu
df.head()

#Dùng tf-idf giảm những ý không quan trọng


Unnamed: 0,Title,Detailed Content,Reference Link
0,rào_cản tự_nhiên chống lại nhiễm_trùng về da,da thường chặn các vi_sinh_vật xâm_nhập trừ kh...,https://www.msdmanuals.com/vi-vn/professional/...
1,rào_cản tự_nhiên chống lại nhiễm_trùng về niêm...,nhiều màng_nhầy ngập trong chất tiết có đặc_tí...,https://www.msdmanuals.com/vi-vn/professional/...
2,rào_cản tự_nhiên chống lại nhiễm_trùng về đườn...,đường hô_hấp có các hệ_thống lọc đường thở trê...,https://www.msdmanuals.com/vi-vn/professional/...
3,rào_cản tự_nhiên chống lại nhiễm_trùng về đườn...,các rào_cản trong đường tiêu_hóa bao_gồm ph ac...,https://www.msdmanuals.com/vi-vn/professional/...
4,rào_cản tự_nhiên chống lại nhiễm_trùng về đườn...,các rào_cản về đường sinh_dục tiết_niệu bao_gồ...,https://www.msdmanuals.com/vi-vn/professional/...


In [None]:
df.shape

(12488, 3)

# *Data Preparation*

In [None]:
# Data Preparation
titles = df['Title'].tolist()
detailed_contents = df['Detailed Content'].tolist()

# Đảm bảo tất cả các phần tử trong danh sách là chuỗi
titles = [str(title) for title in titles]
detailed_contents = [str(content) for content in detailed_contents]

In [None]:
import re

# Hàm tìm từ ghép có dấu '_'
def find_compound_words(texts):
    compound_words = set()
    for text in texts:
        # Tìm các từ chứa dấu '_'
        matches = re.findall(r'\b\w+_\w+\b', text)
        compound_words.update(matches)
    return list(compound_words)

# Tìm từ ghép trong titles và detailed_contents
compound_words_titles = find_compound_words(titles)
compound_words_contents = find_compound_words(detailed_contents)

# Kết hợp từ ghép từ cả titles và detailed_contents và đảm bảo tính duy nhất
compound_words = list(set(compound_words_titles + compound_words_contents))

# Kiểm tra số lượng từ ghép đã tìm thấy và các từ là duy nhất
print(f"Number of unique compound words: {len(compound_words)}")
print(compound_words[:10])  # In ra 10 từ ghép đầu tiên để kiểm tra

Number of unique compound words: 10739
['bệnh_nhân_thể_trạng', 'huyết_thanh_thế_hệ', 'phụ_huynh', 'phổi_tạng', 'anion_âm', 'kim_dụng_cụ', 'thuyết_phục', 'đầu_cuối', 'cây_cỏ', 'hạ_sốt']


In [None]:
num_added_tokens = tokenizer.add_tokens(compound_words)

model.resize_token_embeddings(len(tokenizer))

print(f"Added {num_added_tokens} tokens. New vocab size: {len(tokenizer)}")

Added 10739 tokens. New vocab size: 50769


In [None]:
test_sentence = "rào_cản tự_nhiên chống lại nhiễm_trùng da"
tokens = tokenizer.tokenize(test_sentence)
print(tokens)

['rào_cản', 'tự_nhiên', '▁chống', '▁lại', 'nhiễm_trùng', '▁da']


### a. Khởi Tạo Keras Tokenizer

### b. N-GRAM


In [None]:
# !pip install pyvi

### Khởi tạo tokens

In [None]:
def tokenize_data(titles, contents, tokenizer, title_max_length, content_max_length):
    def process_in_batches(texts, tokenizer, batch_size, max_length):
        encodings = []
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]

            batch_encodings = tokenizer(
                batch_texts,
                truncation=True,
                padding='max_length',
                max_length=max_length,
                return_tensors='pt'
            )
            encodings.append(batch_encodings)

        # Kết hợp tất cả encodings từ các batch
        combined_encodings = {key: torch.cat([batch[key] for batch in encodings], dim=0) for key in encodings[0]}
        return combined_encodings

    # Tokenize titles and contents
    input_encodings = process_in_batches(titles, tokenizer, batch_size=32, max_length=title_max_length)
    answer_encodings = process_in_batches(contents, tokenizer, batch_size=32, max_length=content_max_length)

    return input_encodings, answer_encodings


In [None]:
# Tokenize the data
title_max_length = 60
content_max_length = 950
input_encodings, answer_encodings = tokenize_data(titles, detailed_contents, tokenizer, title_max_length, content_max_length)

In [None]:
# Tính độ dài của mỗi câu trong input_encodings dựa trên attention_mask
input_lengths = input_encodings['attention_mask'].sum(dim=1)
input_max_length = input_lengths.max().item()

# Tính độ dài của mỗi câu trong answer_encodings dựa trên attention_mask
answer_lengths = answer_encodings['attention_mask'].sum(dim=1)
answer_max_length = answer_lengths.max().item()

print(f"Đoạn dài nhất trong input_encodings sau khi loại bỏ <pad> có kích thước {input_max_length} token.")
print(f"Đoạn dài nhất trong answer_encodings sau khi loại bỏ <pad> có kích thước {answer_max_length} token.")

Đoạn dài nhất trong input_encodings sau khi loại bỏ <pad> có kích thước 57 token.
Đoạn dài nhất trong answer_encodings sau khi loại bỏ <pad> có kích thước 945 token.


In [None]:
# Hàm để kiểm tra tokenization
def check_tokenization(texts, encodings, tokenizer):
    for i in range(min(3, len(texts))):  # Chỉ in ra 3 ví dụ đầu tiên
        original_text = texts[i]
        token_ids = encodings['input_ids'][i]
        tokens = tokenizer.convert_ids_to_tokens(token_ids)
        decoded_text = tokenizer.decode(token_ids, skip_special_tokens=True)
        # print(token_ids)
        print(f"Original Text: {original_text}")
        print(f"Tokens: {tokens}")
        print(f"Decoded Text: {decoded_text}")
        print("="*50)

# Kiểm tra tokenization cho titles và contents
check_tokenization(titles, input_encodings, tokenizer)
check_tokenization(detailed_contents, answer_encodings, tokenizer)

Original Text: rào_cản tự_nhiên chống lại nhiễm_trùng về da
Tokens: ['<s>', 'rào_cản', 'tự_nhiên', '▁chống', '▁lại', 'nhiễm_trùng', '▁về', '▁da', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
Decoded Text: rào_cản tự_nhiên chống lại nhiễm_trùng về da
Original Text: rào_cản tự_nhiên chống lại nhiễm_trùng về niêm_mạc
Tokens: ['<s>', 'rào_cản', 'tự_nhiên', '▁chống', '▁lại', 'nhiễm_trùng', '▁về', 'niêm_mạc', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '

## Check word embeding

In [None]:
# Define a function to check word embeddings
def check_word_embeddings(texts, tokenizer):
    missing_words = set()
    all_words = set()

    # Tokenize each text and collect words
    for text in texts:
        # Ensure text is a string before tokenization
        if not isinstance(text, str):
            text = str(text)  # Convert to string if not already
        tokens = tokenizer.tokenize(text)
        all_words.update(tokens)

    # Check which words are in the tokenizer's vocabulary
    tokenizer_vocab = tokenizer.get_vocab()
    missing_words = [word for word in all_words if word not in tokenizer_vocab]

    return missing_words

In [None]:
# Check missing words
missing_words = check_word_embeddings(detailed_contents, tokenizer)

# Print results
print("Number of missing words:", len(missing_words))
print("Missing words:", missing_words)

Number of missing words: 524
Missing words: ['▁licence', '▁ο', '▁amalga', '▁reta', '▁apparent', '▁cred', '▁zile', 'nadh', '▁vibr', '▁karta', '▁moja', '‡', '▁fonda', '▁kasaba', '▁aire', '▁gust', '▁stim', '▁pct', '▁efe', 'pastu', '▁crit', '▁sporo', '▁nih', '▁coordinat', '▁dib', '▁previa', '▁corporis', '▁federa', '▁tosi', '▁hominis', '▁lhe', '▁tòa', '▁chronic', '▁oly', '▁bence', '▁iglesia', 'ǂ', '▁infection', '▁modifier', '▁arter', '▁henne', '▁criza', '▁piv', '▁lyser', '▁lyst', '▁itt', '▁prostat', '▁dental', '▁chuva', 'adolescent', '▁apne', '▁mero', '▁ryg', '▁haj', '▁physio', '▁sensitive', '▁minimal', '▁prognos', '▁stok', '▁comprehensive', '▁dọa', '▁institut', '▁ym', '▁rendu', '▁fungo', '▁sjö', '▁ganti', '▁fy', '▁urin', '▁durable', 'lusta', '▁xist', '▁chiar', '▁neer', 'neho', '▁nepri', '▁dhe', '▁rendel', '▁buni', '▁bied', '▁manifestation', '▁bordet', '▁occupa', '▁snelle', '▁hazard', '▁treated', '▁hav', '▁hỏa', '▁vim', 'qol', '▁teraz', '▁renal', '▁medication', '▁lait', '▁psychiatr', '▁lak'

## 5. Defining the Medical Dataset Class

In [None]:
class MedicalDataset(Dataset):
    def __init__(self, input_encodings, answer_encodings):
        self.input_encodings = input_encodings
        self.answer_encodings = answer_encodings

    def __len__(self):
        return len(self.input_encodings['input_ids'])

    def __getitem__(self, idx):
        input_ids = self.input_encodings['input_ids'][idx]
        attention_mask = self.input_encodings['attention_mask'][idx]
        labels = self.answer_encodings['input_ids'][idx]

        # Padding hoặc cắt ngắn labels để khớp với kích thước input_ids
        if len(labels) < len(input_ids):
            labels = labels + [0] * (len(input_ids) - len(labels))  # Padding
        elif len(labels) > len(input_ids):
            labels = labels[:len(input_ids)]  # Cắt ngắn

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long)
        }


In [None]:
# Shuffle dữ liệu trước khi chia
def shuffle_data(input_encodings, answer_encodings):
    perm = torch.randperm(len(input_encodings['input_ids']))
    for key in input_encodings.keys():
        input_encodings[key] = input_encodings[key][perm]
        answer_encodings[key] = answer_encodings[key][perm]
    return input_encodings, answer_encodings

# Shuffle dữ liệu
input_encodings, answer_encodings = shuffle_data(input_encodings, answer_encodings)

# Chia dữ liệu thành tập train và validation
train_size = int(0.8 * len(input_encodings['input_ids']))
val_size = len(input_encodings['input_ids']) - train_size

train_dataset = MedicalDataset(
    {key: val[:train_size] for key, val in input_encodings.items()},
    {key: val[:train_size] for key, val in answer_encodings.items()}
)

val_dataset = MedicalDataset(
    {key: val[train_size:] for key, val in input_encodings.items()},
    {key: val[train_size:] for key, val in answer_encodings.items()}
)

# Bạn có thể kiểm tra kích thước của train và val dataset
print(f"Kích thước tập huấn luyện: {len(train_dataset)}")
print(f"Kích thước tập kiểm tra: {len(val_dataset)}")

Kích thước tập huấn luyện: 9990
Kích thước tập kiểm tra: 2498


### Dataloaders

In [None]:
# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)



## **10. Training the Model**

In [None]:
def train_bart_chatbot(model, tokenizer, train_loader, val_loader, optimizer, num_epochs=3, gradient_accumulation_steps=4):
    best_val_loss = float('inf')
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0

        epoch_progress_bar = tqdm(enumerate(train_loader), total=len(train_loader),
                                  desc=f"Epoch {epoch + 1}/{num_epochs}", leave=False)

        for step, batch in epoch_progress_bar:
            try:
                inputs = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**inputs)

                loss = outputs.loss
                total_loss += loss.item()

                loss = loss / gradient_accumulation_steps
                loss.backward()

                if (step + 1) % gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()

                epoch_progress_bar.set_postfix({"Loss": total_loss / (step + 1)})

            except IndexError as e:
                print(f"IndexError during training step {step}: {e}")
                raise e

        avg_loss = total_loss / len(train_loader)
        logging.info(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss}")

        # Validation
        model.eval()
        # Validation after each epoch
        val_loss = validate_bart_chatbot_during_training(model, val_loader, device)
        model.train()

        # Save the best model based on validation loss
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), "best_bart_model.pt")


In [None]:
def validate_bart_chatbot_during_training(model, val_loader, device):
    model.eval()
    total_loss = 0

    for batch in tqdm(val_loader, desc="Validating"):
        with torch.no_grad():
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**inputs)
            loss = outputs.loss
            total_loss += loss.item()

    avg_val_loss = total_loss / len(val_loader)
    logging.info(f"Validation Loss: {avg_val_loss}")

    return avg_val_loss

In [None]:
def exact_match(prediction, ground_truth):
    return prediction.strip() == ground_truth.strip()

def f1(prediction, ground_truth):
    pred_tokens = prediction.split()
    gt_tokens = ground_truth.split()

    common = set(pred_tokens) & set(gt_tokens)
    if len(common) == 0:
        return 0.0

    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(gt_tokens)
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

In [None]:
def validate_bart_chatbot_after_training(model, tokenizer, val_loader, device):
    model.eval()
    total_loss = 0
    em_score = 0
    f1_scores = []

    for batch in tqdm(val_loader, desc="Final Evaluation"):
        with torch.no_grad():
            # Ensure all tensors are on the correct device
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**inputs)
            loss = outputs.loss
            total_loss += loss.item()

            # Use the customized generation settings
            predictions = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=answer_max_length,  # or answer_max_length if defined
                do_sample=True,
                top_k=50,
                top_p=0.92,
                temperature=0.7,
                no_repeat_ngram_size=1,
                early_stopping=True
            )

            # Move predictions and labels to CPU if they are on GPU
            decoded_preds = [tokenizer.decode(g.cpu(), skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in predictions]
            decoded_labels = [tokenizer.decode(g.cpu(), skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in inputs['labels']]

            for pred, label in zip(decoded_preds, decoded_labels):
                print(f"Prediction: {pred}")
                print(f"Label: {label}")
                em_score += exact_match(pred, label)
                f1_scores.append(f1(pred, label))

    avg_val_loss = total_loss / len(val_loader)
    avg_em_score = em_score / len(val_loader.dataset)
    avg_f1_score = sum(f1_scores) / len(f1_scores)

    logging.info(f"Final Validation Loss: {avg_val_loss}")
    logging.info(f"Exact Match: {avg_em_score}")
    logging.info(f"F1 Score: {avg_f1_score}")

    return avg_val_loss, avg_em_score, avg_f1_score

## 11. Start Training

In [None]:
# Train the model
train_bart_chatbot(model, tokenizer, train_loader, val_loader, optimizer, num_epochs=num_epochs)

  'input_ids': torch.tensor(input_ids, dtype=torch.long),
  'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
  'labels': torch.tensor(labels, dtype=torch.long)
Validating: 100%|██████████| 157/157 [00:38<00:00,  4.11it/s]
Validating: 100%|██████████| 157/157 [00:38<00:00,  4.10it/s]
Validating: 100%|██████████| 157/157 [00:38<00:00,  4.12it/s]
Validating: 100%|██████████| 157/157 [00:38<00:00,  4.08it/s]
Validating: 100%|██████████| 157/157 [00:38<00:00,  4.10it/s]
Validating: 100%|██████████| 157/157 [00:38<00:00,  4.09it/s]


In [None]:
# final_val_loss, final_em_score, final_f1_score = validate_bart_chatbot_after_training(model, tokenizer, val_loader, device)
# logging.info(f"Final Exact Match: {final_em_score}")
# logging.info(f"Final F1 Score: {final_f1_score}")

In [None]:
# final_val_loss, final_em_score, final_f1_score = validate_bart_chatbot_after_training(model, tokenizer, val_loader, device)
# logging.info(f"Final Exact Match: {final_em_score}")
# logging.info(f"Final F1 Score: {final_f1_score}")

In [None]:
# print(f"Final Exact Match: {final_em_score}")
# print(f"Final final_val_loss: {final_val_loss}")
# print(f"Final F1 Score: {final_f1_score}")

## 12.Load the Fine-Tuned Model

In [None]:
# Save the fine-tuned model
model.save_pretrained('/content/drive/MyDrive/ChatBox/model/fine_tuned_bart')
tokenizer.save_pretrained('/content/drive/MyDrive/ChatBox/model/fine_tuned_bart')

Non-default generation parameters: {'forced_eos_token_id': 2}


('/content/drive/MyDrive/ChatBox/model/fine_tuned_bart/tokenizer_config.json',
 '/content/drive/MyDrive/ChatBox/model/fine_tuned_bart/special_tokens_map.json',
 '/content/drive/MyDrive/ChatBox/model/fine_tuned_bart/sentencepiece.bpe.model',
 '/content/drive/MyDrive/ChatBox/model/fine_tuned_bart/dict.txt',
 '/content/drive/MyDrive/ChatBox/model/fine_tuned_bart/added_tokens.json')

## 13. Example: Generating a Response to a User's Question

In [None]:
def generate_responses(input_text: str, num_responses=5) -> None:
    """
    Generate and print multiple responses for the input text using a pre-trained model and tokenizer.

    Args:
        input_text (str): Input text for generating responses.
        model: Pre-trained BART model.
        tokenizer: Tokenizer for BART.
        device (str): Device to run the model on ('cpu' or 'cuda'). Defaults to 'cpu'.
        max_length (int): Maximum length of the generated response. Defaults to 1024.
        num_responses (int): Number of responses to generate. Defaults to 5.
    """
    # Tokenize input text and move tensors to device
    inputs = tokenizer.encode_plus(input_text, return_tensors='pt',
                                  truncation=True, max_length=50)
    input_ids = inputs["input_ids"].to(device) # Use the device argument directly
    attention_mask = inputs["attention_mask"].to(device)

    # Move the model to the device
    model.to(device)

    # Generate response using the model
    with torch.no_grad():
         output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=200,
            num_beams=5,  # Use beam search
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    # Decode the generated response
    response = tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

    return response

In [None]:
# Example
user_question = "rào_cản tự_nhiên chống lại nhiễm_trùng về niêm_mạc"
response = generate_responses(user_question, 3)
print(f"Generated Response: {response}")

Generated Response: các và được có có_thể hoặc là trong của ở không thuốc bệnh_nhân bệnh khi một những với  đến thường do ví_dụ cho viêm ra điều_trị máu giảm để sự nếuy sau tăng sử_dụng bị gây bao_gồm i rối_loạn l khác hơn


#Use the Fine-Tuned Model in the Chatbox

In [None]:
# Use AutoTokenizer to ensure compatibility with BARTPho
model = BartForConditionalGeneration.from_pretrained('/content/drive/MyDrive/ChatBox/model/fine_tuned_bart')
tokenizer = BartphoTokenizer.from_pretrained('/content/drive/MyDrive/ChatBox/model/fine_tuned_bart')