<a href="https://colab.research.google.com/github/TuliDas/Healthcare-Customer-Support-ChatBot/blob/main/healthcare__chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import Tools**
Note: This notebook uses T5, a text-to-text model that is well suited to text generation.

In [5]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments


In [6]:
# Load the query/response dataset. The path points at /content, i.e. the file is
# expected to have been uploaded to the Colab session before this cell runs.
df = pd.read_csv("/content/domain_specific_chatbot_data.csv")
# Preview the first rows to confirm the file was parsed as expected.
df.head()

Unnamed: 0,query,response,intent,domain
0,What are the side effects of the COVID-19 vacc...,Common side effects of the COVID-19 vaccine in...,side effects inquiry,healthcare
1,How can I schedule an appointment with my doctor?,You can schedule an appointment by calling our...,appointment booking,healthcare
2,What should I do if I miss a dose of my medica...,"If you miss a dose, take it as soon as you rem...",medication inquiry,healthcare
3,How can I check my account balance?,You can check your balance by logging into you...,balance inquiry,finance
4,What is the interest rate for a personal loan?,The current interest rate for personal loans i...,loan inquiry,finance


**Data Preprocessing**

In [7]:
# Full text of the first query (the table preview above truncates long strings).
df.loc[0, 'query']

'What are the side effects of the COVID-19 vaccine?'

In [8]:
# Full text of the response paired with the first query.
df.loc[0, 'response']

'Common side effects of the COVID-19 vaccine include soreness at the injection site, fever, and fatigue.'

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation and train on the remaining 80%.
# A fixed random_state makes the shuffle deterministic, so Restart & Run All
# reproduces the same split (and therefore comparable training results).
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_df.shape, val_df.shape

((2400, 4), (600, 4))

In [10]:
train_df
# Note: the split keeps the original row labels, so the index no longer starts at zero.

Unnamed: 0,query,response,intent,domain
1679,How do I apply for a student loan?,You can apply for a student loan by visiting o...,student loan application,finance
2073,How can I check my account balance?,You can check your balance by logging into you...,balance inquiry,finance
2292,What should I do if I miss a dose of my medica...,"If you miss a dose, take it as soon as you rem...",medication inquiry,healthcare
74,What is the interest rate for a personal loan?,The current interest rate for personal loans i...,loan inquiry,finance
2215,"I lost my credit card, what should I do?",Please contact our customer service immediatel...,lost card reporting,finance
...,...,...,...,...
540,What are the side effects of the COVID-19 vacc...,Common side effects of the COVID-19 vaccine in...,side effects inquiry,healthcare
1148,Can I make changes to my loan repayment schedule?,Changes to your loan repayment schedule can be...,loan repayment adjustment,finance
2056,What are the symptoms of flu?,"Flu symptoms include fever, cough, sore throat...",flu symptoms inquiry,healthcare
1567,How do I update my contact details on my account?,"To update your contact details, log into your ...",contact update,finance


In [13]:
# The shuffled splits still carry their original row labels; give each split a
# fresh 0..n-1 index and discard the old labels.
train_data, validation_data = (
    split.reset_index(drop=True) for split in (train_df, val_df)
)

validation_data

Unnamed: 0,query,response,intent,domain
0,How can I schedule an appointment with my doctor?,You can schedule an appointment by calling our...,appointment booking,healthcare
1,What is the interest rate for a personal loan?,The current interest rate for personal loans i...,loan inquiry,finance
2,Can I make changes to my loan repayment schedule?,Changes to your loan repayment schedule can be...,loan repayment adjustment,finance
3,What are the symptoms of flu?,"Flu symptoms include fever, cough, sore throat...",flu symptoms inquiry,healthcare
4,What are the symptoms of flu?,"Flu symptoms include fever, cough, sore throat...",flu symptoms inquiry,healthcare
...,...,...,...,...
595,What should I do if I miss a dose of my medica...,"If you miss a dose, take it as soon as you rem...",medication inquiry,healthcare
596,What is the interest rate for a personal loan?,The current interest rate for personal loans i...,loan inquiry,finance
597,"I lost my credit card, what should I do?",Please contact our customer service immediatel...,lost card reporting,finance
598,Can I make changes to my loan repayment schedule?,Changes to your loan repayment schedule can be...,loan repayment adjustment,finance


In [14]:
# cleaning text
import re

def clean_text(text):
    """Normalize a raw query/response string before tokenization.

    The text is lower-cased, markup tags (``<...>``) are removed, every run of
    whitespace (spaces, tabs, CR/LF) is collapsed to a single space, and
    leading/trailing whitespace is trimmed.

    Args:
        text: The raw value. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()``.

    Returns:
        The cleaned string; ``""`` for missing input.
    """
    # Missing cells would otherwise crash on string methods.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # Collapse whitespace first so a tag that spans several lines sits on one
    # line and is matched by the (non-DOTALL) tag pattern below.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'<.*?>', '', text)  # drop XML/HTML tags
    # Removing a tag can leave doubled or edge spaces ("a <br> b" -> "a  b"),
    # so normalize once more and trim last.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Normalize both text columns of both splits with clean_text.
for split in (train_data, validation_data):
    for text_column in ('query', 'response'):
        split[text_column] = split[text_column].apply(clean_text)

# Show the cleaned validation split.
validation_data

Unnamed: 0,query,response,intent,domain
0,how can i schedule an appointment with my doctor?,you can schedule an appointment by calling our...,appointment booking,healthcare
1,what is the interest rate for a personal loan?,the current interest rate for personal loans i...,loan inquiry,finance
2,can i make changes to my loan repayment schedule?,changes to your loan repayment schedule can be...,loan repayment adjustment,finance
3,what are the symptoms of flu?,"flu symptoms include fever, cough, sore throat...",flu symptoms inquiry,healthcare
4,what are the symptoms of flu?,"flu symptoms include fever, cough, sore throat...",flu symptoms inquiry,healthcare
...,...,...,...,...
595,what should i do if i miss a dose of my medica...,"if you miss a dose, take it as soon as you rem...",medication inquiry,healthcare
596,what is the interest rate for a personal loan?,the current interest rate for personal loans i...,loan inquiry,finance
597,"i lost my credit card, what should i do?",please contact our customer service immediatel...,lost card reporting,finance
598,can i make changes to my loan repayment schedule?,changes to your loan repayment schedule can be...,loan repayment adjustment,finance


**TOKENIZATION**

In [15]:
# Load the pretrained tokenizer for the 't5-base' checkpoint; it is used below to
# encode both the queries (model inputs) and the responses (labels).
tokenizer = T5Tokenizer.from_pretrained('t5-base')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [16]:
# preprocessing function

def preprocess_function(examples, tok=None, max_length=250):
    """Tokenize one query/response pair (or a batch) for seq2seq fine-tuning.

    Args:
        examples: Mapping with ``'query'`` and ``'response'`` entries, e.g. a
            DataFrame row when used with ``DataFrame.apply(..., axis=1)``.
        tok: Tokenizer to use. Defaults to the notebook-level ``tokenizer``.
        max_length: Length every sequence is padded/truncated to.

    Returns:
        The tokenizer output for the query (``input_ids``, ``attention_mask``)
        with an extra ``'labels'`` entry holding the response token ids, where
        padding positions are replaced by ``-100``.
    """
    tok = tokenizer if tok is None else tok

    inputs = tok(examples['query'], padding='max_length', truncation=True, max_length=max_length)
    outputs = tok(examples['response'], padding='max_length', truncation=True, max_length=max_length)

    pad_id = tok.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the index the cross-entropy loss ignores; without this the
        # model is also trained to predict the padding that fills most of
        # each fixed-length label sequence.
        return [-100 if token_id == pad_id else token_id for token_id in token_ids]

    label_ids = outputs['input_ids']
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        inputs['labels'] = [_mask_padding(sequence) for sequence in label_ids]
    else:
        # Row-wise call: a single flat id sequence.
        inputs['labels'] = _mask_padding(label_ids)
    return inputs

# Encode every row of each split; each row becomes one tokenized example.
train_dataset, validation_dataset = (
    split.apply(preprocess_function, axis=1) for split in (train_data, validation_data)
)

In [17]:
# Cleaned text of the first training query, for comparison with its token ids below.
train_data.loc[0, 'query']

'how do i apply for a student loan?'

In [18]:
# Each encoded example is dictionary-like:
# - 'input_ids' holds the token ids of the query
# - 'labels' holds the token ids of the response
train_dataset[0]

{'input_ids': [149, 103, 3, 23, 1581, 21, 3, 9, 1236, 2289, 58, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

**Fine-Tuning the Model**

In [None]:

# Load the pretrained 't5-base' checkpoint that will be fine-tuned.
model = T5ForConditionalGeneration.from_pretrained('t5-base')

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",          # output directory for checkpoints
    num_train_epochs=6,              # number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=8,    # batch size per device during evaluation
    warmup_steps=500,                # number of warmup steps for the learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir="./logs",            # directory for storing logs
    logging_steps=50,                # log training info every 50 steps
    save_steps=500,                  # save a model checkpoint every 500 steps
    eval_steps=50,                   # NOTE(review): per the TrainingArguments docs this applies to step-based evaluation; with "epoch" below it looks unused - confirm
    evaluation_strategy="epoch",     # run evaluation at the end of every epoch
)

# Trainer setup: fine-tune on the tokenized training split and evaluate on the
# tokenized validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset
)

# Train the model
trainer.train()



[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mtuli-rani-das[0m ([33mtuli-rani-das-khulna-university-of-engineering-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


Save and Load Model

In [None]:
# Write the fine-tuned weights and the tokenizer files into one directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./chatbot_model")

# Load both back from disk so the chatbot below runs on the saved artifacts.
model = T5ForConditionalGeneration.from_pretrained("./chatbot_model")
tokenizer = T5Tokenizer.from_pretrained("./chatbot_model")

ChatBot System

In [None]:
device = model.device

def chatbot(query):
  """Generate the model's reply to a single user query.

  The query goes through the same steps as the training data: it is cleaned
  with clean_text, tokenized (truncated to 250 tokens), and then decoded
  from a 5-beam search over the fine-tuned model.

  Args:
    query: Raw user text.

  Returns:
    The generated response as a plain string, without special tokens.
  """
  # cleaning the input query text
  query = clean_text(query)
  # tokenize the query
  encoded = tokenizer(query, return_tensors="pt", truncation=True,  max_length=250)

  # Move every tensor to the device the model currently lives on; generate()
  # fails if inputs and weights are on different devices.
  inputs = {key: value.to(model.device) for key, value in encoded.items()}

  # Pass the moved tensors, including the attention mask, rather than the
  # original un-moved input ids.
  outputs = model.generate(
        **inputs,
        max_length=250,
        num_beams=5,
        early_stopping=True
  )
  return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Console chat loop: type "exit" (any case, surrounding spaces ignored) to stop.
while True:
  try:
    user_input = input("You: ").strip()
  except (EOFError, KeyboardInterrupt):
    # Input stream closed or the cell was interrupted: leave the loop quietly.
    break
  if user_input.lower() == "exit":
    break
  if not user_input:
    # Nothing was typed; prompt again instead of asking the model to answer
    # an empty query.
    continue
  response = chatbot(user_input)
  print("Chatbot:", response)

Download Model to your PC

In [None]:
import shutil

# Pack the saved model directory into chatbot_model.zip in the working directory.
shutil.make_archive(base_name="chatbot_model", format='zip', root_dir="./chatbot_model")


from google.colab import files

# Hand the archive to the browser so it is saved on the local machine.
files.download("chatbot_model.zip")