# Installing required libraries
I used the LangChain framework to build this chatbot and Bark for text-to-speech conversion.

In [None]:
!pip install -qqq transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -qqq datasets bitsandbytes
!pip install -qqq torch
!pip install -qqq langchain

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.1/519.1 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

# Importing all dependencies
1. PEFT loads the adapter that I fine-tuned from the Hugging Face Hub.
2. Transformers loads the base model (Llama-2-7B) as well as the Bark model.
3. LangChain creates the prompt template and makes the step-by-step process easy to handle.
4. IPython plays the audio in Google Colab.

In [None]:
import bitsandbytes as bnb
import pandas as pd
import torch
from peft import (
    PeftConfig,
    PeftModel,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextStreamer,
    pipeline,
    AutoProcessor,
    AutoModel
)
from langchain.llms import HuggingFacePipeline
from langchain import LLMChain, PromptTemplate
from IPython.display import Audio, display

# Ignores warnings
import warnings
warnings.filterwarnings('ignore')

# Loading the model
I used 4-bit quantization to load the model, along with double quantization.

In [None]:
# Quantization settings: 4-bit NF4 weights with double quantization,
# using float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

The fine-tuned QLoRA adapter can be found [here](https://huggingface.co/Phoenix10062002/llama2-faq-chatbot). I created it by fine-tuning the Llama-2-7B model on an e-commerce FAQ dataset, which can be found [here](https://huggingface.co/datasets/Andyrasika/Ecommerce_FAQ).

In [None]:
PEFT_MODEL = "Phoenix10062002/llama2-faq-chatbot"

# The adapter configuration names the base checkpoint the QLoRA was built on.
config = PeftConfig.from_pretrained(PEFT_MODEL)

# Base model, loaded with the quantization settings defined above.
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to run locally - confirm the base model actually requires it.
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=bnb_config,
    device_map="auto",
    return_dict=True,
    trust_remote_code=True,
)

# Tokenizer of the base model; its EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

# Put the fine-tuned QLoRA adapter on top of the base model.
model = PeftModel.from_pretrained(model, PEFT_MODEL)

Downloading (…)/adapter_config.json:   0%|          | 0.00/456 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/626 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/14 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00014.bin:   0%|          | 0.00/981M [00:00<?, ?B/s]

Downloading (…)l-00002-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00003-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00004-of-00014.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00005-of-00014.bin:   0%|          | 0.00/944M [00:00<?, ?B/s]

Downloading (…)l-00006-of-00014.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00007-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00008-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00009-of-00014.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00010-of-00014.bin:   0%|          | 0.00/944M [00:00<?, ?B/s]

Downloading (…)l-00011-of-00014.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00012-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00013-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00014-of-00014.bin:   0%|          | 0.00/847M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/676 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading adapter_model.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

# Setting the generation configuration

In [None]:
# The model's own generation config is edited in place, so these values are
# also what the model uses by default.
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id

# This stops the model from over generating.
# The stop ids are tokenized without special tokens so BOS is not treated as a
# stop token, and the tokenizer's real EOS id is kept in the list so the model
# still stops when it emits its normal end-of-sequence token.
stop_token_ids = tokenizer('### Assistant', add_special_tokens=False)["input_ids"]
generation_config.eos_token_id = [tokenizer.eos_token_id, *stop_token_ids]

# Creating a HuggingFace Pipeline

In [None]:
# Text-generation pipeline built around the adapter-wrapped model.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    generation_config=generation_config,
    do_sample=True,             # Sampling: the same input can give different outputs
    repetition_penalty=1.15,
)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'O

# Using Langchain to create the chain
I tried to add memory to the chain but most of the time it didn't work properly, so I decided to remove it. Maybe more training or using a larger model would be better.

In [None]:
# Prompt skeleton; {human_input} is the only placeholder
template = """
### Instruction: You are an e-commerce chatbot named Helpie. Answer user queries and be respectful.
If you don't know any answer just say you don't know.

### Human: {human_input}
### Assistant:
""".strip()

prompt = PromptTemplate(template=template, input_variables=['human_input'])

# Wrap the HuggingFace pipeline as a Langchain LLM and bind it to the prompt
llm = HuggingFacePipeline(pipeline=pipe)
chain = LLMChain(prompt=prompt, llm=llm)


# Loading the BARK model
BARK is a text-to-speech model available on the Hugging Face Hub. Here I am using `bark-small`, which has fewer parameters, so that it fits in Google Colab's memory.
The model can be found [here](https://huggingface.co/suno/bark-small).

In [None]:
# The processor and the model are both loaded from the same Bark checkpoint.
BARK_CHECKPOINT = "suno/bark-small"
processor = AutoProcessor.from_pretrained(BARK_CHECKPOINT)
tts_model = AutoModel.from_pretrained(BARK_CHECKPOINT)

Downloading (…)okenizer_config.json:   0%|          | 0.00/353 [00:00<?, ?B/s]

Downloading (…)embeddings_path.json:   0%|          | 0.00/61.1k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/8.80k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/4.91k [00:00<?, ?B/s]

In [None]:
# Use the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
  device = "cuda:0"
else:
  device = "cpu"
tts_model = tts_model.to(device)

# Defining the utility functions
1. `generate_response` - This takes the human input as a parameter and returns the generated output from the LLM.
2. `get_voice` - This takes the response as a parameter and converts it to speech. This function plays the audio as well.

In [None]:
def generate_response(human_input):
  """Return the chatbot's answer to `human_input`.

  The chain is run once and the completion is cut at the first occurrence of
  a newline followed by `###`, which drops any extra turns generated after
  the answer. When that marker is absent the completion is returned unchanged.

  Args:
    human_input: The user's message, inserted into the prompt template.

  Returns:
    The generated answer as a string.
  """
  response = chain.predict(human_input=human_input)

  # partition() keeps the whole text when the marker is missing; slicing with
  # find() would use -1 in that case and cut off the last character.
  answer, _, _ = response.partition("\n###")       # Removes any extra generations
  return answer

In [None]:
def get_voice(response, voice_preset="v2/en_speaker_8"):
  """Convert `response` to speech with BARK and play it in the notebook.

  Args:
    response: Text to speak. Empty or whitespace-only text is skipped.
    voice_preset: BARK voice preset handed to the processor. The default is
      the preset male voice this notebook has always used.

  Returns:
    None. The audio is shown through `IPython.display`.
  """
  if not response or not response.strip():
    return                                            # Nothing to synthesize

  inputs = processor(response, voice_preset=voice_preset)
  speech_values = tts_model.generate(**inputs.to(device), do_sample=True)
  sampling_rate = tts_model.generation_config.sample_rate

  # Plays the first generated waveform in Colab
  display(Audio(speech_values[0].cpu().numpy(), rate=sampling_rate, autoplay=True))


# Creating the final chatbot
The `chat_bot` function brings everything together. It runs an infinite loop that asks for user input and calls the `generate_response` and `get_voice` functions.
It also prints the output.<br>
**To exit the chatbot type 'Bye'.**

In [None]:
def chat_bot():
  """Run the interactive chat loop until the user types 'Bye'.

  Every non-empty message is answered by `generate_response`, spoken by
  `get_voice` and then printed. Surrounding whitespace is ignored, blank
  messages are skipped, and the loop also ends when input is closed or
  interrupted.
  """
  while True:
    try:
      human_input = input("You: ").strip()        # Takes input from user
    except (EOFError, KeyboardInterrupt):         # Input closed or cell interrupted
      return

    if human_input.lower() == "bye":              # Type 'Bye' to exit
      return
    if not human_input:                           # Nothing to answer yet
      continue

    # Gets the response and plays the audio
    response = generate_response(human_input=human_input)
    get_voice(response)

    print("Helpie:", response)

# Wraps the output to fit the screen

In [None]:
from IPython.display import HTML, display

def set_css(info=None):
  """Display a style rule that makes `<pre>` output wrap instead of overflowing.

  Args:
    info: Ignored. Accepted so the function works as an IPython event
      callback whether or not the event passes an argument; calling it
      with no arguments still works.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
# Registers set_css as a callback for IPython's 'pre_run_cell' event.
get_ipython().events.register('pre_run_cell', set_css)

# Run this cell to start the Chatbot
BARK generates an internal warning message. I could not find a fix for it online, so please ignore the message.<br>
**Type 'Bye' to stop the chatbot**

In [None]:
# Starts the interactive chat loop; typing 'Bye' ends it.
chat_bot()

You: hello what is your name?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


Helpie:  My name is Helpie, the friendly chatbot assistant who can assist you with your inquiries regarding our products or services. Feel free to ask me anything related to your order, shipping details, returns policy, payment methods, etc. I am always happy to help.

You: What do you think of this website?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


Helpie:  We strive to provide our customers with a seamless shopping experience, offering high-quality products at competitive prices. Our team is always available for assistance if you have any questions or concerns during the ordering process. Thank you for your feedback!
You: bye
