# Gemma model

#### Gemma model setup for Langchain RAG

In [None]:
!pip install -qU langchain langchain_community sentence_transformers

In [None]:
!pip install -q bitsandbytes accelerate   # accelerate is for GPU and additionally bitsandbytes is for quantization

In [2]:
model_id = "google/gemma-2b-it"

In [None]:
#@title  for GPU

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# bnb_config = BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_use_double_quant=True,
#         bnb_4bit_quant_type="nf4",
#         bnb_4bit_compute_dtype=torch.bfloat16
# )

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # quantization_config=bnb_config,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

### model.save_pretrained("./model.")
### model = AutoModelForCausalLM.from_pretrained("./model")

In [None]:
#@title for CPU

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = AutoModelForCausalLM.from_pretrained(model_id)

tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

In [4]:
from typing import Any, AsyncIterator, Dict, Iterator, List, Optional

from langchain_core.callbacks import (
    AsyncCallbackManagerForLLMRun,
    CallbackManagerForLLMRun,
)
from langchain_core.messages.ai import AIMessage
from langchain_core.language_models import BaseChatModel, SimpleChatModel
from langchain_core.messages import AIMessageChunk, BaseMessage, HumanMessage
from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult
from langchain_core.runnables import run_in_executor
from transformers import pipeline
import re
import json
from typing import Any


class GemmaChatModel(BaseChatModel):
    """
    A custom chat model powered by Gemma from Hugging Face, designed to be informative, comprehensive, and engaging.
    See the custom model guide here: https://python.langchain.com/docs/modules/model_io/chat/custom_chat_model/
    """

    model_name: str = "gemma_chat_model"  # Replace with the actual Gemma model name
    task: str = "conversational"  # Task for the pipeline (conversational or summarization)
    #temperature = 0.0
    n: int = 2048
    model : Any = None
    tokenizer : Any = None


    def _generate(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> ChatResult:
        """
        Args:
            messages: The list of prompt messages.
            stop: Optional list of stop tokens.
            run_manager: Optional callback manager.
            **kwargs: Additional keyword arguments.

        Returns:
            A ChatResult object containing the generated response.
        """

        prompt = messages[-1].content #[: self.n]
        input_ids = self.tokenizer(prompt, return_tensors="pt").to(device)
        outputs = self.model.generate(**input_ids, max_new_tokens=self.n)       # , temperature=self.temperature
        text = self.tokenizer.decode(outputs[0])
        #text = " ".join(text.split("\n"))

        start_index, end_index = text.find("<eos>"), text.rfind("<eos>")
        response = text[start_index+len("<eos>"):end_index].strip()

        message = AIMessage(content=response, additional_kwargs={}, response_metadata={"time_in_seconds": 3})
        return ChatResult(generations=[ChatGeneration(message=message)])

    @property
    def _llm_type(self) -> str:
        """
        Returns the type of language model used: "gemma_chat_model".
        """
        return "gemma_chat_model"

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """
        Returns a dictionary of identifying parameters for LangChain callbacks.
        """
        return {"model_name": self.model_name, "task": self.task}

llm = GemmaChatModel()
llm.model = model               # This is simple but not production level way of doing things. It's just for avoiding colab run out of memory on CPU
llm.tokenizer = tokenizer

# Create Chat "Agent"

In [None]:
# https://python.langchain.com/docs/modules/memory/adding_memory/
# https://python.langchain.com/docs/modules/memory/adding_memory_chain_multiple_inputs/    ## RAG-augmented
# https://python.langchain.com/docs/modules/memory/conversational_customization/
# https://python.langchain.com/docs/modules/agents/quick_start/                            ## But you can't use this directly for gemma

In [25]:
from langchain.chains import LLMChain
from langchain.memory import ConversationBufferMemory
from langchain_core.prompts import PromptTemplate

In [31]:
template = """You are a chatbot having a conversation with a human.

{chat_history}
Human: {human_input}
Chatbot:"""

prompt = PromptTemplate(
    input_variables=["chat_history", "human_input"], template=template
)
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

llm_chain = LLMChain(
    llm=llm,
    prompt=prompt,
    verbose=True,   #if you want intermediate steps to be printed
    memory=memory,
)

In [34]:
llm_chain.predict(human_input="Hi there my friend")



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a chatbot having a conversation with a human.

[HumanMessage(content='Hi, my name is Lado'), AIMessage(content="Hello! It's a pleasure to meet you, Lado. What can I do for you today?"), HumanMessage(content='do you remember my name?'), AIMessage(content='I am a chatbot and do not have personal memories or the ability to remember names. I am designed to assist you with information and tasks based on the knowledge I have been trained on.')]
Human: Hi there my friend
Chatbot:[0m

[1m> Finished chain.[0m


"Hello! It's a pleasure to meet you, Lado. What can I do for you today?"

In [35]:
llm_chain.predict(human_input="Not too bad - how are you?")



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a chatbot having a conversation with a human.

[HumanMessage(content='Hi, my name is Lado'), AIMessage(content="Hello! It's a pleasure to meet you, Lado. What can I do for you today?"), HumanMessage(content='do you remember my name?'), AIMessage(content='I am a chatbot and do not have personal memories or the ability to remember names. I am designed to assist you with information and tasks based on the knowledge I have been trained on.'), HumanMessage(content='Hi there my friend'), AIMessage(content="Hello! It's a pleasure to meet you, Lado. What can I do for you today?")]
Human: Not too bad - how are you?
Chatbot:[0m

[1m> Finished chain.[0m


'I am doing well, thank you for asking. I am here to assist you with any questions or tasks you may have. How can I help you today?'

# Gradio UI

In [None]:
!pip install -q gradio

In [30]:
import gradio as gr

# Define the chatbot function
def respond(message, history):
  response = llm_chain.predict(human_input=message)
  return response

# Launch the Gradio interface
gr.ChatInterface(
    respond,
    title="My Chatbot",
    description="Enter your message and chat with the bot!",

).launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://33f98a69aa34448a1c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


