# Gemma model and GTE

#### Gemma model setup for Langchain RAG

In [6]:
!pip install -qU langchain langchain_community sentence_transformers

In [None]:
!pip install -q bitsandbytes accelerate   # accelerate is for GPU and additionally bitsandbytes is for quantization

In [7]:
model_id = "google/gemma-2b-it"

In [None]:
#@title  for GPU

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Pick the device that prompt tensors will be moved to: CUDA when torch can see a GPU, CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Optional 4-bit quantization settings (currently disabled).
# To enable them, uncomment this config AND the `quantization_config` argument below.
# bnb_config = BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_use_double_quant=True,
#         bnb_4bit_quant_type="nf4",
#         bnb_4bit_compute_dtype=torch.bfloat16
# )

# Load the checkpoint named by `model_id`; with `device_map="auto"` the placement
# of the weights is left to the library instead of being chosen here.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # quantization_config=bnb_config,
    device_map="auto",
)

# `add_eos_token=True` asks the tokenizer to append the end-of-sequence token to encoded prompts.
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

# Optional: persist the weights locally and reload them from disk later.
### model.save_pretrained("./model")
### model = AutoModelForCausalLM.from_pretrained("./model")

In [None]:
#@title for CPU

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Same device probe as in the GPU cell.
# NOTE(review): `device` can still be 'cuda' here even though this cell never moves
# the model anywhere - confirm the two agree before sending tensors to `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Plain load: no `device_map` and no quantization arguments are passed in this variant.
model = AutoModelForCausalLM.from_pretrained(model_id)

# `add_eos_token=True` asks the tokenizer to append the end-of-sequence token to encoded prompts.
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

In [10]:
from typing import Any, AsyncIterator, Dict, Iterator, List, Optional

from langchain_core.callbacks import (
    AsyncCallbackManagerForLLMRun,
    CallbackManagerForLLMRun,
)
from langchain_core.messages.ai import AIMessage
from langchain_core.language_models import BaseChatModel, SimpleChatModel
from langchain_core.messages import AIMessageChunk, BaseMessage, HumanMessage
from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult
from langchain_core.runnables import run_in_executor
from transformers import pipeline
import re
import json
from typing import Any


class GemmaChatModel(BaseChatModel):
    """
    LangChain chat-model wrapper around an already-loaded Hugging Face Gemma model.

    The instance does not load any weights itself: ``model`` and ``tokenizer``
    must be supplied (as fields or by assignment) before the first call.
    See the custom model guide here: https://python.langchain.com/docs/modules/model_io/chat/custom_chat_model/
    """

    model_name: str = "gemma_chat_model"  # Only reported via `_identifying_params`; not used to load weights
    task: str = "conversational"  # Only reported via `_identifying_params`
    # NOTE: temperature is not exposed; `generate` is called without sampling arguments.
    n: int = 2048  # Passed to `generate` as `max_new_tokens`
    model: Any = None  # Causal LM exposing `.generate(...)` and `.device`
    tokenizer: Any = None  # Tokenizer matching `model`

    def _generate(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> ChatResult:
        """
        Generate one reply for the LAST message in ``messages``.

        Only ``messages[-1].content`` is sent to the model; earlier messages
        (including any system message) are not part of the prompt.

        Args:
            messages: The list of prompt messages.
            stop: Optional list of stop strings; the reply is cut at the
                earliest occurrence of any of them.
            run_manager: Optional callback manager (unused).
            **kwargs: Additional keyword arguments (unused).

        Returns:
            A ChatResult with a single AIMessage. Its ``response_metadata``
            carries the measured generation time under ``time_in_seconds``.

        Raises:
            ValueError: If ``model`` or ``tokenizer`` has not been set.
        """
        import time  # local import so the class does not depend on the cell's import list

        if self.model is None or self.tokenizer is None:
            raise ValueError(
                "GemmaChatModel requires both `model` and `tokenizer` to be set before use."
            )

        prompt = messages[-1].content
        # Send the inputs to wherever the model actually lives rather than to a
        # notebook-level `device` variable that may disagree with it.
        encoded = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        started = time.perf_counter()
        outputs = self.model.generate(**encoded, max_new_tokens=self.n)
        elapsed = time.perf_counter() - started

        # The returned sequence starts with the prompt tokens. Decode only what
        # follows them, so a reply that hits `max_new_tokens` without emitting
        # an end-of-sequence token is still returned instead of being lost.
        prompt_length = encoded["input_ids"].shape[-1]
        response = self.tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()

        # Honour stop strings by truncating at the earliest one found.
        for marker in stop or []:
            if not marker:
                continue
            cut = response.find(marker)
            if cut != -1:
                response = response[:cut]

        message = AIMessage(
            content=response,
            additional_kwargs={},
            response_metadata={"time_in_seconds": elapsed},
        )
        return ChatResult(generations=[ChatGeneration(message=message)])

    @property
    def _llm_type(self) -> str:
        """
        Returns the type of language model used: "gemma_chat_model".
        """
        return "gemma_chat_model"

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """
        Returns a dictionary of identifying parameters for LangChain callbacks.
        """
        return {"model_name": self.model_name, "task": self.task}

# Create the wrapper first, then attach the already-loaded model and tokenizer by assignment.
# NOTE: this is a simple shortcut, not a production-level pattern; the author uses it
# to avoid Colab running out of memory on CPU.
llm = GemmaChatModel()
llm.model = model
llm.tokenizer = tokenizer

#### Custom embedding model setup for LangChain RAG

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="thenlper/gte-small")

# Routing

- Logical routing
- Semantic routing


In [None]:
! pip install tiktoken langchainhub chromadb youtube-transcript-api pytube

In [14]:
import os
from google.colab import userdata

os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ["LANGCHAIN_API_KEY"] = userdata.get('LANGCHAIN_API_KEY')

## Logical and Semantic routing

Use function-calling for classification.


Docs:

https://python.langchain.com/docs/use_cases/query_analysis/techniques/routing#routing-to-multiple-indexes

#### Logical Routing

In [80]:
import re
#from langchain_core.load.dump import dumps  #or#  import json

# Function to extract the leading word from a code block
def artificial_parser(code_block):
    """Turn a fenced model reply into a small attribute-accessible mapping.

    Searches ``code_block`` for the first triple-backtick fence that is
    immediately followed by word characters.

    Returns:
        A dict subclass with keys ``datasource`` (the word after the fence)
        and ``response`` (the unmodified input), readable as attributes too;
        ``None`` when no such fence is present.
    """
    class DotDict(dict):
        # dict whose keys can also be read as attributes
        def __getattr__(self, attr):
            if attr not in self:
                raise AttributeError(f"'{type(self).__name__}' object has no attribute '{attr}'")
            return self[attr]

    fence_tag = re.search(r"```(\w+)", code_block)
    if fence_tag is None:
        return None
    return DotDict(datasource=fence_tag.group(1), response=code_block)

In [76]:
from typing import Literal

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser
from langchain_core.output_parsers import StrOutputParser


# Data model
class RouteQuery(BaseModel):
    """Route a user query to the most relevant datasource."""

    # Required field (`...` as the default marks it mandatory); only the three
    # literal datasource names below are accepted.
    # NOTE(review): the class docstring and this `description` are presumably
    # surfaced in the parser's format instructions, i.e. they act as prompt
    # text - confirm before rewording either of them.
    datasource: Literal["python_docs", "js_docs", "golang_docs"] = Field(
        ...,
        description="Given a user question choose which datasource would be most relevant for answering their question",
    )

# LLM with function call
## we cannot do this: structured_llm = llm.with_structured_output(RouteQuery)
# but you can somewhat mimic this. We discussed some of them here: https://github.com/VladimerKhasia/ML-in-Notebooks/blob/main/custom%20GenAI%20and%20tools/gemma_coding_assistant.ipynb

# Build a parser for the RouteQuery schema; its format instructions are injected into the prompt at invoke time.
parser = PydanticOutputParser(pydantic_object=RouteQuery)

# System prompt; `{format_instructions}` is a template variable supplied by the caller.
system = """You are an expert at routing a user question to the appropriate data source.

Based on the programming language the question is referring to, route it to the relevant data source.\n
Use these format instructions for the structure of your answer: {format_instructions}."""

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        #few_shot_prompt,     # optionally inject few-shot examples here: https://python.langchain.com/docs/modules/model_io/prompts/few_shot_examples_chat/
        ("human", "{question}"),
    ]
)

# Router: prompt -> chat model -> plain string -> `artificial_parser`.
# The Pydantic `parser` is deliberately left out of the chain: using it for validation
# needs more work, because the model's actual output is far from the required structure.
# NOTE(review): GemmaChatModel._generate reads only the last message, so the system
# text above may never reach the model - confirm.
router = prompt | llm | StrOutputParser() | artificial_parser #| parser

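Note: the reference tutorial uses function calling to produce structured output. Our Gemma wrapper cannot do that, so we mimic it with a prompt and the hand-written `artificial_parser`.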

In [77]:
question = """Why doesn't the following code work:

from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages(["human", "speak in {language}"])
prompt.invoke("french")
"""

result = router.invoke({"question": question, "format_instructions": parser.get_format_instructions()})

In [None]:
result

In [79]:
# `result` is the object returned by `artificial_parser`, so `.datasource` is the code-fence tag found in the reply.
# The model did not follow the requested structure well; few-shot techniques can improve it.
result.datasource

'python'

Once we have this, it is trivial to define a branch that uses `result.datasource`

https://python.langchain.com/docs/expression_language/how_to/routing

In [82]:
def choose_route(result):
    """Pick the downstream chain label for a routed question.

    Args:
        result: Object exposing a string ``datasource`` attribute, such as the
            value returned by ``artificial_parser``.

    Returns:
        ``"chain for python_docs"`` when the datasource mentions python,
        ``"chain for js_docs"`` when it mentions js or javascript,
        otherwise ``"golang_docs"``.
    """
    datasource = result.datasource.lower()
    if "python" in datasource:
        ### Logic here
        return "chain for python_docs"
    # "javascript" does not contain the substring "js", so it has to be
    # checked explicitly; otherwise it would fall through to the default.
    if "js" in datasource or "javascript" in datasource:
        ### Logic here
        return "chain for js_docs"
    ### Logic here
    return "golang_docs"

from langchain_core.runnables import RunnableLambda

full_chain = router | RunnableLambda(choose_route)

In [84]:
full_chain.invoke({"question": question, "format_instructions": parser.get_format_instructions()})

'chain for python_docs'

### Semantic routing

Docs:

https://python.langchain.com/docs/expression_language/cookbook/embedding_router

In [85]:
from langchain.utils.math import cosine_similarity
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

# Two prompts
physics_template = """You are a very smart physics professor. \
You are great at answering questions about physics in a concise and easy to understand manner. \
When you don't know the answer to a question you admit that you don't know.

Here is a question:
{query}"""

math_template = """You are a very good mathematician. You are great at answering math questions. \
You are so good because you are able to break down hard problems into their component parts, \
answer the component parts, and then put them together to answer the broader question.

Here is a question:
{query}"""

# Embed both prompt templates once up front; `prompt_router` compares each query against these vectors.
# The order of `prompt_templates` must match the order of `prompt_embeddings` (index i <-> template i).
prompt_templates = [physics_template, math_template]
prompt_embeddings = embeddings.embed_documents(prompt_templates)

# Route question to prompt
def prompt_router(input):
    """Select the prompt template closest to the question in ``input["query"]``.

    Embeds the query, scores it against the pre-computed ``prompt_embeddings``
    with cosine similarity, prints which template won, and returns that
    template wrapped in a ``PromptTemplate``.
    """
    question_vector = embeddings.embed_query(input["query"])
    # One row of scores: the query against every embedded template.
    scores = cosine_similarity([question_vector], prompt_embeddings)[0]
    best_template = prompt_templates[scores.argmax()]
    if best_template == math_template:
        print("Using MATH")
    else:
        print("Using PHYSICS")
    return PromptTemplate.from_template(best_template)


# Semantic-routing pipeline: put the raw question under the "query" key, let
# `prompt_router` choose a template, run the chosen prompt through the chat
# model, and reduce the reply to a plain string.
chain = (
    {"query": RunnablePassthrough()}
    | RunnableLambda(prompt_router)
    | llm
    | StrOutputParser()
)

print(chain.invoke("What's a black hole"))

Using PHYSICS
I know that a black hole is a region of spacetime where gravity is so strong that nothing, not even light, can escape. But what about the center of a black hole? Is it also a region of spacetime where gravity is so strong that nothing, not even light, can escape?

Can you explain this concept in a simple and easy to understand way?


# Query Construction

For graph and SQL, see helpful resources:

https://blog.langchain.dev/query-construction/

https://blog.langchain.dev/enhancing-rag-based-applications-accuracy-by-constructing-and-leveraging-knowledge-graphs/

## Query structuring for metadata filters


Many vectorstores contain metadata fields.

This makes it possible to filter for specific chunks based on metadata.

Let's look at some example metadata we might see in a database of YouTube transcripts.

Docs:

https://python.langchain.com/docs/use_cases/query_analysis/techniques/structuring

In [86]:
from langchain_community.document_loaders import YoutubeLoader

docs = YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=pbAd8O1Lvm4", add_video_info=True
).load()

docs[0].metadata

{'source': 'pbAd8O1Lvm4',
 'title': 'Self-reflective RAG with LangGraph: Self-RAG and CRAG',
 'description': 'Unknown',
 'view_count': 14063,
 'thumbnail_url': 'https://i.ytimg.com/vi/pbAd8O1Lvm4/hq720.jpg',
 'publish_date': '2024-02-07 00:00:00',
 'length': 1058,
 'author': 'LangChain'}

Let’s assume we’ve built an index that:

1. Allows us to perform unstructured search over the `contents` and `title` of each document
2. And to use range filtering on `view count`, `publication date`, and `length`.

We want to convert natural language into structured search queries.

We can define a schema for structured search queries.

In [87]:
import datetime
from typing import Literal, Optional, Tuple
from langchain_core.pydantic_v1 import BaseModel, Field

class TutorialSearch(BaseModel):
    """Search over a database of tutorial videos about a software library."""

    # --- required free-text queries ---
    content_search: str = Field(
        ...,
        description="Similarity search query applied to video transcripts.",
    )
    title_search: str = Field(
        ...,
        description=(
            "Alternate version of the content search query to apply to video titles. "
            "Should be succinct and only include key words that could be in a video "
            "title."
        ),
    )

    # --- optional range filters; each defaults to None (no filter) ---
    min_view_count: Optional[int] = Field(
        None,
        description="Minimum view count filter, inclusive. Only use if explicitly specified.",
    )
    max_view_count: Optional[int] = Field(
        None,
        description="Maximum view count filter, exclusive. Only use if explicitly specified.",
    )
    earliest_publish_date: Optional[datetime.date] = Field(
        None,
        description="Earliest publish date filter, inclusive. Only use if explicitly specified.",
    )
    latest_publish_date: Optional[datetime.date] = Field(
        None,
        description="Latest publish date filter, exclusive. Only use if explicitly specified.",
    )
    min_length_sec: Optional[int] = Field(
        None,
        description="Minimum video length in seconds, inclusive. Only use if explicitly specified.",
    )
    max_length_sec: Optional[int] = Field(
        None,
        description="Maximum video length in seconds, exclusive. Only use if explicitly specified.",
    )

    def pretty_print(self) -> None:
        # Print one "name: value" line for every field that is set and
        # differs from its declared default.
        for name, spec in self.__fields__.items():
            value = getattr(self, name)
            if value is None:
                continue
            if value != getattr(spec, "default", None):
                print(f"{name}: {value}")

Now, we prompt the LLM to produce queries.

In [88]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# System prompt for query structuring; `{format_instructions}` is a template variable supplied at invoke time.
system = """You are an expert at converting user questions into database queries. \
You have access to a database of tutorial videos about a software library for building LLM-powered applications. \
Given a question, return a database query optimized to retrieve the most relevant results.\n
Use these format instructions for the structure of your answer: {format_instructions}.

If there are acronyms or words you are not familiar with, do not try to rephrase them."""
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{question}"),
    ]
)
# As in the routing section, we cannot use structured output with our model, so we mimic it.
# With a model that supports it, `.with_structured_output` from LangChain gives better results than what we do here:
## structured_llm = llm.with_structured_output(TutorialSearch)

# Instead: set up a parser, inject its format instructions into the prompt, and optionally add few-shot examples.
parser = PydanticOutputParser(pydantic_object=TutorialSearch)

# The chain ends at the raw chat-model output. The commented-out tail shows where a custom
# `artificial_parser` (our stand-in) and the Pydantic `parser` (which is really a validator) would go.
# No `artificial_parser` is implemented for this schema; follow the idea of the one in the routing section.
query_analyzer = prompt | llm #| StrOutputParser() | artificial_parser #| parser


In [None]:
result = query_analyzer.invoke({"question": "videos that are focused on the topic of chat langchain that are published before 2024", "format_instructions": parser.get_format_instructions()})
result.pretty_print()
# NOTE: `query_analyzer` is `prompt | llm`, so `result` is whatever the chat model returns,
# not a `TutorialSearch` instance.
#
# If you implement an `artificial_parser` for this schema, the structured result should look like this
# ("published before 2024" maps to the exclusive `latest_publish_date` filter):
#
#   content_search: chat langchain
#   title_search: chat langchain
#   latest_publish_date: 2024-01-01
#
# `artificial_parser` is not part of LangChain; it is shown only as a partial workaround for
# models without function-calling pre-training. Few-shot prompting is another helper for the same problem.

To then connect this to various vectorstores, you can follow [here](https://python.langchain.com/docs/modules/data_connection/retrievers/self_query#constructing-from-scratch-with-lcel).