<a target="_blank" href="">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [1]:
# Install necessary libs
!pip install transformers torch accelerate einops langchain xformers bitsandbytes huggingface_hub gpt4all sentence_transformers faiss-gpu
# !pip install faiss-gpu

Collecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading langchain-0.0.337-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m56.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xformers
  Downloading xformers-0.0.22.post7-cp310-cp310-manylinux2014_x86_64.whl (211.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.8/211.8 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.41.2.post2-py3-none-any.whl (92.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/

In [2]:
import transformers
from torch import cuda, bfloat16
from langchain import HuggingFacePipeline
import tqdm as notebook_tqdm
import os

In [3]:
import transformers
from torch import cuda, bfloat16

class LLMModel:
    """
    A wrapper class for language models used in text generation tasks, with support for quantization.

    Typical usage is ``LLMModel(name, token).create_text_generator()`` followed by
    ``generate_response(...)``; the individual ``init_*`` steps can also be called
    one by one.

    Parameters:
    - model_name (str): The name or identifier of the pretrained language model.
    - hf_token (str): Hugging Face authentication token for accessing private models.

    Attributes:
    - device (str): 'cuda:<index>' of the current CUDA device if available, otherwise 'cpu'.
      Informational only: model placement is delegated to ``device_map="auto"``.
    - quantization_config (transformers.BitsAndBytesConfig): Configuration for model quantization.
    - model (transformers.AutoModelForCausalLM): The pretrained language model.
    - tokenizer (transformers.AutoTokenizer): The tokenizer for the language model.
    - pipeline: transformers.Pipeline: A text generation pipeline.
    """

    def __init__(self, model_name, hf_token):
        """
        Class constructor. Only records settings; nothing is downloaded or loaded here.

        Args:
            model_name (str): The name or identifier of the pretrained language model.
            hf_token (str): Hugging Face authentication token for accessing private models.
        """
        self.model_name = model_name
        self.hf_token = hf_token
        self.device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
        self.quantization_config = None
        self.model = None
        self.tokenizer = None
        self.pipeline = None

    def init_quantitizing(self):
        """
        Initializes the quantization configuration for the model
        (4-bit NF4 with double quantization, bfloat16 compute dtype).
        """
        self.quantization_config = transformers.BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type='nf4',
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=bfloat16
        )

    def init_model(self, quantitizing=True):
        """
        Initializes the pretrained language model.

        Args:
            quantitizing (bool, optional): Whether to apply quantization. Defaults to True.
                If no quantization config has been built yet, one is created first, so
                calling this method on its own never silently loads an unquantized model.
        """
        if quantitizing and self.quantization_config is None:
            self.init_quantitizing()

        model_params = {
            "pretrained_model_name_or_path": self.model_name,
            # `token` replaces the deprecated `use_auth_token` argument.
            "token": self.hf_token,
            # NOTE(review): this allows code shipped with the model repository to be
            # executed locally; only use with repositories you trust.
            "trust_remote_code": True,
            "device_map": "auto",
        }
        if quantitizing:
            model_params["quantization_config"] = self.quantization_config

        self.model = transformers.AutoModelForCausalLM.from_pretrained(**model_params)

    def init_tokenizer(self):
        """
        Initializes the tokenizer for the language model.
        """
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            self.model_name,
            token=self.hf_token
        )

    def init_pipeline(self):
        """
        Initializes the text generation pipeline.

        Raises:
            RuntimeError: If the model or the tokenizer has not been initialized yet.
        """
        # Fail early: handing `model=None` to the pipeline factory would not use
        # the model configured on this instance.
        if self.model is None or self.tokenizer is None:
            raise RuntimeError(
                "Model and tokenizer must be initialized before the pipeline; "
                "call init_model() and init_tokenizer() or create_text_generator()."
            )

        self.pipeline = transformers.pipeline(
            model=self.model,
            task='text-generation',
            tokenizer=self.tokenizer
        )

    def create_text_generator(self, quantitizing=True):
        """
        Sets up the entire text generation environment by calling init_quantitizing(),
        init_model(), init_tokenizer(), and init_pipeline().

        Args:
            quantitizing (bool, optional): Whether to load the model quantized. Defaults to True.
        """
        if quantitizing:
            self.init_quantitizing()

        self.init_model(quantitizing)
        self.init_tokenizer()
        self.init_pipeline()

    def generate_response(self, message):
        """
        Generates a text response given an input message.

        Args:
            message (str): Input message for text generation.

        Returns:
            str: The "generated_text" field of the first pipeline result.

        Raises:
            RuntimeError: If the pipeline has not been initialized yet.
        """
        if self.pipeline is None:
            raise RuntimeError(
                "Text generation pipeline is not initialized; "
                "call create_text_generator() first."
            )

        res = self.pipeline(message)
        return res[0]["generated_text"]


In [4]:
from getpass import getpass

model_id = 'meta-llama/Llama-2-7b-chat-hf'

# Never store a real access token in the notebook: read it from the HF_TOKEN
# environment variable, falling back to an interactive prompt that is not echoed.
hf_auth = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

llm = LLMModel(model_id, hf_auth)

# Builds the quantization config, then loads the model, tokenizer and pipeline.
llm.create_text_generator()



config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [5]:
import logging
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain

class ChainModel:
    """
    A class representing a conversational retrieval chain using Hugging Face models.

    Documents are loaded from web pages, split into chunks, embedded and stored in
    a FAISS vector store; questions are then answered by a
    ConversationalRetrievalChain built on top of the supplied pipeline.
    """

    # Defaults used when indexing documents.
    EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
    CHUNK_SIZE = 1000
    CHUNK_OVERLAP = 20

    def __init__(self, model_pipeline, max_history_turns=1):
        """
        Class constructor.

        Args:
            model_pipeline: The HuggingFace pipeline for language processing.
            max_history_turns (int or None, optional): How many past
                (question, answer) pairs are kept and passed to the chain as chat
                history. Defaults to 1 (only the most recent exchange); None keeps
                the full history.
        """
        self.pipeline = model_pipeline
        self.max_history_turns = max_history_turns
        self.chat_history = []
        self.vectorstore = None
        # Populated by init_chain(); defined here so they always exist.
        self.llm = None
        self.chain = None

    def init_web_loader(self, web_links):
        """
        Initialize the web loader and vector store.

        Args:
            web_links (list): List of web links to load documents from.

        Raises:
            ValueError: If no text chunks could be produced from the given links.
        """
        # Load the pages
        documents = WebBaseLoader(web_links).load()

        # Split in chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.CHUNK_SIZE, chunk_overlap=self.CHUNK_OVERLAP
        )
        all_splits = text_splitter.split_documents(documents)
        if not all_splits:
            raise ValueError(f"No text could be extracted from: {web_links!r}")

        # Create embeddings on the GPU when one is present, otherwise on the CPU
        # (a hard-coded "cuda" device fails on CPU-only runtimes).
        embedding_device = "cuda" if cuda.is_available() else "cpu"
        embeddings = HuggingFaceEmbeddings(
            model_name=self.EMBEDDING_MODEL_NAME,
            model_kwargs={"device": embedding_device},
        )

        # Store embeddings in the vector store
        self.vectorstore = FAISS.from_documents(all_splits, embeddings)

    def init_chain(self, loader, data_source_list):
        """
        Initialize the conversational retrieval chain.

        Args:
            loader (str): The type of loader, e.g., "web".
            data_source_list: List of data sources specific to the loader.

        Raises:
            ValueError: If `loader` is not supported and no vector store exists yet.
        """
        if loader == "web":
            logging.info('Initializing web loader...')
            self.init_web_loader(data_source_list)
        elif self.vectorstore is None:
            # Without a vector store there is nothing to retrieve from.
            raise ValueError(
                f"Unsupported loader {loader!r}: only 'web' is implemented "
                "and no vector store has been built yet."
            )

        self.llm = HuggingFacePipeline(pipeline=self.pipeline)
        self.chain = ConversationalRetrievalChain.from_llm(
            self.llm, self.vectorstore.as_retriever(), return_source_documents=True
        )

    def generate_response(self, question):
        """
        Generate a response to the input question using the conversational retrieval chain.

        The (question, answer) pair is recorded in `chat_history`, which is trimmed
        to the last `max_history_turns` entries.

        Args:
            question (str): The input question.

        Returns:
            str: The generated response.

        Raises:
            RuntimeError: If init_chain() has not been called yet.
        """
        if self.chain is None:
            raise RuntimeError("Chain is not initialized; call init_chain() first.")

        result = self.chain({"question": question, "chat_history": self.chat_history})
        answer = result["answer"]

        history = list(self.chat_history) + [(question, answer)]
        if self.max_history_turns is not None:
            keep = max(self.max_history_turns, 0)
            # `history[-0:]` would keep everything, so handle zero explicitly.
            history = history[-keep:] if keep else []
        self.chat_history = history

        return answer


In [47]:
# Wrap the LLM's text-generation pipeline, then index one article and build
# the retrieval chain on top of it.
chain = ChainModel(model_pipeline=llm.pipeline)
chain.init_chain("web", ["https://realpython.com/chromadb-vector-database/"])

In [48]:
# First question; it is answered through the retrieval chain built over the indexed page.
chain.generate_response("What is ChromaDB")

" ChromaDB is a vector database designed for semantic search and embedding-based queries. It stores documents as vectors in a high-dimensional space and provides a simple API for creating and querying collections of documents.\n\nIn this case, you'll use the ChromaDB client to create a collection of car reviews and add them to the database. Then, you can query the collection using the embeddings and semantic search functionality provided by ChromaDB."

In [49]:
# Second question; the chain also receives the previous (question, answer) pair as chat history.
chain.generate_response("What is retrieval-augmented generation?")

'  In the context of ChromaDB, retrieval-augmented generation refers to the process of using a vector database to generate new data by combining and transforming existing vectors. This can be useful for tasks such as text summarization, image synthesis, and language translation.\n\nIn the context of ChromaDB, retrieval-augmented generation can be achieved through various methods, including:\n\n1. Vector concatenation: Combining vectors from different sources to create new vectors that capture the desired characteristics.\n2. Vector transformation: Transforming vectors from one domain to another, such as from text to image, or from one language to another.\n3. Vector generation: Generating new vectors from scratch, rather than combining or transforming existing vectors.\n\nBy leveraging retrieval-augmented generation in ChromaDB, users can create new data that is not present in the original collection, but is derived from the existing vectors. This can be particularly useful for tasks t