In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable as local paths.
from google.colab import drive
drive.mount('/content/drive')
# Switch the working directory to the notebook folder on Drive: the PDF loaded
# later is referenced by a bare file name, so it is resolved relative to this directory.
%cd /content/drive/My Drive/Colab Notebooks

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Colab Notebooks


In [None]:
# !pip install langchain
!pip install langchain==0.1.6
!pip install langchain-cli==0.0.21
!pip install langchain-openai==0.0.6
!pip install huggingface_hub==0.21.4
!pip install python-dotenv==1.0.0
!pip install pydantic==1.10.13
!pip install pypdf
!pip install sentence-transformers
!pip install transformers
# !pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install accelerate
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install faiss-gpu

Looking in indexes: https://pypi.org/simple/


In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from transformers import AutoTokenizer, AutoModelForCausalLM, GemmaForCausalLM
from transformers import GemmaConfig, GemmaModel
from langchain_community.llms import HuggingFaceEndpoint

# Next, we provide a knowledge base to the LLM. Here the knowledge base is a single PDF file of roughly 700 pages, and the bot we are going to develop should answer queries using only this PDF.

In [None]:
# Read the PDF knowledge base using LangChain's PyPDFLoader.
# The file name is relative, so it is resolved against the current working directory.
file_name = "Hands on Machine Learning with Scikit Learn and Tensorflow.pdf"
loader = PyPDFLoader(file_name)
pdf_content = loader.load()

# Show a short summary instead of printing every loaded document: the book is
# ~700 pages long, and dumping all of it bloats the notebook and hides the narrative.
print(f"Loaded {len(pdf_content)} documents from {file_name!r}")
if pdf_content:
    print(pdf_content[0].page_content[:500])



In [None]:
# Split the loaded documents into chunks of at most 1000 characters, with
# 10 characters of overlap between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 10
)
text_chunks = splitter.split_documents(pdf_content)

# Keep only the raw text of every chunk; this list is what gets embedded later.
text_data = [data.page_content for data in text_chunks]

# Print a summary and a short preview rather than every chunk, which would
# flood the cell output with the text of the whole book.
print(f"Split {len(pdf_content)} documents into {len(text_data)} chunks")
if text_data:
    print(text_data[0][:500])



In [None]:
class Retriever:
    """Sentence-embedding retriever used for retrieval-augmented generation.

    Builds a FAISS vector store over the given texts and, for a query,
    returns the ``num_retrieved_docs`` most similar entries.
    """

    def __init__(self, text, num_retrieved_docs=5):
        """Embed ``text`` and index it.

        Args:
            text: The texts to index (passed straight to ``FAISS.from_texts``).
            num_retrieved_docs: How many documents a search should return (``k``).
        """
        # Embed every text with the all-MiniLM-L6-v2 sentence-embedding model
        # and store the vectors in a FAISS index.
        embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        self.db = FAISS.from_texts(text, embedding_model)

        # Wrap the index in a retriever that is limited to the top-k hits.
        top_k_settings = {"k": num_retrieved_docs}
        self.retriever = self.db.as_retriever(search_kwargs=top_k_settings)

    def search(self, query):
        """Return the documents the underlying retriever finds for ``query``."""
        return self.retriever.get_relevant_documents(query)

In [None]:
import os
from google.colab import userdata

# Fetch the value stored under 'HF_TOKEN' via google.colab.userdata and export it
# to this process as HUGGINGFACEHUB_API_TOKEN, so the token never appears in the notebook.
os.environ.update({"HUGGINGFACEHUB_API_TOKEN": userdata.get('HF_TOKEN')})

In [None]:
class Assistant:
    """Gemma 2b based assistant that replies given the retrieved documents.

    The tokenizer and model are loaded once in ``__init__``. ``reply`` builds a
    prompt from the query and the retrieved context, generates a continuation
    and returns only the newly generated text (without the echoed prompt).
    """

    def __init__(self, model_name="google/gemma-2b-it"):
        """Load the tokenizer and the causal language model.

        Args:
            model_name: Hugging Face model id or local path. Defaults to the
                instruction-tuned Gemma 2b checkpoint this notebook was written for.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # device_map="auto" is the GPU-enabled loading path. For a CPU-only
        # runtime the model can be loaded without the device_map argument.
        self.Gemma = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

    @staticmethod
    def _format_context(retrieved_info):
        """Turn the retrieved context into plain text for the prompt.

        Accepts ``None``, a string, a single object, or an iterable of objects.
        Objects that expose ``page_content`` contribute that text; anything else
        is converted with ``str``. Entries are separated by a blank line.
        """
        if retrieved_info is None:
            return ""
        if isinstance(retrieved_info, str):
            return retrieved_info
        if hasattr(retrieved_info, "page_content"):
            # A single document object; do not iterate over it.
            return str(retrieved_info.page_content)
        try:
            items = list(retrieved_info)
        except TypeError:
            items = [retrieved_info]
        return "\n\n".join(str(getattr(item, "page_content", item)) for item in items)

    def create_prompt(self, query, retrieved_info):
        """Build the instruction prompt for ``query`` and the retrieved context.

        Args:
            query: The user's question.
            retrieved_info: Retrieved context; see ``_format_context`` for the
                accepted forms.

        Returns:
            The prompt string handed to the model.
        """
        # Use the text of the retrieved documents rather than their repr, which
        # would fill the prompt with "Document(page_content='...')" wrappers
        # and escaped whitespace.
        context = self._format_context(retrieved_info)
        # Instruction to reply to the query given the retrieved information.
        prompt = f"""You need either to explain the concept or answer the question about Machine Learning.
        Be detailed, use simple words and examples in your explanations. If required, utilize the relevant information.
        If you doesn't know the answer, say that "Sorry, I don't know the answer" and add found relevant informations to it.
        Question: {query}
        Relevant information: {context}
        Output:
        """
        return prompt

    def reply(self, query, retrieved_info):
        """Generate an answer to ``query`` grounded in ``retrieved_info``.

        Args:
            query: The user's question.
            retrieved_info: Retrieved context passed on to ``create_prompt``.

        Returns:
            The generated answer as a string, with the prompt tokens removed
            and surrounding whitespace stripped.
        """
        prompt = self.create_prompt(query, retrieved_info)
        # Move the inputs to wherever the model lives instead of hard-coding
        # "cuda", which fails on a runtime without a GPU.
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.Gemma.device)

        # Generate text with a focus on factual responses.
        # - Only max_new_tokens is passed: also passing max_length triggers a
        #   warning and is ignored anyway.
        # - temperature only has an effect when sampling is enabled, hence
        #   do_sample=True; raise the temperature for more varied output.
        output_ids = self.Gemma.generate(
            **inputs,
            max_new_tokens=500,  # let answers be not that long
            do_sample=True,
            temperature=0.1,
        )

        # The generated sequence starts with the prompt tokens; decode only
        # what comes after them so the answer does not echo the prompt.
        prompt_length = inputs["input_ids"].shape[-1]
        answer = self.tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)
        return answer.strip()

In [None]:
# Create the assistant once: Assistant.__init__ loads the tokenizer and the model,
# and generate_reply looks this global up by name.
bot = Assistant()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Embed and index all text chunks once; generate_reply looks this global up by name.
retriever = Retriever(text_data)

In [None]:
def generate_reply(query):
    """Answer ``query`` using chunks retrieved from the knowledge base.

    Looks up the most similar chunks with the module-level ``retriever`` and
    hands their text to the module-level ``bot``.

    Args:
        query: The user's question as a string.

    Returns:
        Whatever ``bot.reply`` returns for the query and the retrieved text.

    Raises:
        ValueError: If ``query`` is empty or contains only whitespace.
    """
    if not query or not query.strip():
        raise ValueError("query must be a non-empty string")

    related_docs = retriever.search(query)
    # Pass the chunk text rather than the document objects themselves, so the
    # prompt does not contain their repr with escaped tabs and newlines.
    context = "\n\n".join(
        str(getattr(doc, "page_content", doc)) for doc in related_docs
    )
    return bot.reply(query, context)

In [None]:
# Smoke test: run one question through retrieval and generation, then show the answer.
reply = generate_reply("What is supervised learning ")
print(reply)

Both `max_new_tokens` (=500) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


You need either to explain the concept or answer the question about Machine Learning.
        Be detailed, use simple words and examples in your explanations. If required, utilize the relevant information.
        If you doesn't know the answer, say that "Sorry, I don't know the answer" and add found relevant informations to it.
        Question: What is supervised learning 
        Relevant information: [Document(page_content='Supervised/Unsupervised\tLearning\nMachine\t\nLearning\tsystems\tcan\tbe\tclassified\taccording\tto\tthe\tamount\tand\ttype\tof\tsupervision\tthey\tget\nduring\ttraining.\tThere\tare\tfour\tmajor\tcategories:\tsupervised\tlearning,\tunsupervised\tlearning,\nsemisupervised\tlearning,\tand\tReinforcement\tLearning.\nSupervised\tlearning\nIn\t\nsupervised\tlearning\n,\tthe\ttraining\tdata\tyou\tfeed\tto\tthe\talgorithm\tincludes\tthe\tdesired\tsolutions,\t\ncalled\nlabels\n\t(\nFigure\t1-5\n).\nFigure\t1-5.\t\nA\tlabeled\ttraining\tset\tfor\tsupervised\tlearning\t(