<a href="https://colab.research.google.com/github/NirDiamant/LLM-tasks/blob/main/RAG_FAISS_Harry_Potter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install dependencies

In [None]:
# Install necessary libraries
!pip install -qU langchain accelerate bitsandbytes transformers sentence-transformers faiss-gpu

## Import libraries

In [2]:
# Import required libraries and modules
import os
from glob import glob
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from langchain.document_loaders import TextLoader, PyPDFLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import LLMChain
from langchain.schema.runnable import RunnablePassthrough
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA

In [None]:
import locale


def getpreferredencoding(do_setlocale=True):
    """Always report UTF-8 as the preferred encoding; ``do_setlocale`` is ignored."""
    return "UTF-8"


# Replace the locale module's lookup so every later caller is told "UTF-8".
# NOTE(review): presumably a workaround for Colab shell commands failing on a
# non-UTF-8 locale — confirm it is still required.
locale.getpreferredencoding = getpreferredencoding

In [None]:
!pip3 install pypdf

## Initialize Model and Tokenizer with BitsAndBytes Configuration

In [None]:
# Hugging Face model id, defined once so the model and its tokenizer always match
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.1"

# Configure BitsAndBytes to load the weights in 4-bit NF4 with float16 compute
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=False,
)

# Load the model with the above quantization configuration
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    # NOTE(review): do_sample is a generation option rather than a loading
    # option; presumably it is forwarded to the model's config — confirm.
    do_sample=True,
)

# Initialize the tokenizer; reuse the EOS token for padding and pad on the right
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

## Set up Text Generation Pipeline

In [62]:
# Set up the text generation pipeline around the quantized model
text_generation_pipeline = transformers.pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    # Ask for greedy (deterministic) decoding explicitly. A near-zero
    # temperature only approximates this, is ignored unless sampling is
    # enabled, and divides the logits by a tiny number, which can overflow
    # in half precision.
    do_sample=False,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=2000,
)

# Wrap the pipeline so LangChain can use it as an LLM
mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

## Prepare the Prompt Template (answers must come only from the given context)

In [66]:
# Prompt template sent to the model. `{context}` receives the retrieved text
# and `{question}` the user's question. The instructions restrict the model to
# the supplied context and define the fallback sentence for unanswerable questions.
prompt_template = """
Instruction: Answer the following question based only on the provided context:
{context}

If the answer is contained in the context, print "Answer:", and provide the answer from the context.
Also print "reference:" and show me from which part of the context you retrieved this answer.
If the answer does not appear in the context, answer: "The answer isn't in the data you supplied"

Question:
{question}
"""

# Wrap the raw template text in a PromptTemplate that declares its two placeholders
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

## Create LLM Chain

In [67]:
# Combine the prompt template and the Mistral LLM into a single chain
llm_chain = LLMChain(llm=mistral_llm, prompt=prompt)

## Load and Process PDF Documents

In [None]:
# Location of the book PDF to index
path = "/content/Harry Potter - Book 1 - The Sorcerers Stone.pdf"

In [None]:
# Read the PDF at `path` and split it into a list of documents
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader(path)
pages = loader.load_and_split()

In [None]:
# Deep-copy the pages so the cleaning below does not modify the objects in `pages`
import copy
cleaned_pages = copy.deepcopy(pages)

# Clean the page_content of each page in the cleaned_pages list
for page in cleaned_pages:
    page.page_content = page.page_content.replace('\t', ' ')  # Replace tab characters with spaces

# cleaned_pages now holds cleaned copies; the objects in `pages` are left unchanged

## Index Documents with FAISS

In [None]:
# Embed the cleaned documents and build a FAISS index over them
db = FAISS.from_documents(
    documents=cleaned_pages,
    embedding=HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"),
)

# Expose the FAISS index through the retriever interface
retriever = db.as_retriever()

## Save the retriever to disk (or to the cloud)

In [8]:
import pickle
def save_object(obj, filename):
    """Pickle ``obj`` into ``filename`` using the highest available protocol.

    Any existing file at ``filename`` is overwritten.
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)
def load_object(filename):
    """Open ``filename`` in binary mode and return the object unpickled from it.

    Unpickling can execute arbitrary code, so only load files from a trusted source.
    """
    with open(filename, mode='rb') as source:
        return pickle.load(source)

In [None]:
# Pickle the retriever to a local file
save_object(retriever, 'hp_retriever.pkl')

In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab)
from google.colab import drive
drive.mount('/content/drive')

In [10]:
# Source: the pickled retriever file to copy
# (presumably the file written by save_object above, with /content as the working directory — confirm)
source_path = '/content/hp_retriever.pkl'

# Destination: target file inside the mounted Google Drive
destination_path = '/content/drive/My Drive/rag/retriever_file_hp'  # Update this path to your own Drive folder

In [None]:
# Copy the pickled retriever to Google Drive
import shutil

# shutil.copy does not create missing folders, so make sure the destination
# folder exists first; otherwise the copy fails with FileNotFoundError.
destination_dir = os.path.dirname(destination_path)
if destination_dir:
    os.makedirs(destination_dir, exist_ok=True)

shutil.copy(source_path, destination_path)

print(f"File copied to Google Drive successfully: {destination_path}")

In [11]:
# Reload the retriever from the copy on Google Drive.
# Unpickling can execute arbitrary code: only load files you created or trust.
loaded_retriever= load_object(destination_path)

## Ask your question about the data

In [74]:
# Question to ask about the indexed book
# (presumably chosen because the book cannot answer it, to exercise the fallback reply — confirm)
question = "who is obama?"

## Retrieve relevant pages from the book

In [75]:
# Ask the reloaded retriever for the documents it considers relevant to the question
docs = loaded_retriever.get_relevant_documents(question)

## Concatenate the relevant content into a single context string

In [76]:
# Join the text of the retrieved documents into one string, separated by single spaces
context = " ".join(doc.page_content for doc in docs)

In [None]:
# Display the assembled context for inspection
context

## Execute chain and show response

In [None]:
# Run the chain with the retrieved context and the question
result = llm_chain.run(context=context, question=question)

In [None]:
# Print the chain's response as plain text
print(result)