<a href="https://colab.research.google.com/github/Uditsingh7/Rag-Projects/blob/main/Udit_Rag_Project_GitHub_Issues.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Building** a RAG pipeline using an open-source LLM, an embeddings model, and LangChain.

In [None]:
## Install the model-side dependencies: PyTorch, Transformers, Accelerate,
## bitsandbytes (needed for the 4-bit quantized model load further down) and
## sentence-transformers. The -q flag keeps pip's output quiet.

!pip install -q torch transformers accelerate bitsandbytes sentence-transformers

In [None]:
## Print the CUDA compiler version available in this runtime.
## NOTE(review): presumably checked because the FAISS package installed next
## (faiss-gpu-cu12) is a CUDA 12 build -- confirm the versions match.
!nvcc --version

In [None]:
## Install the GPU build of FAISS, which is used later as the vector store.
!pip install faiss-gpu-cu12

In [None]:
## Install LangChain; -q runs pip in quiet mode.
!pip install -q langchain

In [None]:
## Install the LangChain community integrations package.
!pip install -q langchain_community

In [None]:
## NOTE(review): sentence-transformers is already part of the first install
## cell, so this repeat only matters if cells are run out of order.
!pip install -q sentence-transformers

In [None]:
from getpass import getpass

## getpass reads the token without echoing it, so the secret does not end up
## in the notebook output.
_TOKEN_PROMPT = "Please enter your GitHub Personal Access Token: "
Access_Token = getpass(prompt=_TOKEN_PROMPT)


In [None]:
from os import access  # NOTE(review): unused in this cell; looks like an accidental auto-import
## Load all of the issues in the huggingface/peft repo.

## GitHubIssuesLoader is provided by the langchain_community package; the old
## `langchain.document_loaders` path is a deprecated alias that newer
## LangChain releases no longer ship.
from langchain_community.document_loaders import GitHubIssuesLoader

## By default, pull requests are treated as issues as well;
## include_prs=False leaves them out of the data.

## state="all" loads both open and closed issues.

repo = "huggingface/peft"
loader = GitHubIssuesLoader(
    repo=repo,
    access_token=Access_Token,
    include_prs=False,
    state="all",
)

In [None]:
## Run the loader and keep the returned documents in `docs`.
docs = loader.load()

In [None]:
## Display the first loaded document as a quick sanity check.
docs[0]

In [None]:
## Keeping some overlap between chunks preserves some semantic context across chunk boundaries.

## The recommended splitter for generic text is the RecursiveCharacterTextSplitter.

## Newer LangChain releases ship the splitters in the standalone
## langchain_text_splitters package; fall back to the legacy location so this
## cell also runs on older installs.
try:
    from langchain_text_splitters import RecursiveCharacterTextSplitter
except ImportError:
    from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=30,
)

## Split every loaded document into overlapping chunks.
chnked_docs = splitter.split_documents(docs)

## Show the first chunk as a sanity check.
chnked_docs[0]

In [None]:
## Create the embedding model used to vectorise the chunks.

## Document chunk embeddings come from HuggingFaceEmbeddings with the BAAI/bge-base-en-v1.5 model.

## Imported from langchain_community: the old `langchain.embeddings` path is a
## deprecated alias that newer LangChain releases no longer provide for this class.
from langchain_community.embeddings import HuggingFaceEmbeddings

model_name = "BAAI/bge-base-en-v1.5"
embeddings = HuggingFaceEmbeddings(model_name=model_name)

embeddings

In [None]:
## To create the vector database, we'll use FAISS, a library developed by Facebook AI.
## It offers efficient similarity search and clustering of dense vectors, which is what we need here.
## FAISS is currently one of the most used libraries for nearest-neighbour search in massive datasets.

## Imported from langchain_community: the old `langchain.vectorstores` path is
## a deprecated alias that newer LangChain releases no longer ship.
from langchain_community.vectorstores import FAISS

## Embed every chunk and index the resulting vectors.
db = FAISS.from_documents(chnked_docs, embeddings)

db


In [None]:
## Given an unstructured query we need to get back the matching documents;
## the vector store exposes that through its as_retriever method.

## Similarity search that asks for k=4 results per query.
retriever_settings = {"search_type": "similarity", "search_kwargs": {"k": 4}}
retriever = db.as_retriever(**retriever_settings)
retriever

In [None]:
## The vector database and retriever are now set up.
## Next, we need to set up the remaining piece of the chain: the model.


In [None]:
## Load the quantized model.
## HuggingFaceH4/zephyr-7b-beta is the checkpoint used here.

## The weights are loaded through a 4-bit bitsandbytes configuration.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = "HuggingFaceH4/zephyr-7b-beta"

## 4-bit NF4 quantization with double quantization enabled; the compute dtype
## is bfloat16.
quantization_options = dict(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
bnb_config = BitsAndBytesConfig(**quantization_options)

## The tokenizer and the model are loaded independently from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

model


In [None]:
## We have all the pieces needed to set up the LLM chain:
## the docs are loaded, split into chunks, embedded and stored in the vector db,
## a retriever is defined, and the quantized model and tokenizer are loaded.


## Set up the LLM chain

## First create a text-generation pipeline from the loaded model and its tokenizer.

from transformers import pipeline
## Imported from langchain_community: the old `langchain.llms` path is a
## deprecated alias that newer LangChain releases no longer ship.
from langchain_community.llms import HuggingFacePipeline

text_generation_pipeline = pipeline(
    model=model,  # the loaded causal language model
    tokenizer=tokenizer,  # tokenizer that matches the model checkpoint
    task='text-generation',  # generate a continuation of the given prompt
    temperature=0.2,  # low temperature keeps the output focused on high-probability tokens
    do_sample=True,  # sample tokens instead of always taking the most probable one
    repetition_penalty=1.1,  # slightly discourage repeated tokens
    return_full_text=True,  # return the prompt together with the generated text
    max_new_tokens=512,  # upper bound on the number of newly generated tokens
)

## Wrap the transformers pipeline so it can be used as a LangChain LLM.
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)
llm


In [None]:
## Next, create a prompt template. It should follow the chat format of the model, so if you
## substitute the model checkpoint, make sure to use the appropriate formatting.

## PromptTemplate is imported from langchain_core (already a dependency of this
## cell); the old `langchain.prompts` path is a deprecated alias that newer
## LangChain releases no longer ship.
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

## StrOutputParser takes the output of the language model and returns it as a plain string.

prompt_template = """
<|system|>
Answer the question based on your knowledge. Use the following context to help:
{context}
</s>
<|user|>
{question}
</s>
<|assistant|>
 """

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template
)

## Form the LLM chain from the prompt, the LLM and the output parser.
## `|` is the pipe operator of the LangChain Expression Language (LCEL):
## it composes the steps sequentially, so the output of the step on the left
## is passed automatically as the input of the step on its right.
llm_chain = prompt | llm | StrOutputParser()

llm_chain





In [None]:
from langchain_core.runnables import RunnablePassthrough


def _format_docs(retrieved_docs):
    """Join the text of the retrieved documents into a single context string."""
    return "\n\n".join(doc.page_content for doc in retrieved_docs)


## The retriever returns a list of Document objects. Piping it through
## _format_docs puts only the chunk text into the prompt's {context} slot;
## without this step the prompt receives the raw repr of the list (metadata,
## escaped newlines and all).
## The incoming question is passed through unchanged to the {question} slot.
rag_chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | llm_chain
)

rag_chain

In [None]:
## Sample question used to compare the plain LLM chain with the RAG chain.
question = "How do you combine multiple adapters?"

In [None]:
## Baseline: ask the model on its own, with an empty context, to see what it
## answers without any retrieved documents.

no_context_inputs = {"context": "", "question": question}
llm_chain.invoke(no_context_inputs)

In [None]:
## Ask the same question through the RAG chain, which fills the context from the retriever.
rag_chain.invoke(question)