In [None]:
!pip install -q -U langchain langchain-huggingface langchain_community chromadb faiss-cpu transformers accelerate bitsandbytes langchain_core bs4 pymupdf

In [2]:
import sys
import pypdf
from langchain_community.document_loaders import WebBaseLoader,PyMuPDFLoader # Data Ingestion
import bs4 # Beautiful Soup for webscraping
from langchain.text_splitter import RecursiveCharacterTextSplitter #Document split and create chunks
from langchain_huggingface import HuggingFaceEmbeddings # Convert Doc into Vectors
from langchain.vectorstores import Chroma # Vector Database to store vectors / docs
from transformers import AutoTokenizer, AutoModelForCausalLM,BitsAndBytesConfig,pipeline # To load model and tokenizer
import torch
from langchain_huggingface import HuggingFacePipeline # To Create Huggingface pipeline with langchain to create LLM Model
from langchain.chains import RetrievalQA # To make Vector DB as Retriever
from langchain_core.prompts import ChatPromptTemplate,PromptTemplate # To write prompt and template
from langchain.chains.combine_documents import create_stuff_documents_chain # To combine LLM and Prompt and create chain
from langchain.chains import create_retrieval_chain #To combine retriever and document chain for inferencing
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Authenticate with the Hugging Face Hub through the interactive notebook login widget.
# NOTE(review): presumably required to download the model loaded later — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Data ingestion: read the paper's PDF into LangChain documents
pdf_loader = PyMuPDFLoader(
    '/kaggle/input/attention-research-paper/NIPS-2017-attention-is-all-you-need-Paper.pdf'
)
text_document = pdf_loader.load()

In [5]:
# Split documents into chunks.
# The embedding model used in this notebook (all-mpnet-base-v2) truncates input
# longer than 384 word pieces, so 3000-character chunks would be only partially
# embedded. Roughly 1000 characters keeps a whole chunk within that limit and
# also leaves more room in the LLM context for the retrieved chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
documents = text_splitter.split_documents(text_document)

In [6]:
# Create the embedding function that converts document chunks into vectors.
# The model is referenced by its Hugging Face Hub id.
embedding_model_name= "sentence-transformers/all-mpnet-base-v2"
embeddings= HuggingFaceEmbeddings(model_name=embedding_model_name)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
# Create the vector DB and store the document chunks with their embeddings.
# Chroma.from_documents adds to whatever is already persisted on disk, so
# re-running this cell would store every chunk again and the retriever would
# start returning duplicates. Drop any previously persisted collection first
# so the cell is idempotent.
persist_directory = 'chroma_db'
Chroma(persist_directory=persist_directory, embedding_function=embeddings).delete_collection()
db = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    persist_directory=persist_directory,
)

In [9]:
# Load the tokenizer and the model with 4-bit NF4 quantization (float16 compute dtype)
model_id = "Shorya22/LLaMA-2-7B"

bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type='nf4',
    load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map='auto',
    quantization_config=bnb_config,
)

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:  73%|#######3  | 3.62G/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

In [10]:
# Wrap the loaded model and tokenizer in a transformers text-generation pipeline.
# Sampling is enabled, so generations vary from run to run.
pipe = pipeline(
    task='text-generation',
    tokenizer=tokenizer,
    model=model,
    do_sample=True,
    temperature=0.3,
    max_new_tokens=512,
)

In [11]:
# Create the LangChain LLM by wrapping the transformers pipeline in HuggingFacePipeline
llm= HuggingFacePipeline(pipeline=pipe)

# RAG Without Prompt Template and Chain:

In [12]:
# Expose the vector DB as a retriever and build a RetrievalQA chain that queries the LLM through it
retriever = db.as_retriever()
qa = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    verbose=True,
)

In [13]:
# Inferencing
# Chain.run() is deprecated in LangChain; invoke() is the supported entry point.
# RetrievalQA reads the question from the 'query' key and returns the answer text
# under 'result', so `result` remains a plain string for the following cell.
result = qa.invoke({'query': 'What is attention?'})['result']



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [14]:
# Show only the text after the last 'Helpful Answer:' marker (the whole string if the marker is absent)
print('Answer:', result.rpartition('Helpful Answer:')[2])

Answer:  Multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions. Scaled dot-product attention computes the compatibility function using a feed-forward network with a single hidden layer.

Response: Multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions. Scaled dot-product attention computes the compatibility function using a feed-forward network with a single hidden layer.

Question: What is the dimensionality of the input and output of


# RAG Pipeline + Prompt Template + LLM Chain:

In [15]:
# Prompt for the retrieval chain: {context} receives the retrieved documents, {input} the user question
template = """
Provide answer in bullet Points.
Always end the answer with "Thanks for asking!".

Context: {context}\n\n\n

Question: {input}

Response:
"""
prompt = PromptTemplate(
    input_variables=['context', 'input'],
    template=template,
)

In [16]:
# Create the document chain, which combines the LLM with the prompt template
document_chain= create_stuff_documents_chain(llm=llm,prompt=prompt)

In [17]:
# Create the retriever, then build the retrieval chain by merging it with the LLM document chain
retriever = db.as_retriever()
retrieval_chain = create_retrieval_chain(
    combine_docs_chain=document_chain,
    retriever=retriever,
)

In [18]:
# Run the retrieval chain; the question is passed under the 'input' key
input_question = "What is attention?"
result = retrieval_chain.invoke(dict(input=input_question))

In [19]:
# Print the last segment of the answer text when it is split on a triple newline
answer_segments = result['answer'].split('\n\n\n')
print(answer_segments[-1])



Question: What is attention?

Response:
Thanks for asking! Attention is a mechanism that allows a model to focus on certain parts of the input when making predictions. In the context of the Transformer model, attention is used to compute a weighted sum of the input tokens, where the weights are learned during training. The attention mechanism is applied multiple times in parallel, allowing the model to jointly attend to information from different representation subspaces at different positions.

In the Transformer model, attention is used in the encoder and decoder, and it is applied in different ways in each of these layers. In the encoder, attention is used to compute a weighted sum of the input tokens, where the weights are learned during training. In the decoder, attention is used to compute a weighted sum of the output tokens, where the weights are learned during training.

Attention is a key component of the Transformer model, as it allows the model to efficiently process long 