In [1]:
! nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-263cdb87-e7a3-3f88-89ec-2b7b5911f651)
GPU 1: Tesla T4 (UUID: GPU-6a0c2544-b3be-dcd7-14fc-42992e91ec69)


In [3]:
%%time

from IPython.display import clear_output

! pip install sentence_transformers==2.2.2

! pip install -qq -U langchain
! pip install -qq -U tiktoken
! pip install -qq -U pypdf
! pip install -qq -U faiss-gpu
! pip install -qq -U InstructorEmbedding 

! pip install -qq -U transformers 
! pip install -qq -U accelerate
! pip install -qq -U bitsandbytes

clear_output()

CPU times: user 1.48 s, sys: 396 ms, total: 1.87 s
Wall time: 1min 53s


In [6]:
%%time

import warnings
warnings.filterwarnings("ignore")

import os
import glob
import textwrap
import time

import langchain

### loaders
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

### splits
from langchain.text_splitter import RecursiveCharacterTextSplitter

### prompts
from langchain import PromptTemplate, LLMChain

### vector stores
from langchain.vectorstores import FAISS

### models
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceInstructEmbeddings

### retrievers
from langchain.chains import RetrievalQA

import torch
import transformers
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)

clear_output()

CPU times: user 2.29 ms, sys: 1.08 ms, total: 3.37 ms
Wall time: 2.58 ms


In [7]:
print('langchain:', langchain.__version__)
print('torch:', torch.__version__)
print('transformers:', transformers.__version__)

langchain: 0.3.2
torch: 2.4.0
transformers: 4.45.1


In [8]:
sorted(glob.glob('/kaggle/input/ncert-pdfs/ncert/*'))

['/kaggle/input/ncert-pdfs/ncert/NCERT-Class-10-Science.pdf',
 '/kaggle/input/ncert-pdfs/ncert/class 9 science.pdf',
 '/kaggle/input/ncert-pdfs/ncert/science class 8.pdf']

In [21]:
class CFG:
    # LLMs
    model_name = 'llama2-13b-chat' # wizardlm, llama2-7b-chat, llama2-13b-chat, mistral-7B
    temperature = 0
    top_p = 0.95
    repetition_penalty = 1.15    

    # splitting
    split_chunk_size = 800
    split_overlap = 0
    
    # embeddings
    embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'    

    # similar passages
    k = 6
    
    # paths
    PDFs_path = '/kaggle/input/ncert-pdfs/ncert/'
    Embeddings_path =  '/kaggle/input/faiss-hp-sentence-transformers'
    Output_folder = './harry-potter-vectordb'

In [10]:
def get_model(model = CFG.model_name):

    print('\nDownloading model: ', model, '\n\n')

    if model == 'wizardlm':
        model_repo = 'TheBloke/wizardLM-7B-HF'
        
        tokenizer = AutoTokenizer.from_pretrained(model_repo)
        
        bnb_config = BitsAndBytesConfig(
            load_in_4bit = True,
            bnb_4bit_quant_type = "nf4",
            bnb_4bit_compute_dtype = torch.float16,
            bnb_4bit_use_double_quant = True,
        )        

        model = AutoModelForCausalLM.from_pretrained(
            model_repo,
            quantization_config = bnb_config,
            device_map = 'auto',
            low_cpu_mem_usage = True
        )
        
        max_len = 1024

    elif model == 'llama2-7b-chat':
        model_repo = 'daryl149/llama-2-7b-chat-hf'
        
        tokenizer = AutoTokenizer.from_pretrained(model_repo, use_fast=True)
        
        bnb_config = BitsAndBytesConfig(
            load_in_4bit = True,
            bnb_4bit_quant_type = "nf4",
            bnb_4bit_compute_dtype = torch.float16,
            bnb_4bit_use_double_quant = True,
        )
        
        model = AutoModelForCausalLM.from_pretrained(
            model_repo,
            quantization_config = bnb_config,
            device_map = 'auto',
            low_cpu_mem_usage = True,
            trust_remote_code = True
        )
        
        max_len = 2048

    elif model == 'llama2-13b-chat':
        model_repo = 'daryl149/llama-2-13b-chat-hf'
        
        tokenizer = AutoTokenizer.from_pretrained(model_repo, use_fast=True)
        
        bnb_config = BitsAndBytesConfig(
            load_in_4bit = True,
            bnb_4bit_quant_type = "nf4",
            bnb_4bit_compute_dtype = torch.float16,
            bnb_4bit_use_double_quant = True,
        )
                
        model = AutoModelForCausalLM.from_pretrained(
            model_repo,
            quantization_config = bnb_config,       
            device_map = 'auto',
            low_cpu_mem_usage = True,
            trust_remote_code = True
        )
        
        max_len = 2048 # 8192

    elif model == 'mistral-7B':
        model_repo = 'mistralai/Mistral-7B-v0.1'
        
        tokenizer = AutoTokenizer.from_pretrained(model_repo)
        
        bnb_config = BitsAndBytesConfig(
            load_in_4bit = True,
            bnb_4bit_quant_type = "nf4",
            bnb_4bit_compute_dtype = torch.float16,
            bnb_4bit_use_double_quant = True,
        )        

        model = AutoModelForCausalLM.from_pretrained(
            model_repo,
            quantization_config = bnb_config,
            device_map = 'auto',
            low_cpu_mem_usage = True,
        )
        
        max_len = 1024

    else:
        print("Not implemented model (tokenizer and backbone)")

    return tokenizer, model, max_len

In [13]:

tokenizer, model, max_len = get_model(model = CFG.model_name)

clear_output()


Downloading model:  llama2-13b-chat 




Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]


KeyboardInterrupt



In [14]:
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 5120, padding_idx=0)
    (layers): ModuleList(
      (0-39): 40 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (o_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear4bit(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((5120,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNor

In [15]:
### check how Accelerate split the model across the available devices (GPUs)
model.hf_device_map

{'model.embed_tokens': 0,
 'model.layers.0': 0,
 'model.layers.1': 0,
 'model.layers.2': 0,
 'model.layers.3': 0,
 'model.layers.4': 0,
 'model.layers.5': 0,
 'model.layers.6': 0,
 'model.layers.7': 0,
 'model.layers.8': 0,
 'model.layers.9': 0,
 'model.layers.10': 0,
 'model.layers.11': 0,
 'model.layers.12': 0,
 'model.layers.13': 0,
 'model.layers.14': 0,
 'model.layers.15': 0,
 'model.layers.16': 0,
 'model.layers.17': 1,
 'model.layers.18': 1,
 'model.layers.19': 1,
 'model.layers.20': 1,
 'model.layers.21': 1,
 'model.layers.22': 1,
 'model.layers.23': 1,
 'model.layers.24': 1,
 'model.layers.25': 1,
 'model.layers.26': 1,
 'model.layers.27': 1,
 'model.layers.28': 1,
 'model.layers.29': 1,
 'model.layers.30': 1,
 'model.layers.31': 1,
 'model.layers.32': 1,
 'model.layers.33': 1,
 'model.layers.34': 1,
 'model.layers.35': 1,
 'model.layers.36': 1,
 'model.layers.37': 1,
 'model.layers.38': 1,
 'model.layers.39': 1,
 'model.norm': 1,
 'model.rotary_emb': 1,
 'lm_head': 1}

In [16]:
### hugging face pipeline
pipe = pipeline(
    task = "text-generation",
    model = model,
    tokenizer = tokenizer,
    pad_token_id = tokenizer.eos_token_id,
#     do_sample = True,
    max_length = max_len,
    temperature = CFG.temperature,
    top_p = CFG.top_p,
    repetition_penalty = CFG.repetition_penalty
)

### langchain pipeline
llm = HuggingFacePipeline(pipeline = pipe)

In [17]:
llm

HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7f68f0756d40>)

In [18]:
%%time
### testing model, not using the harry potter books yet
### answer is not necessarily related to harry potter
query = "Give me 5 examples of cool potions and explain what they do"
llm.invoke(query)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


CPU times: user 49.2 s, sys: 254 ms, total: 49.5 s
Wall time: 50.5 s


"Give me 5 examples of cool potions and explain what they do.\n\nSure, here are five examples of cool potions that you might find in a fantasy story:\n\n1. The Elixir of Eternal Youth: This potion grants the drinker eternal youth, keeping them looking and feeling like they did in their prime for as long as they live. However, it also comes with a catch - the drinker must consume the elixir every year on their birthday, or else they will begin to age rapidly and lose all of the benefits of the potion.\n2. The Potion of Healing: This potion can cure any wound or illness, no matter how severe. It works by drawing out the poison or injury from the drinker's body and replacing it with pure, healing energy. However, the potion can only be used once per day, and it leaves the drinker feeling drained and weak afterward.\n3. The Draught of Dreams: This potion allows the drinker to enter into a deep, lucid dream state, where they can explore their subconscious mind and unlock hidden secrets abou

In [19]:
CFG.model_name

'llama2-13b-chat'

In [22]:
%%time

loader = DirectoryLoader(
    CFG.PDFs_path,
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
    show_progress=True,
    use_multithreading=True
)

documents = loader.load()

100%|██████████| 3/3 [06:28<00:00, 129.50s/it]

CPU times: user 6min 28s, sys: 2.52 s, total: 6min 30s
Wall time: 6min 28s





In [23]:
print(f'We have {len(documents)} pages in total')

We have 772 pages in total


In [24]:
documents[8].page_content

'To\nThe Head\nDepartment of Education in\nScience and Mathematics,\nNCER T, Sri Aur obindo Mar g,\nNew Delhi 110016The team of Paheli and Boojho will be with you as you journey through this\ntextbook. They love to ask questions. All kinds of questions come to their\nminds and they collect them in their sacks. Sometimes, they may share some\nof the questions with you, as you read through the chapters.\nPaheli and Boojho are also on the lookout for answers to many\nquestions — sometimes the questions seem answered after they discuss them\nwith each other , sometimes thr ough discussions with other classmates, teachers\nor their parents. Answers to some questions do not seem available even after\nall these. They might need to experiment on their own, read books in the library,\nsend questions to scientists. Just dig and dig and dig into all possibilities and\nsee if the questions can be answered. Perhaps, they would carry some of the\nunanswered questions in their sacks to higher classes

In [25]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = CFG.split_chunk_size,
    chunk_overlap = CFG.split_overlap
)

texts = text_splitter.split_documents(documents)

print(f'We have created {len(texts)} chunks from {len(documents)} pages')

We have created 2872 chunks from 772 pages


In [26]:
%%time

### we create the embeddings only if they do not exist yet
if not os.path.exists(CFG.Embeddings_path + '/index.faiss'):

    ### download embeddings model
    embeddings = HuggingFaceInstructEmbeddings(
        model_name = CFG.embeddings_model_repo,
        model_kwargs = {"device": "cuda"}
    )

    ### create embeddings and DB
    vectordb = FAISS.from_documents(
        documents = texts, 
        embedding = embeddings
    )

    ### persist vector database
    vectordb.save_local(f"{CFG.Output_folder}/faiss_index_hp") # save in output folder
#     vectordb.save_local(f"{CFG.Embeddings_path}/faiss_index_hp") # save in input folder

.gitattributes:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/90.4M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512
CPU times: user 12.7 s, sys: 576 ms, total: 13.3 s
Wall time: 17.8 s


In [29]:
%%time

### download embeddings model
embeddings = HuggingFaceInstructEmbeddings(
    model_name = CFG.embeddings_model_repo,
    model_kwargs = {"device": "cuda"}
)

### load vector DB embeddings
vectordb = FAISS.load_local(
    #CFG.Embeddings_path, # from input folder
    CFG.Output_folder + '/faiss_index_hp', # from output folder
    embeddings,
    allow_dangerous_deserialization=True
)

clear_output()

CPU times: user 96.9 ms, sys: 13 ms, total: 110 ms
Wall time: 114 ms


In [31]:
### test if vector DB was loaded correctly
vectordb.similarity_search('balanced diet')

[Document(metadata={'source': '/kaggle/input/ncert-pdfs/ncert/science class 8.pdf', 'page': 133}, page_content='REACHING  THE AGE OF ADOLESCENCE 121adolescent has to be carefully planned.\nYou have alr eady lear nt what a balanced\ndiet is. Recall that a balanced diet means\nthat the meals include proteins,\ncarbohydrates, fats and vitamins in\nrequisite proportions. Our Indian meal\nof roti\n/rice, dal  (pulses) and vegetables\nis a balanced meal. Milk is a balanced\nfood in itself. Fruits also provide\nnourishment. For infants, mother’s milk\nprovides all the nourishment that they\nneed.\nIron builds blood and iron-rich food\nsuch as leafy vegetables, jaggery, meat,\ncitrus, Indian gooseberry (amla)  are good\nfor adolescents.\nCheck items for lunch and dinner in\nyour meal. Is the meal balanced and\nnutritious? Does it include cereals\nwhich give energy and milk, meat, nuts\nand pulses which provide proteins for'),
 Document(metadata={'source': '/kaggle/input/ncert-pdfs/ncert/scienc

In [32]:
prompt_template = """
Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

{context}

Question: {question}
Answer:"""


PROMPT = PromptTemplate(
    template = prompt_template, 
    input_variables = ["context", "question"]
)

In [34]:
retriever = vectordb.as_retriever(search_kwargs = {"k": CFG.k, "search_type" : "similarity"})

qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff", # map_reduce, map_rerank, stuff, refine
    retriever = retriever, 
    chain_type_kwargs = {"prompt": PROMPT},
    return_source_documents = True,
    verbose = False
)

In [35]:
### testing MMR search
question = "What is a balanced diet?"
vectordb.max_marginal_relevance_search(question, k = CFG.k)

[Document(metadata={'source': '/kaggle/input/ncert-pdfs/ncert/science class 8.pdf', 'page': 132}, page_content='and development. Hence the diet for an\n2018-19'),
 Document(metadata={'source': '/kaggle/input/ncert-pdfs/ncert/science class 8.pdf', 'page': 133}, page_content='REACHING  THE AGE OF ADOLESCENCE 121adolescent has to be carefully planned.\nYou have alr eady lear nt what a balanced\ndiet is. Recall that a balanced diet means\nthat the meals include proteins,\ncarbohydrates, fats and vitamins in\nrequisite proportions. Our Indian meal\nof roti\n/rice, dal  (pulses) and vegetables\nis a balanced meal. Milk is a balanced\nfood in itself. Fruits also provide\nnourishment. For infants, mother’s milk\nprovides all the nourishment that they\nneed.\nIron builds blood and iron-rich food\nsuch as leafy vegetables, jaggery, meat,\ncitrus, Indian gooseberry (amla)  are good\nfor adolescents.\nCheck items for lunch and dinner in\nyour meal. Is the meal balanced and\nnutritious? Does it inc

In [36]:
### testing similarity search
question = "what is a balanced diet?"
vectordb.similarity_search(question, k = CFG.k)

[Document(metadata={'source': '/kaggle/input/ncert-pdfs/ncert/science class 8.pdf', 'page': 132}, page_content='and development. Hence the diet for an\n2018-19'),
 Document(metadata={'source': '/kaggle/input/ncert-pdfs/ncert/science class 8.pdf', 'page': 133}, page_content='REACHING  THE AGE OF ADOLESCENCE 121adolescent has to be carefully planned.\nYou have alr eady lear nt what a balanced\ndiet is. Recall that a balanced diet means\nthat the meals include proteins,\ncarbohydrates, fats and vitamins in\nrequisite proportions. Our Indian meal\nof roti\n/rice, dal  (pulses) and vegetables\nis a balanced meal. Milk is a balanced\nfood in itself. Fruits also provide\nnourishment. For infants, mother’s milk\nprovides all the nourishment that they\nneed.\nIron builds blood and iron-rich food\nsuch as leafy vegetables, jaggery, meat,\ncitrus, Indian gooseberry (amla)  are good\nfor adolescents.\nCheck items for lunch and dinner in\nyour meal. Is the meal balanced and\nnutritious? Does it inc

In [37]:
def wrap_text_preserve_newlines(text, width=700):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text


def process_llm_response(llm_response):
    ans = wrap_text_preserve_newlines(llm_response['result'])
    
    sources_used = ' \n'.join(
        [
            source.metadata['source'].split('/')[-1][:-4]
            + ' - page: '
            + str(source.metadata['page'])
            for source in llm_response['source_documents']
        ]
    )
    
    ans = ans + '\n\nSources: \n' + sources_used
    return ans

In [38]:
def llm_ans(query):
    start = time.time()
    
    llm_response = qa_chain.invoke(query)
    ans = process_llm_response(llm_response)
    
    end = time.time()

    time_elapsed = int(round(end - start, 0))
    time_elapsed_str = f'\n\nTime elapsed: {time_elapsed} s'
    return ans + time_elapsed_str

In [39]:
CFG.model_name

'llama2-13b-chat'

In [40]:
query = "What is a balanced diet?"
print(llm_ans(query))


Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

and development. Hence the diet for an
2018-19

REACHING  THE AGE OF ADOLESCENCE 121adolescent has to be carefully planned.
You have alr eady lear nt what a balanced
diet is. Recall that a balanced diet means
that the meals include proteins,
carbohydrates, fats and vitamins in
requisite proportions. Our Indian meal
of roti
/rice, dal  (pulses) and vegetables
is a balanced meal. Milk is a balanced
food in itself. Fruits also provide
nourishment. For infants, mother’s milk
provides all the nourishment that they
need.
Iron builds blood and iron-rich food
such as leafy vegetables, jaggery, meat,
citrus, Indian gooseberry (amla)  are good
for adolescents.
Check items for lunch and dinner in
your meal. Is the meal balanced and
nutritious? Does it include cereals
which give energy and m