In [1]:
! nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-9c389d99-8496-9668-b17d-e5fd9179bb75)


In [2]:
%%time

from IPython.display import clear_output

! pip install sentence_transformers==2.2.2

! pip install -qq -U langchain-community
! pip install -U langchain-huggingface
! pip install -qq -U tiktoken
! pip install -qq -U pypdf
! pip install -qq -U faiss-gpu
! pip install -qq -U InstructorEmbedding

! pip install -qq -U transformers
! pip install -qq -U accelerate
! pip install -qq -U bitsandbytes

clear_output()

CPU times: user 443 ms, sys: 70.1 ms, total: 513 ms
Wall time: 59 s


In [3]:
%%time

import warnings
warnings.filterwarnings("ignore")

import os
import glob
import textwrap
import time

import langchain

### loaders
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

### splits
from langchain.text_splitter import RecursiveCharacterTextSplitter

### prompts
from langchain import PromptTemplate, LLMChain

### vector stores
from langchain.vectorstores import FAISS

### models
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceInstructEmbeddings

### retrievers
from langchain.chains import RetrievalQA

import torch
import transformers
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)

clear_output()

CPU times: user 8.16 s, sys: 977 ms, total: 9.13 s
Wall time: 15.5 s


In [4]:
sorted(glob.glob('/content/anatomy_vol_*'))

['/content/anatomy_vol_1.pdf',
 '/content/anatomy_vol_2.pdf',
 '/content/anatomy_vol_3.pdf']

In [30]:
class CFG:
    # LLMs
    model_name = 'llama2-13b-chat' # wizardlm, llama2-7b-chat, llama2-13b-chat, mistral-7B
    temperature = 0
    top_p = 0.95
    repetition_penalty = 1.15

    # splitting
    split_chunk_size = 800
    split_overlap = 0

    # embeddings
    embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'

    # similar passages
    k = 6

    # paths
    PDFs_path = '/content/'
    Embeddings_path =  '/content/faiss-hp-sentence-transformers'
    Output_folder = './rag-vectordb'

In [6]:
def get_model(model = CFG.model_name):

    print('\nDownloading model: ', model, '\n\n')

    if model == 'wizardlm':
        model_repo = 'TheBloke/wizardLM-7B-HF'

        tokenizer = AutoTokenizer.from_pretrained(model_repo)

        bnb_config = BitsAndBytesConfig(
            load_in_4bit = True,
            bnb_4bit_quant_type = "nf4",
            bnb_4bit_compute_dtype = torch.float16,
            bnb_4bit_use_double_quant = True,
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_repo,
            quantization_config = bnb_config,
            device_map = 'auto',
            low_cpu_mem_usage = True
        )

        max_len = 1024

    elif model == 'llama2-7b-chat':
        model_repo = 'daryl149/llama-2-7b-chat-hf'

        tokenizer = AutoTokenizer.from_pretrained(model_repo, use_fast=True)

        bnb_config = BitsAndBytesConfig(
            load_in_4bit = True,
            bnb_4bit_quant_type = "nf4",
            bnb_4bit_compute_dtype = torch.float16,
            bnb_4bit_use_double_quant = True,
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_repo,
            quantization_config = bnb_config,
            device_map = 'auto',
            low_cpu_mem_usage = True,
            trust_remote_code = True
        )

        max_len = 2048

    elif model == 'llama2-13b-chat':
        model_repo = 'daryl149/llama-2-13b-chat-hf'

        tokenizer = AutoTokenizer.from_pretrained(model_repo, use_fast=True)

        bnb_config = BitsAndBytesConfig(
            load_in_4bit = True,
            bnb_4bit_quant_type = "nf4",
            bnb_4bit_compute_dtype = torch.float16,
            bnb_4bit_use_double_quant = True,
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_repo,
            quantization_config = bnb_config,

            low_cpu_mem_usage = True,
            trust_remote_code = True
        )

        max_len = 2048  #8192
        truncation=True,  # Explicitly enable truncation
        padding="max_len"  # Optional: pad to max_length

    elif model == 'mistral-7B':
        model_repo = 'mistralai/Mistral-7B-v0.1'

        tokenizer = AutoTokenizer.from_pretrained(model_repo)

        bnb_config = BitsAndBytesConfig(
            load_in_4bit = True,
            bnb_4bit_quant_type = "nf4",
            bnb_4bit_compute_dtype = torch.float16,
            bnb_4bit_use_double_quant = True,
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_repo,
            quantization_config = bnb_config,
            device_map = 'auto',
            low_cpu_mem_usage = True,
        )

        max_len = 1024

    else:
        print("Not implemented model (tokenizer and backbone)")

    return tokenizer, model, max_len

In [7]:
print(torch.cuda.is_available())
print(torch.cuda.device_count())

True
1


In [8]:
%%time

tokenizer, model, max_len = get_model(model = CFG.model_name)

clear_output()

CPU times: user 51.5 s, sys: 58.8 s, total: 1min 50s
Wall time: 14min 29s


In [9]:
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 5120, padding_idx=0)
    (layers): ModuleList(
      (0-39): 40 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (o_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear4bit(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((5120,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNor

In [10]:
### check how Accelerate split the model across the available devices (GPUs)
model.hf_device_map

{'': 0}

In [11]:
### hugging face pipeline
pipe = pipeline(
    task = "text-generation",
    model = model,
    tokenizer = tokenizer,
    pad_token_id = tokenizer.eos_token_id,
#     do_sample = True,
    max_length = max_len,
    temperature = CFG.temperature,
    top_p = CFG.top_p,
    repetition_penalty = CFG.repetition_penalty
)

### langchain pipeline
llm = HuggingFacePipeline(pipeline = pipe)

  llm = HuggingFacePipeline(pipeline = pipe)


In [12]:
llm

HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7cca5035afe0>)

In [13]:
query = "what are the structural organization of a human body"
llm.invoke(query)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


"what are the structural organization of a human body?\n\nThe human body is made up of several organ systems, each with its own specific functions and structures. Here is an overview of the major structural organizations of the human body:\n\n1. Skeletal System: The skeletal system provides support and structure for the body, allowing for movement and protection of internal organs. It consists of bones, cartilage, and connective tissue.\n2. Muscular System: The muscular system allows for movement and maintains posture. There are three types of muscle tissue in the human body: skeletal muscle, smooth muscle, and cardiac muscle.\n3. Nervous System: The nervous system regulates the body's functions, such as movement, sensation, and thought processes. It consists of two main parts: the central nervous system (brain and spinal cord) and the peripheral nervous system (nerves).\n4. Circulatory System: The circulatory system transports oxygen and nutrients to cells throughout the body and remo

Langchain

In [14]:
CFG.model_name

'llama2-13b-chat'

Loader

In [33]:
%%time

loader = DirectoryLoader(
    CFG.PDFs_path,
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
    show_progress=True,
    use_multithreading=True
)

documents = loader.load()

100%|██████████| 3/3 [00:42<00:00, 14.15s/it]

CPU times: user 41.2 s, sys: 520 ms, total: 41.7 s
Wall time: 42.5 s





In [34]:
print(f'We have {len(documents)} pages in total')

We have 1433 pages in total


In [35]:
documents[8].page_content

"35()$&(\n:HOFRPH WR +XPDQ$QDWRP\\DQG3K\\VLRORJ\\ \x0f DQ 2SHQ6WD[ &ROOHJH UHVRXUFH\x11 :H FUHDWHG WKLV WH[WERRN ZLWK VHYHUDO JRDO V\nKHOSLQJ VWXGHQWVUHDFKKLJKOHYHOVRIDFDGHPLFVFKRODUVKLS\x11\n,QVWUXFWRUV DQG VWXGHQWV DOLNH ZLOO ILQG WKDW WKLV WH[WERRN RIIHUV D WKRUR XJK LQWURGXFWLRQ WR WKH FRQWHQW LQ DQ DFFHVVLEOH\nIRUPDW\x11\n$ERXW\x032SHQ6WD[\x03&ROOHJH\n2SHQ6WD[ &ROOHJH LV D QRQSURILW RUJDQL]DWLRQ FRPPLWWHG WR LPSURYLQJ VWXG HQW DFFHVV WR TXDOLW\\ OHDUQLQJ PDWHULDOV\x11 2XU\nIUHH WH[WERRNV DUH GHYHORSHG DQG SHHU\x10UHYLHZHG E\\ HGXFDWRUV WR HQVXUH WKD W WKH\\ DUH UHDGDEOH\x0f DFFXUDWH\x0f DQG RUJDQL]HG LQ\nV FROOHJH FR XUVHV\x11 8QOLNH WUDGLWLRQDO WH[WERRNV\x0f 2SHQ6WD[\n&ROOHJH UHVRXUFHV OLYH RQOLQH DQG DUH RZQHG E\\ WKH FRPPXQLW\\ RI HGXFDWRUV X VLQJ WKHP\x11 7KURXJK SDUWQHUVKLSV ZLWK\nFRPSDQLHVDQGIRXQGDWLRQVFRPPLWWHGWRUHGXFLQJFRVWVIRUVWXGHQWV\x0fZHDU HZRUNLQJWRLPSURYHDFFHVVWRKLJKHUHGXFDWLRQ\nIRU DOO\x11 2SHQ6WD[ &ROOHJH LV DQ LQLWLDWLYH RI 5LFH 8QLYHUVLW\\ DQG LV PDGH SR VVLEOH W

Splitter

In [36]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = CFG.split_chunk_size,
    chunk_overlap = CFG.split_overlap
)

texts = text_splitter.split_documents(documents)

print(f'We have created {len(texts)} chunks from {len(documents)} pages')

We have created 6238 chunks from 1433 pages


Create Embeddings

In [37]:
%%time
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

vectordb = FAISS.from_documents(
    texts,
    HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
)

### persist vector database
vectordb.save_local(f"{CFG.Output_folder}/faiss_index_rag") # save in output folder
#     vectordb.save_local(f"{CFG.Embeddings_path}/faiss_index_hp") # save in input folder

clear_output()

CPU times: user 2min 36s, sys: 1.01 s, total: 2min 37s
Wall time: 2min 45s


In [38]:
### test if vector DB was loaded correctly
vectordb.similarity_search('BERT')

[Document(metadata={'source': '/content/anatomy_vol_1.pdf', 'page': 471}, page_content='EHWD\x03FHOO\x0f \x1a\x14\x1b\x0f\x1a\x15\x1a\n%HW]\x03FHOOV\x0f \x18\x1c\x18\x0f\x19\x13\x13\nEL\x0f\x17\x15\x16\x0f\x17\x18\x19\nELD[LDO\x03MRLQW\x0f \x16\x16\x15\x0f\x16\x19\x19\nELFHSV\x03EUDFKLL\x0f \x17\x17\x16\x0f\x17\x18\x19\nELFHSV\x03IHPRULV\x0f \x17\x18\x15\x0f\x17\x18\x19\nELFLSLWDO\x03JURRYH\x0f \x15\x1c\x15\x0f\x16\x14\x19\nELFXVSLG\x03YDOYH\x0f \x1a\x1c\x13\x0f\x1b\x15\x1b\n%LOH\x0f\x14\x13\x18\x1b\nELOH\x0f\x14\x13\x1a\x13\nELOH\x03FDQDOLFXOXV\x0f \x14\x13\x18\x1a\x0f\x14\x13\x1a\x13\nELOH\x03VDOWV\x0f \x14\x13\x1c\x1b\x0f\x14\x14\x14\x1c\nELOLUXELQ\x0f \x1a\x17\x1c\x0f\x1a\x19\x1b\x0f\x14\x13\x1a\x13\n%LOLUXELQ\x0f \x14\x13\x18\x1b\nELOLYHUGLQ\x0f \x1a\x17\x1c\x0f\x1a\x19\x1b\nELQRFXODU\x03GHSWK\x03FXHV\x0f \x18\x1b\x1c\x0f\x19\x13\x13\nELRJHQLF\x03DPLQH\x0f \x17\x1c\x1c\x0f\x18\x13\x16\nELRV\\QWKHVLV\x03UHDFWLRQV\x0f \x14\x13\x1b\x15\x0f\n\x14\x14\x14\x1cELSHQQDWH\x0f \x17\x14\x1c\

Prompt Template

In [40]:
prompt_template = """
Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

{context}

Question: {question}
Answer:"""


PROMPT = PromptTemplate(
    template = prompt_template,
    input_variables = ["context", "question"]
)

Retriever chain

In [41]:
retriever = vectordb.as_retriever(search_kwargs = {"k": CFG.k, "search_type" : "similarity"})

qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff", # map_reduce, map_rerank, stuff, refine
    retriever = retriever,
    chain_type_kwargs = {"prompt": PROMPT},
    return_source_documents = True,
    verbose = False
)

In [43]:
question = "what are the structural organization of a human body"
vectordb.max_marginal_relevance_search(question, k = CFG.k)

[Document(metadata={'source': '/content/anatomy_vol_3.pdf', 'page': 2}, page_content='Table of Contents\nPreface  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . 7\nUnit 1: Levels of Organization\nChapter 1: An Introduction to the Human Body  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . 15\n1.1 Overview of Anatomy and Physiology .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . 16\n1.2 Structural Organization of the Human Body .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . 17\n1.3 Functions of Human Life  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . 21'),
 Document(metadata={'source': '/content/anatomy_vol_2.pdf', 'page': 324}, page_content='structure is a living, sophisticated muscle. As you read this chapter, try to keep these twin concepts in mind

In [47]:
### testing similarity search
question = "what are the structural organization of a human body"
vectordb.similarity_search(question, k = CFG.k)

[Document(metadata={'source': '/content/anatomy_vol_3.pdf', 'page': 2}, page_content='Table of Contents\nPreface  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . 7\nUnit 1: Levels of Organization\nChapter 1: An Introduction to the Human Body  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . 15\n1.1 Overview of Anatomy and Physiology .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . 16\n1.2 Structural Organization of the Human Body .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . 17\n1.3 Functions of Human Life  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . 21'),
 Document(metadata={'source': '/content/anatomy_vol_2.pdf', 'page': 9}, page_content='of the body. They also focus particularly on how the body’s regions, important chemicals, and cells maintain hom

Post-process outputs

In [45]:
def wrap_text_preserve_newlines(text, width=700):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text


def process_llm_response(llm_response):
    ans = wrap_text_preserve_newlines(llm_response['result'])

    sources_used = ' \n'.join(
        [
            source.metadata['source'].split('/')[-1][:-4]
            + ' - page: '
            + str(source.metadata['page'])
            for source in llm_response['source_documents']
        ]
    )

    ans = ans + '\n\nSources: \n' + sources_used
    return ans

In [46]:
def llm_ans(query):
    start = time.time()

    llm_response = qa_chain.invoke(query)
    ans = process_llm_response(llm_response)

    end = time.time()

    time_elapsed = int(round(end - start, 0))
    time_elapsed_str = f'\n\nTime elapsed: {time_elapsed} s'
    return ans + time_elapsed_str

In [48]:

query =question = "what are the structural organization of a human body"
print(llm_ans(query))


Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

Table of Contents
Preface  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . 7
Unit 1: Levels of Organization
Chapter 1: An Introduction to the Human Body  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . 15
1.1 Overview of Anatomy and Physiology .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . 16
1.2 Structural Organization of the Human Body .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . 17
1.3 Functions of Human Life  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . 21

of the body. They also focus particularly on how the body’s regions, important che

Gradio Chat UI (Inspired from HinePo)

In [50]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [51]:
! pip install --upgrade gradio -qq
clear_output()

In [52]:
import gradio as gr
print(gr.__version__)

4.42.0


In [53]:
 def predict(message, history):
     # output = message # debug mode

     output = str(llm_ans(message)).replace("\n", "<br/>")
     return output

 demo = gr.ChatInterface(
     predict,
     title = f' Open-Source LLM ({CFG.model_name})  Question Answering'
 )

 demo.queue()
 demo.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://b139e899ab46dc6b19.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


