In [None]:
# @title Installing Relevant Libraries
# Only sentence_transformers is pinned; every other package is upgraded to its
# latest release (-U), so the environment can differ from one run to the next.
# NOTE(review): the 2.2.2 pin is presumably required by InstructorEmbedding — confirm.
! pip install sentence_transformers==2.2.2

# PDF loading, text splitting, vector store and embedding wrappers
! pip install -qq -U langchain
! pip install -qq -U tiktoken
! pip install -qq -U pypdf
! pip install -qq -U faiss-gpu
! pip install -qq -U InstructorEmbedding

# LLM loading and 4-bit quantization
! pip install -qq -U transformers
! pip install -qq -U accelerate
! pip install -qq -U bitsandbytes


In [None]:
# @title Importing Libraries
import os
import glob
import textwrap
import time

import langchain

### loaders
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

### splits
from langchain.text_splitter import RecursiveCharacterTextSplitter

### prompts
from langchain import PromptTemplate, LLMChain

### vector stores
from langchain.vectorstores import FAISS

### models
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceInstructEmbeddings

### retrievers
from langchain.chains import RetrievalQA

import torch
import transformers
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)

import requests

In [None]:
# List the working directory (shows whether the PDF is already present)
!ls

ConceptsofBiology.pdf


In [None]:
# @title Downloading the PDF File
# Source document for the RAG index; `filename` is reused by the PDF loader cell.
url = "https://assets.openstax.org/oscms-prodcms/media/documents/ConceptsofBiology-WEB.pdf?_gl=1*q17i04*_ga*NTM1NzU5NTk1LjE3MTM2MzYzNTY.*_ga_T746F8B0QC*MTcxMzYzNjM1Ni4xLjEuMTcxMzYzNjU4Mi42MC4wLjA."
filename = "ConceptsofBiology.pdf"

if os.path.exists(filename):
    # Re-running the notebook should not fetch the whole book again.
    print(f"{filename} already exists, skipping download")
else:
    # stream=True keeps the PDF out of memory; timeout prevents the cell from
    # hanging forever on a stalled connection (network errors still raise).
    with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code == 200:
            # Write to a temporary name first so an interrupted download never
            # leaves a truncated file under the final name.
            partial_path = filename + ".part"
            with open(partial_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        file.write(chunk)
            os.replace(partial_path, filename)
            print(f"The file has been downloaded and saved as {filename}")
        else:
            print(f"Failed to download the file. Status code: {response.status_code}")


The file has been downloaded and saved as ConceptsofBiology.pdf


In [None]:
# @title Creating Class with multiple models for organized experiments
class CFG:
    """Single place for every tunable value used by the notebook."""

    # ---- language model ----
    # one of: wizardlm, llama2-7b-chat, llama2-13b-chat, mistral-7B
    model_name = "llama2-13b-chat"
    # generation settings forwarded to the text-generation pipeline
    temperature = 0.2
    top_p = 0.95
    repetition_penalty = 1.15

    # ---- text splitting ----
    split_chunk_size = 512   # passed as chunk_size to the text splitter
    split_overlap = 50       # passed as chunk_overlap to the text splitter

    # ---- embeddings ----
    embeddings_model_repo = "sentence-transformers/all-MiniLM-L6-v2"

    # ---- retrieval ----
    k = 6                    # number of passages fetched per question

    # ---- file locations ----
    PDFs_path = "ConceptsofBiology.pdf"
    # NOTE(review): presumably a directory holding a pre-built FAISS index — confirm
    Embeddings_path = "/faiss-sentence-transformers"
    # parent folder under which the FAISS index is saved
    Output_folder = "./vectordb"

In [None]:
# @title Define Model
### Per-model settings. Everything not listed here (4-bit quantization config,
### device_map, low_cpu_mem_usage) is identical for all supported models.
_MODEL_SPECS = {
    'wizardlm': {
        'repo': 'TheBloke/wizardLM-7B-HF',
        'tokenizer_kwargs': {},
        'model_kwargs': {},
        'max_len': 1024,
    },
    'llama2-7b-chat': {
        'repo': 'daryl149/llama-2-7b-chat-hf',
        'tokenizer_kwargs': {'use_fast': True},
        'model_kwargs': {'trust_remote_code': True},
        'max_len': 2048,
    },
    'llama2-13b-chat': {
        'repo': 'daryl149/llama-2-13b-chat-hf',
        'tokenizer_kwargs': {'use_fast': True},
        'model_kwargs': {'trust_remote_code': True},
        'max_len': 2048, # 8192
    },
    'mistral-7B': {
        'repo': 'mistralai/Mistral-7B-v0.1',
        'tokenizer_kwargs': {},
        'model_kwargs': {},
        'max_len': 1024,
    },
}


def get_model(model = CFG.model_name):
    """Load the tokenizer and 4-bit quantized causal LM for a supported model name.

    Args:
        model: Key of `_MODEL_SPECS` ('wizardlm', 'llama2-7b-chat',
            'llama2-13b-chat' or 'mistral-7B'). The default is the value
            `CFG.model_name` had when this function was defined.

    Returns:
        Tuple `(tokenizer, model, max_len)`, where `max_len` is the length
        budget configured for that model (used as the pipeline's `max_length`).

    Raises:
        ValueError: If `model` is not a supported name. Previously this case
            printed a message and then failed with UnboundLocalError at `return`.
    """
    if model not in _MODEL_SPECS:
        raise ValueError(
            f"Not implemented model (tokenizer and backbone): {model!r}. "
            f"Supported models: {', '.join(_MODEL_SPECS)}"
        )
    spec = _MODEL_SPECS[model]

    print('\nDownloading model: ', model, '\n\n')

    tokenizer = AutoTokenizer.from_pretrained(spec['repo'], **spec['tokenizer_kwargs'])

    ### NF4 4-bit weights with double quantization, fp16 compute
    bnb_config = BitsAndBytesConfig(
        load_in_4bit = True,
        bnb_4bit_quant_type = "nf4",
        bnb_4bit_compute_dtype = torch.float16,
        bnb_4bit_use_double_quant = True,
    )

    ### separate local name so the `model` argument (a string) is not shadowed
    loaded_model = AutoModelForCausalLM.from_pretrained(
        spec['repo'],
        quantization_config = bnb_config,
        device_map = 'auto',
        low_cpu_mem_usage = True,
        **spec['model_kwargs']
    )

    return tokenizer, loaded_model, spec['max_len']

In [None]:
%%time

# Load the tokenizer, the quantized model and its length budget for the configured LLM
tokenizer, model, max_len = get_model(model = CFG.model_name)


Downloading model:  llama2-13b-chat 




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

pytorch_model-00001-of-00003.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

pytorch_model-00002-of-00003.bin:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

pytorch_model-00003-of-00003.bin:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

CPU times: user 34.9 s, sys: 27.2 s, total: 1min 2s
Wall time: 2min 41s


In [None]:
# Switch the model to evaluation mode; the cell output shows the module tree
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 5120, padding_idx=0)
    (layers): ModuleList(
      (0-39): 40 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (o_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear4bit(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )


In [None]:
# @title Building Hugging face Pipeline
### hugging face pipeline
pipe = pipeline(
    task = "text-generation",
    model = model,
    tokenizer = tokenizer,
    pad_token_id = tokenizer.eos_token_id,
    ### return only the newly generated text; without this every answer starts
    ### with a verbatim copy of the (long, context-stuffed) prompt
    return_full_text = False,
    ### NOTE(review): sampling is left disabled as before. With do_sample off,
    ### temperature/top_p are presumably ignored by generate() — confirm, then
    ### either enable sampling or drop those two settings.
#     do_sample = True,
    max_length = max_len,
    temperature = CFG.temperature,
    top_p = CFG.top_p,
    repetition_penalty = CFG.repetition_penalty
)

### langchain pipeline
llm = HuggingFacePipeline(pipeline = pipe)

In [None]:
%%time
### Smoke test of the bare LLM: no retrieval, so the Biology book is not used yet
### and the answer is not necessarily related to Biology.
query = "what are The building block of the molecules"
llm.invoke(query)



CPU times: user 18.1 s, sys: 0 ns, total: 18.1 s
Wall time: 18.1 s


'what are The building block of the molecules that make up living organisms?\nWhat is the process by which cells produce energy for the body?\nWhat is the scientific term for the study of the structure, behavior, and evolution of the universe?\nWhat is the term for a group of organisms that can interbreed to produce fertile offspring?\nWhat is the term for the variety of species found in an ecosystem?\nWhat is the term for the movement of water through a plant, from the roots to the leaves?\nWhat is the term for the process by which plants convert sunlight into chemical energy?\nWhat is the term for the process by which animals obtain food by hunting and consuming other organisms?\nWhat is the term for the process by which organisms adapt to their environment over time?\nWhat is the term for the process by which genetic information is passed from one generation to the next?'

In [None]:
# @title Extracting Text from PDF
%%time

# Parse the PDF saved by the download cell (`filename`).
loader = PyPDFLoader(filename)

# Judging by the page count printed in the next cell, this yields one entry per PDF page.
documents = loader.load()

CPU times: user 15.7 s, sys: 31.2 ms, total: 15.7 s
Wall time: 15.7 s


In [None]:
# Sanity check: how many entries the loader produced
print(f'We have {len(documents)} pages in total')

We have 615 pages in total


In [None]:
# Peek at the raw extracted text of one page (index 18 is the start of Chapter 1 in this PDF)
documents[18].page_content

'INTR ODUCT IONCHAP TER 1\nIntroduction t o Biolog y\n1.1Themes and Conc epts of Biolog y\n1.2The P rocess of Scienc e\nViewed fr om spac e, Ear th (Figure 1.1 ) offers f ew clues about the div ersity o f life\nforms that r eside ther e. The firs t forms o f life on Ear th ar e thought t o ha ve been micr oorganisms\nthat e xisted for bil lions o f years bef ore plants and animals appear ed. The mammals , birds, and\nflowers so familiar t o us ar e all relativ ely recent, originating 130 t o 200 mil lion y ears ag o. Humans\nhave inhabit ed this planet f or onl y the las t 2.5 mil lion y ears , and onl y in the las t 300,000 y ears\nhave humans s tarted looking lik e we do t oday.\n1.1Themes and C oncepts of Biology\nLEARNING OB JECTIVE S\nBy the end o f this section, y ou wil l be able t o:\n•Identif y and describe the pr oper ties o f life\n•Describe the le vels o f organization among living things\n•List examples o f diff erent sub disciplines in biolog y\nBiolog yis the scienc e th

In [None]:
# @title Creating Chunks from the PDF for RAG Retrieval
# Separators are tried in order, from the coarsest (headings) to the finest (single characters).
# NOTE(review): the first five entries look like regex patterns, but the splitter
# below is not given is_separator_regex — confirm whether they are meant to be
# matched as regexes or as literal strings.
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]
# Chunk size and overlap come from CFG; add_start_index requests the chunk's
# start offset in the metadata.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = CFG.split_chunk_size,
    chunk_overlap = CFG.split_overlap,
    add_start_index=True,
    strip_whitespace=True,
    separators=MARKDOWN_SEPARATORS,
)

# Split every loaded page into retrieval-sized chunks
texts = text_splitter.split_documents(documents)

print(f'We have created {len(texts)} chunks from {len(documents)} pages')

We have created 4027 chunks from 615 pages


In [None]:
# @title Storing the Embeddings in FAISS Vector Db
%%time

### we create the embeddings only if they do not exist yet
if not os.path.exists(CFG.Embeddings_path + '/index.faiss'):

    ### download embeddings model
    embeddings = HuggingFaceInstructEmbeddings(
        model_name = CFG.embeddings_model_repo,
        model_kwargs = {"device": "cuda"}
    )

    ### create embeddings and DB
    vectordb = FAISS.from_documents(
        documents = texts,
        embedding = embeddings
    )

    ### persist vector database
    vectordb.save_local(f"{CFG.Output_folder}/faiss_index_hp") # save in output folder
#     vectordb.save_local(f"{CFG.Embeddings_path}/faiss_index_hp") # save in input folder

load INSTRUCTOR_Transformer
max_seq_length  512
CPU times: user 6.18 s, sys: 94.2 ms, total: 6.28 s
Wall time: 6.17 s


In [None]:
# @title Semantic Similarity Check
### Sanity check of the vector DB: a one-word query should return chunks about that topic
vectordb.similarity_search('lipids')

[Document(page_content='with a c ovalent bond ( Figure 2.18 ).\nFIGURE 2.18 Lipids include fats , such as trigl ycerides , which ar e made up o f fatty acids and gl ycerol, phospholipids , and s teroids .\nDuring this c ovalent bond f ormation, thr ee w ater molecules ar e released. The thr ee fat ty acids in the fat ma y be\nsimilar or dis similar . These fats ar e also cal ledtrigl ycerides because the y ha ve thr ee fat ty acids . Some fat ty acids2.3 • Biologic al Molecules 43', metadata={'source': 'ConceptsofBiology.pdf', 'page': 56, 'start_index': 863}),
 Document(page_content='constituent o f the plasma membr ane. Lipids include fats , oils , waxes, phospholipids , and s teroids .\nFIGURE 2.17 Hydrophobic lipids in the fur o f aquatic mammals , such as this riv er ot ter, protect them fr om the elements . (credit: K en\nBosma)\nAfatmolecule , such as a trigl yceride , consis ts of two main c omponents —glycerol and fat ty acids . Glycerol is an\norganic c ompound with thr ee car

In [None]:
# @title Prompt Designing with Context and Question
# Template with two placeholders: {context} and {question}. It is handed to the
# retrieval chain as its prompt, which presumably fills {context} with the
# retrieved chunks — confirm against the chain type in use.
prompt_template = """
Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

{context}

Question: {question}
Answer:"""


# Declare the placeholder names explicitly so a mismatch with the template is caught early
PROMPT = PromptTemplate(
    template = prompt_template,
    input_variables = ["context", "question"]
)

In [None]:
# @title Retrieval Chain
### `search_type` is an argument of as_retriever itself; inside search_kwargs it
### was merely forwarded to the underlying search call as a stray keyword.
retriever = vectordb.as_retriever(
    search_type = "similarity",
    search_kwargs = {"k": CFG.k}
)

### "stuff" puts all retrieved chunks into a single prompt; source documents
### are returned so answers can cite file and page.
qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff", # map_reduce, map_rerank, stuff, refine
    retriever = retriever,
    chain_type_kwargs = {"prompt": PROMPT},
    return_source_documents = True,
    verbose = False
)

In [None]:
# @title Testing RAG Similarity Search
### Inspect what max-marginal-relevance (MMR) search returns for a question.
### Note: the retriever built for the QA chain uses plain similarity search,
### so this cell is an exploration, not a test of the chain's own retrieval.
question = "What are the building block of the molecules"
vectordb.max_marginal_relevance_search(question, k = CFG.k)

[Document(page_content='and eight neutr ons. Ther efore, it has a mas s number o f 14 (six pr otons and eight neutr ons) and an at omic number o f\n6, meaning it is s till the element carbon. These tw o alternat e forms o f carbon ar e isot opes . Some isot opes ar e\nunstable and wil l lose pr otons, other subat omic par ticles , or ener gy to form mor e stable elements . These ar e cal led\nradioactiv e iso topes or radioisot opes .2.1 • The Building Block s of Molecules 29', metadata={'source': 'ConceptsofBiology.pdf', 'page': 42}),
 Document(page_content='water w ould be a g as rather than a liquid at r oom t emper atur e.\nFIGURE 2.7Hydrogen bonds f orm betw een slightl y positiv e (δ+) and slightl y neg ative (δ–) char ges o f polar c ovalent molecules , such as\nwater.\nHydrogen bonds can f orm betw een diff erent molecules and the y do not al ways ha ve to include a w ater molecule .\nHydrogen at oms in polar bonds within an y molecule can f orm bonds with other adjac ent molec

**Post-process outputs**



*   Format the LLM response
*   Cite the sources (PDF name and page)
*   Adjust the `width` parameter to control how the output is wrapped


In [None]:
def wrap_text_preserve_newlines(text, width=700):
    """Wrap each line of `text` to at most `width` characters, keeping existing line breaks.

    Args:
        text: Text to wrap; may contain newline characters.
        width: Maximum line width passed to `textwrap.fill`.

    Returns:
        The wrapped text, with the original newlines preserved between lines.
    """
    # Wrap line by line: textwrap.fill on the whole text would merge the lines.
    return '\n'.join(textwrap.fill(line, width=width) for line in text.split('\n'))


def process_llm_response(llm_response):
    """Format a QA chain response as the wrapped answer followed by a source list.

    Args:
        llm_response: Mapping with a 'result' string and a 'source_documents'
            iterable; each document exposes a `metadata` dict with a 'source'
            path and, optionally, a 'page' number.

    Returns:
        The wrapped answer, then a "Sources:" section with one
        "<file name without extension> - page: <page>" line per document.
        Documents without a 'page' entry are listed by file name only.
    """
    ans = wrap_text_preserve_newlines(llm_response['result'])

    source_lines = []
    for source in llm_response['source_documents']:
        # basename/splitext drop the directory and whatever extension the file
        # has; the former `[:-4]` slice was only correct for ".pdf"-like names.
        name = os.path.splitext(os.path.basename(source.metadata['source']))[0]
        page = source.metadata.get('page')
        if page is None:
            source_lines.append(name)
        else:
            source_lines.append(name + ' - page: ' + str(page))

    sources_used = ' \n'.join(source_lines)

    return ans + '\n\nSources: \n' + sources_used

In [None]:
def llm_ans(query):
    """Answer `query` with the retrieval QA chain and report how long it took.

    Args:
        query: The question passed to `qa_chain.invoke`.

    Returns:
        The formatted answer (including its source list) followed by a
        "Time elapsed: N s" line, with N rounded to whole seconds.
    """
    started_at = time.time()

    # Retrieval, generation and formatting are all included in the timing
    formatted_answer = process_llm_response(qa_chain.invoke(query))

    elapsed_seconds = int(round(time.time() - started_at, 0))
    return formatted_answer + f'\n\nTime elapsed: {elapsed_seconds} s'

**Ask questions**

*   Question Answering from PDF
*   Invoke QA Chain

In [None]:
# Essentially the question from the bare-LLM smoke test, now answered through the retrieval chain
query = "What are the building block of the molecules"
print(llm_ans(query))




Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

and eight neutr ons. Ther efore, it has a mas s number o f 14 (six pr otons and eight neutr ons) and an at omic number o f
6, meaning it is s till the element carbon. These tw o alternat e forms o f carbon ar e isot opes . Some isot opes ar e
unstable and wil l lose pr otons, other subat omic par ticles , or ener gy to form mor e stable elements . These ar e cal led
radioactiv e iso topes or radioisot opes .2.1 • The Building Block s of Molecules 29

acids link ed to a gl ycerol molecule
unsa turated fa tty acid a long-chain h ydrocarbon that
has one or mor e than one double bonds in the
hydrocarbon chain
van der W aals int eraction a weak at traction or
interaction betw een molecules caused b y slightl y
positiv ely char ged or slightl y neg atively char ged
atoms
Chap ter Summa

In [None]:
# Relationship question; prints the formatted answer with sources and timing
query = "How atoms and molecules are related?"
print(llm_ans(query))




Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

2.1The Building Block s of Molecules
LEARNING OB JECTIVE S
By the end o f this section, y ou wil l be able t o:
•Describe mat ter and elements
•Describe the int errelationship betw een pr otons, neutr ons, and electr ons, and the w ays in
which electr ons can be donat ed or shar ed betw een at oms
At its mos t fundamental le vel, life is made up o f mat ter.Matteroccupies spac e and has mas s. All
matter is c omposed o felements , subs tanc es that cannot be br oken do wn or tr ansformed
chemical ly int o other subs tanc es. Each element is made o f atoms , each with a c onstant number o f
protons and unique pr oper ties. A total o f 118 elements ha ve been defined; ho wever, onl y 92 oc cur

acids link ed to a gl ycerol molecule
unsa turated fa tty acid a long-chain h ydrocarbon

In [None]:
# Yes/no style question put to the retrieval chain
query = "Does molecules have nucleus?"
print(llm_ans(query))




Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

nucleus .
Neutr ons, like protons, reside in the nucleus o f an at om. The y ha ve a mas s of 1 and no char ge.
The positiv e (pr otons) and neg ative (electr ons) char ges balanc e each other in a neutr al at om,
which has a net z ero char ge.28 2 • Chemis try of Life
Access f or free at opens tax.org

proteins , pol ysaccharides , lipids , nucleic acids , and
even w orn-out or ganel les
micr oscope the ins trument that magnifies an object
mitochondria (sing ular: mit ochondrion) the c ellular
organel les r esponsible f or carr ying out c ellular
respir ation, r esul ting in the pr oduction o f ATP, the
cell’s main ener gy-carr ying molecule
nuclear en velope the double -membr ane s tructur e
that c onstitut es the out ermos t por tion o f the
nucleus
nucleolus the darkl y stain

In [None]:
# Open-ended "how" question put to the retrieval chain
query = "How molecules are made?"
print(llm_ans(query))




Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

2.1The Building Block s of Molecules
LEARNING OB JECTIVE S
By the end o f this section, y ou wil l be able t o:
•Describe mat ter and elements
•Describe the int errelationship betw een pr otons, neutr ons, and electr ons, and the w ays in
which electr ons can be donat ed or shar ed betw een at oms
At its mos t fundamental le vel, life is made up o f mat ter.Matteroccupies spac e and has mas s. All
matter is c omposed o felements , subs tanc es that cannot be br oken do wn or tr ansformed
chemical ly int o other subs tanc es. Each element is made o f atoms , each with a c onstant number o f
protons and unique pr oper ties. A total o f 118 elements ha ve been defined; ho wever, onl y 92 oc cur

compounds can be made ( Figure 2.14 a). The carbon at oms ma y bond with at oms o f othe

In [None]:
# Long-form request: asks for a 1000-word answer on a specific topic
query = "Generate a response of 1000 words on carbon dating"
print(llm_ans(query))




Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

geological ly than the mo vement betw een living or ganisms . Carbon is s tored for long periods in what ar e kno wn as
carbon r eser voirs, which include the atmospher e, bodies o f liquid w ater (mos tly oc eans), oc ean sediment, soil ,
rocks (including f ossil fuels), and Ear th’s int erior .
As s tated, the atmospher e is a major r eser voir o f carbon in the f orm o f carbon dio xide that is es sential t o the536 20 • E cosystems and the Biospher e
Access f or free at opens tax.org

VISU AL C ONNE CTION
FIGURE 2.3Arranged in c olumns and r ows based on the char acteristics o f the elements , the periodic table pr ovides k ey inf ormation about
the elements and ho w the y might int eract with each other t o form molecules . Mos t periodic tables pr ovide a k ey or leg end t 

In [None]:
# Definition lookup: ask the retrieval chain what "phylogeny" means

query = "what is phylogeny ?"
print(llm_ans(query))




Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

obvious w ay with the leas t number o f steps
molecular s ystema tics the methods o f using
molecular e videnc e to identif y ph ylog eneticrelationships
monoph yletic gr oup (also , clade) or ganisms that
shar e a single anc estor
order the cat egory in the tax onomic clas sification
system that fal ls within clas s and includes families
phylog enetic tr ee diagr am used t o reflect the
evolutionar y relationships betw een or ganisms or
groups o f organisms
phylog enyevolutionar y his tory and r elationship o f an
organism or gr oup o f organisms
phylum the cat egory in the tax onomic clas sification
system that fal ls within king dom and includes
clas ses
rooted describing a ph ylog enetic tr ee with a single
ancestral lineag e to which al l organisms

organization in living th

In [None]:
from google.cloud import storage

### Folder written by vectordb.save_local(...) earlier in this notebook. The
### previous local path ('vectorstore/db_faiss/...') is never created here.
local_index_dir = f"{CFG.Output_folder}/faiss_index_hp"

### Credentials are resolved by the client from the environment.
storage_client = storage.Client()

bucket = storage_client.get_bucket("object-detection-yolov8") # your bucket name

### The object name inside the bucket is unchanged; only the local source path is corrected.
blob = bucket.blob('vectorstore/db_faiss/index.faiss')
blob.upload_from_filename(f"{local_index_dir}/index.faiss")


In [None]:
from google.cloud import storage

### Folder written by vectordb.save_local(...) earlier in this notebook. The
### previous local path ('vectorstore/db_faiss/...') is never created here.
local_index_dir = f"{CFG.Output_folder}/faiss_index_hp"

### Credentials are resolved by the client from the environment.
storage_client = storage.Client()

bucket = storage_client.get_bucket("object-detection-yolov8") # your bucket name

### index.pkl is the companion file of index.faiss; both are needed to load the index.
### The object name inside the bucket is unchanged; only the local source path is corrected.
blob = bucket.blob('vectorstore/db_faiss/index.pkl')
blob.upload_from_filename(f"{local_index_dir}/index.pkl")