### Importing libraries

In [1]:
import os
import fitz
import pprint
import torch
import pandas as pd
from tqdm.auto import tqdm

### Going thorugh the text of the book and storing in a dataframe

In [2]:
pdf_path = 'Human-Nutrition-2020-Edition-1598491699.pdf'

# Using pymupdf library to read the pdf file.
doc = fitz.open(pdf_path)

book_content = []
for page_number, page_content in tqdm(enumerate(doc), total=len(doc)):

    # Extracting text from page
    page_text = page_content.get_text()

    # removing newline characters
    page_text = page_text.replace('\n', ' ').strip()

    # creating a dictionary for each page to convert it into a dataframe later. Could also use a list of tuples bus dictionaries are more readable.
    book_content.append({
        "page_number" : page_number+1,
        "text": page_text,
        "tokens": len(page_text)/ 4
    })

  0%|          | 0/1208 [00:00<?, ?it/s]

### Creating the dataframe.

In [3]:
data = pd.DataFrame(book_content)
data

Unnamed: 0,page_number,text,tokens
0,1,Human Nutrition: 2020 Edition,7.25
1,2,,0.00
2,3,Human Nutrition: 2020 Edition UNIVERSITY OF ...,80.00
3,4,Human Nutrition: 2020 Edition by University of...,53.00
4,5,Contents Preface University of Hawai‘i at Mā...,199.25
...,...,...,...
1203,1204,39. Exercise 10.2 & 11.3 reused “Egg Oval Food...,419.00
1204,1205,Images / Pixabay License; “Pumpkin Cartoon Ora...,404.25
1205,1206,Flashcard Images Note: Most images in the fla...,428.75
1206,1207,ShareAlike 11. Organs reused “Pancreas Organ ...,433.25


### Removing the pages having tokens less than 30

In [4]:
data = data[data['tokens']>30].copy()
data.reset_index(drop = True, inplace = True)
data

Unnamed: 0,page_number,text,tokens
0,3,Human Nutrition: 2020 Edition UNIVERSITY OF ...,80.00
1,4,Human Nutrition: 2020 Edition by University of...,53.00
2,5,Contents Preface University of Hawai‘i at Mā...,199.25
3,6,Lifestyles and Nutrition University of Hawai‘...,244.00
4,7,The Cardiovascular System University of Hawai...,259.25
...,...,...,...
1133,1204,39. Exercise 10.2 & 11.3 reused “Egg Oval Food...,419.00
1134,1205,Images / Pixabay License; “Pumpkin Cartoon Ora...,404.25
1135,1206,Flashcard Images Note: Most images in the fla...,428.75
1136,1207,ShareAlike 11. Organs reused “Pancreas Organ ...,433.25


### Using spacy to breakdown the paragraphs into sentences.

In [5]:
from spacy.lang.en import English
# Using spacy to split the text into sentences.

nlp = English()
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x795253f10300>

In [6]:
data['sentences'] = data['text'].apply(lambda x: [sent.text for sent in nlp(x).sents])
data['num_sentences'] = data['sentences']. apply(len)

In [7]:
data

Unnamed: 0,page_number,text,tokens,sentences,num_sentences
0,3,Human Nutrition: 2020 Edition UNIVERSITY OF ...,80.00,[Human Nutrition: 2020 Edition UNIVERSITY OF...,1
1,4,Human Nutrition: 2020 Edition by University of...,53.00,[Human Nutrition: 2020 Edition by University o...,1
2,5,Contents Preface University of Hawai‘i at Mā...,199.25,[Contents Preface University of Hawai‘i at M...,2
3,6,Lifestyles and Nutrition University of Hawai‘...,244.00,[Lifestyles and Nutrition University of Hawai...,3
4,7,The Cardiovascular System University of Hawai...,259.25,[The Cardiovascular System University of Hawa...,1
...,...,...,...,...,...
1133,1204,39. Exercise 10.2 & 11.3 reused “Egg Oval Food...,419.00,"[39., Exercise 10.2 & 11.3 reused “Egg Oval Fo...",18
1134,1205,Images / Pixabay License; “Pumpkin Cartoon Ora...,404.25,[Images / Pixabay License; “Pumpkin Cartoon Or...,10
1135,1206,Flashcard Images Note: Most images in the fla...,428.75,[Flashcard Images Note: Most images in the fl...,13
1136,1207,ShareAlike 11. Organs reused “Pancreas Organ ...,433.25,"[ShareAlike 11., Organs reused “Pancreas Orga...",13


### Chunking sentences, each chunk will have 10 senetences.

In [8]:
chunk_size = 10

def chunk_sentences(sentences, chunk_size):
    return [''.join(sentences[i:i+ chunk_size]) for i in range(0, len(sentences),chunk_size//2)]

data['text_chunks'] = data['sentences'].apply(lambda x: chunk_sentences(x, chunk_size))
data['num_chunks'] = data['text_chunks'].apply(len)
data

Unnamed: 0,page_number,text,tokens,sentences,num_sentences,text_chunks,num_chunks
0,3,Human Nutrition: 2020 Edition UNIVERSITY OF ...,80.00,[Human Nutrition: 2020 Edition UNIVERSITY OF...,1,[Human Nutrition: 2020 Edition UNIVERSITY OF...,1
1,4,Human Nutrition: 2020 Edition by University of...,53.00,[Human Nutrition: 2020 Edition by University o...,1,[Human Nutrition: 2020 Edition by University o...,1
2,5,Contents Preface University of Hawai‘i at Mā...,199.25,[Contents Preface University of Hawai‘i at M...,2,[Contents Preface University of Hawai‘i at M...,1
3,6,Lifestyles and Nutrition University of Hawai‘...,244.00,[Lifestyles and Nutrition University of Hawai...,3,[Lifestyles and Nutrition University of Hawai...,1
4,7,The Cardiovascular System University of Hawai...,259.25,[The Cardiovascular System University of Hawa...,1,[The Cardiovascular System University of Hawa...,1
...,...,...,...,...,...,...,...
1133,1204,39. Exercise 10.2 & 11.3 reused “Egg Oval Food...,419.00,"[39., Exercise 10.2 & 11.3 reused “Egg Oval Fo...",18,[39.Exercise 10.2 & 11.3 reused “Egg Oval Food...,4
1134,1205,Images / Pixabay License; “Pumpkin Cartoon Ora...,404.25,[Images / Pixabay License; “Pumpkin Cartoon Or...,10,[Images / Pixabay License; “Pumpkin Cartoon Or...,2
1135,1206,Flashcard Images Note: Most images in the fla...,428.75,[Flashcard Images Note: Most images in the fl...,13,[Flashcard Images Note: Most images in the fl...,3
1136,1207,ShareAlike 11. Organs reused “Pancreas Organ ...,433.25,"[ShareAlike 11., Organs reused “Pancreas Orga...",13,[ShareAlike 11.Organs reused “Pancreas Organ ...,3


### Creating the vector database with chunks

In [9]:
vector_db = data['text_chunks'].explode().reset_index(drop=True).to_frame()
vector_db

Unnamed: 0,text_chunks
0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
1,Human Nutrition: 2020 Edition by University of...
2,Contents Preface University of Hawai‘i at Mā...
3,Lifestyles and Nutrition University of Hawai‘...
4,The Cardiovascular System University of Hawai...
...,...
2947,Hazard Analysis Critical Control Points reused...
2948,ShareAlike 11.Organs reused “Pancreas Organ A...
2949,Protein reused “The Macronutrients: Carbohydra...
2950,Sucrose reused “Figure 03 02 05” by OpenStax B...


### Generating embeddings for each chunk

In [12]:
from sentence_transformers import SentenceTransformer, util

In [14]:
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", 
                                      device="cuda")

text_chunk_embeddings = embedding_model.encode(vector_db['text_chunks'].tolist(),
                                               batch_size=16, 
                                               convert_to_tensor=True)

In [15]:
vector_db['embeddings'] = text_chunk_embeddings.tolist()
vector_db

Unnamed: 0,text_chunks,embeddings
0,Human Nutrition: 2020 Edition UNIVERSITY OF ...,"[0.06742425262928009, 0.0902281105518341, -0.0..."
1,Human Nutrition: 2020 Edition by University of...,"[0.05521562322974205, 0.05921393632888794, -0...."
2,Contents Preface University of Hawai‘i at Mā...,"[0.02798021212220192, 0.033981405198574066, -0..."
3,Lifestyles and Nutrition University of Hawai‘...,"[0.06825671344995499, 0.038127463310956955, -0..."
4,The Cardiovascular System University of Hawai...,"[0.03302641957998276, -0.008497660979628563, 0..."
...,...,...
2947,Hazard Analysis Critical Control Points reused...,"[0.03347204625606537, -0.05704408511519432, 0...."
2948,ShareAlike 11.Organs reused “Pancreas Organ A...,"[0.07705153524875641, 0.009785559959709644, -0..."
2949,Protein reused “The Macronutrients: Carbohydra...,"[0.11963801831007004, 0.018848083913326263, -0..."
2950,Sucrose reused “Figure 03 02 05” by OpenStax B...,"[0.10304515808820724, -0.016470178961753845, 0..."


In [None]:
# Saving the file
vector_db.to_pickle('vector_db.pkl')

### Retrieval using embeddings

In [18]:
query = "What are Proteins and Carbohydrates?"

query_embedding = embedding_model.encode(query, 
                                           convert_to_tensor=True, 
                                           device="cuda")

In [20]:
dot_prod = util.dot_score(query_embedding, text_chunk_embeddings)[0]

top_results = torch.topk(dot_prod, k=5)
top_results

torch.return_types.topk(
values=tensor([0.7134, 0.6359, 0.6320, 0.6303, 0.6234], device='cuda:0'),
indices=tensor([  53,   46, 1003,  909,  908], device='cuda:0'))

In [21]:
vector_db.iloc[top_results.indices.cpu()]['text_chunks'].values

array(['Figure 1.1 The  Macronutrie nts:  Carbohydrat es, Lipids,  Protein, and  Water  Proteins  Proteins are macromolecules composed of chains of subunits called  amino acids.Amino acids are simple subunits composed of carbon,  oxygen, hydrogen, and nitrogen.Food sources of proteins include  meats, dairy products, seafood, and a variety of different plant- based foods, most notably soy.The word protein comes from a  Greek word meaning “of primary importance,” which is an apt  description of these macronutrients; they are also known  colloquially as the “workhorses” of life.Proteins provide four  kilocalories of energy per gram; however providing energy is not  protein’s most important function.Proteins provide structure to  bones, muscles and skin, and play a role in conducting most of the  chemical reactions that take place in the body.Scientists estimate  that greater than one-hundred thousand different proteins exist  within the human body.The genetic codes in DNA are basically  p

### Augmenting the original text with the retrieved chunks

In [23]:
context = "- " +"\n- ".join(vector_db.iloc[top_results.indices.tolist()]['text_chunks'].values)

query_to_ask = f"Given the context :\n{context}\n\nAnswer to the point and you take reference provided context. \n\nAnswer the below query:\n{query} \n\nAnswer:"

print(query_to_ask)

Given the context :
- Figure 1.1 The  Macronutrie nts:  Carbohydrat es, Lipids,  Protein, and  Water  Proteins  Proteins are macromolecules composed of chains of subunits called  amino acids.Amino acids are simple subunits composed of carbon,  oxygen, hydrogen, and nitrogen.Food sources of proteins include  meats, dairy products, seafood, and a variety of different plant- based foods, most notably soy.The word protein comes from a  Greek word meaning “of primary importance,” which is an apt  description of these macronutrients; they are also known  colloquially as the “workhorses” of life.Proteins provide four  kilocalories of energy per gram; however providing energy is not  protein’s most important function.Proteins provide structure to  bones, muscles and skin, and play a role in conducting most of the  chemical reactions that take place in the body.Scientists estimate  that greater than one-hundred thousand different proteins exist  within the human body.The genetic codes in DNA ar

### Generating the answer to our question.

### Using model MS Phi2

In [24]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load tokenizer and model
model_id = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [25]:
# Tokenize
inputs = tokenizer(query_to_ask, return_tensors="pt").to("cuda")

# Generate response
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    
# Decode and print
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
pprint.pprint(response,width=150)

('Given the context :\n'
 '- Figure 1.1 The  Macronutrie nts:  Carbohydrat es, Lipids,  Protein, and  Water  Proteins  Proteins are macromolecules composed of chains of '
 'subunits called  amino acids.Amino acids are simple subunits composed of carbon,  oxygen, hydrogen, and nitrogen.Food sources of proteins include  '
 'meats, dairy products, seafood, and a variety of different plant- based foods, most notably soy.The word protein comes from a  Greek word meaning '
 '“of primary importance,” which is an apt  description of these macronutrients; they are also known  colloquially as the “workhorses” of '
 'life.Proteins provide four  kilocalories of energy per gram; however providing energy is not  protein’s most important function.Proteins provide '
 'structure to  bones, muscles and skin, and play a role in conducting most of the  chemical reactions that take place in the body.Scientists '
 'estimate  that greater than one-hundred thousand different proteins exist  within the human b

In [28]:
def ask(query,model,tokenizer):
    query_embedding = embedding_model.encode(query, 
                                               convert_to_tensor=True, 
                                               device="cuda")

    dot_scores = util.dot_score(query_embedding, text_chunk_embeddings)[0]
    top_results = torch.topk(dot_scores, k=5)

    context = "- " + "\n- ".join(vector_db.iloc[top_results.indices.tolist()]['text_chunks'].values)

    query_to_ask = f"Given the context :\n{context}\n\nAnswer to the point and you take reference provided context. \n\nAnswer the below query:\n{query} \n\nAnswer:"

    inputs = tokenizer(query_to_ask, return_tensors="pt").to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
        
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return response

In [31]:
pprint.pprint(ask("What do Carbohydrates do in our body?", model, tokenizer),width=150)

('Given the context :\n'
 '- The Functions of  Carbohydrates in the Body  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION '
 'PROGRAM  There are five primary functions of carbohydrates in the human  body.They are energy production, energy storage, building  '
 'macromolecules, sparing protein, and assisting in lipid metabolism. Energy Production  The primary role of carbohydrates is to supply energy to '
 'all cells  in the body.Many cells prefer glucose as a source of energy versus  other compounds like fatty acids.Some cells, such as red blood  '
 'cells, are only able to produce cellular energy from glucose.The  brain is also highly sensitive to low blood-glucose levels because  it uses only '
 'glucose to produce energy and function (unless under  extreme starvation conditions).About 70 percent of the glucose  entering the body from '
 'digestion is redistributed (by the liver) back  into the blood for use by other tissues.Cells that req

In [35]:
pprint.pprint(ask("What is the powerhouse of cell?", model, tokenizer),width=150)

('Given the context :\n'
 '- molecule\xa0water (H2O).Molecules bond together to make bigger  macromolecules.The carbon atom is often referred to as the  backbone of life '
 'because it can readily bond with four other  elements to form long chains and more complex macromolecules. Four macromolecules—carbohydrates, '
 'lipids, proteins, and nucleic  acids—make up all of the structural and functional units of cells. Although we defined the cell as the “most basic” '
 'unit of life,  it is structurally and functionally complex\xa0(Figure 2.2 “The Cell  Structure”).\xa0A cell can be thought of as a mini-organism '
 'consisting  of tiny organs called organelles.The organelles are structural and  functional units constructed from several macromolecules bonded  '
 'together.A typical animal cell contains the following organelles:  the nucleus (which houses the genetic material DNA), mitochondria  (which '
 'generate energy), ribosomes (which produce protein), the  endoplasmic reticulum (which