In [13]:
import fitz
import os
import numpy as np
from tqdm.auto import tqdm
import re
import torch
import pandas as pd
import gc
torch.cuda.empty_cache()
gc.collect()


device = 'cuda' if torch.cuda.is_available else "cpu"

print(f"[INFO] using {device}")

[INFO] using cuda


In [14]:
def text_formater(text:str) -> str:
    return_text = text.replace("\n"," ")
    return_text = return_text.lower()
    return return_text

def open_and_clean_pdf(path1:str,pages_and_text:list,page_number=0) :
    pdf = fitz.open(path1)
    for pages_no,pages_text in tqdm(enumerate(pdf)):
        text = pages_text.get_text()
        text = text_formater(text)
        pages_and_text.append( {
            "page_number":pages_no - page_number,
            "page_char_count": len(text),
            "number_of_tokens":len(text)/4,
            "pages_sentence_count":len(text.split(".")),
             "page_text":text,
            "Book_Name":path1.replace(".pdf"," ")
        })
        

In [15]:
pages_and_text = []

In [16]:
open_and_clean_pdf("A Brief History of Time.pdf",pages_and_text,7)

0it [00:00, ?it/s]

In [17]:
open_and_clean_pdf("The Universe in a Nutshell.pdf",pages_and_text,11)

0it [00:00, ?it/s]

In [18]:
len(pages_and_text)

404

In [19]:
import pandas as pd
df = pd.DataFrame(pages_and_text)

df.describe().round(2)

Unnamed: 0,page_number,page_char_count,number_of_tokens,pages_sentence_count
count,404.0,404.0,404.0,404.0
mean,92.05,1468.07,367.02,12.6
std,59.36,931.3,232.83,7.48
min,-11.0,0.0,0.0,1.0
25%,41.0,644.0,161.0,7.0
50%,91.5,1457.5,364.38,14.0
75%,142.0,2465.5,616.38,18.0
max,207.0,3042.0,760.5,34.0


In [20]:
import random

random.sample(pages_and_text,k=5)

[{'page_number': -4,
  'page_char_count': 1282,
  'number_of_tokens': 320.5,
  'pages_sentence_count': 26,
  'page_text': 'a brief history of time a bantam book publishing history bantam illustrated hardcover edition published november 1996 bantam hardcover edition/september 1998 bantam trade paperback edition/september 1998 all rights reserved. copyright © 1988, 1996 by stephen hawking illustrations copyright © 1988 by ron miller book design by glen m. edelstein no part of this book may be reproduced or transmitted in any form or by any means, electronic or mechanical, including photocopying, recording, or by any information storage and retrieval system, without permission in writing from the publisher. for information address: bantam books. library of congress cataloging-in-publication data hawking, s. w. (stephen w.) a brief history of time / stephen hawking. p.   cm. includes index. eisbn: 978-0-55389692-3 1. cosmology. i. title. qb981.h377   1998 523.1—dc21      98-21874 bantam bo

In [21]:
from spacy.lang.en import English

nlp = English()

In [22]:
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x277ff064740>

In [23]:
for item in tqdm(pages_and_text):
    page_text = item['page_text']
    item['pages_sentences'] = page_text.split('.')
    doc = nlp(page_text)
    spacy_sentences = list(doc.sents)
    item['pages_sentences_spacy'] = [str(sen) for sen in spacy_sentences]
    item['spacy_sentence_count'] = len(item['pages_sentences_spacy'])

  0%|          | 0/404 [00:00<?, ?it/s]

In [24]:
pages_and_text[12]

{'page_number': 5,
 'page_char_count': 2617,
 'number_of_tokens': 654.25,
 'pages_sentence_count': 16,
 'page_text': 'motionless. would they not all fall together at some point? in a letter in 1691 to richard bentley, another leading thinker of his day, newton argued that this would indeed happen if there were only a finite number of stars distributed over a finite region of space. but he reasoned that if, on the other hand, there were an infinite number of stars, distributed more or less uniformly over infinite space, this would not happen, because there would not be any central point for them to fall to. this argument is an instance of the pitfalls that you can encounter in talking about infinity. in an infinite universe, every point can be regarded as the center, because every point has an infinite number of stars on each side of it. the correct approach, it was realized only much later, is to consider the finite situation, in which the stars all fall in on each other, and then to a

In [25]:
import pandas as pd

df = pd.DataFrame(pages_and_text)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,number_of_tokens,pages_sentence_count,spacy_sentence_count
count,404.0,404.0,404.0,404.0,404.0
mean,92.05,1468.07,367.02,12.6,12.36
std,59.36,931.3,232.83,7.48,7.44
min,-11.0,0.0,0.0,1.0,0.0
25%,41.0,644.0,161.0,7.0,6.0
50%,91.5,1457.5,364.38,14.0,14.0
75%,142.0,2465.5,616.38,18.0,18.0
max,207.0,3042.0,760.5,34.0,31.0


In [26]:
num_sentence_chunk_size = 10 

def split_list(input_list: list, 
               slice_size: int) -> list[list[str]]:

    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

for item in tqdm(pages_and_text):
    item["sentence_chunks"] = split_list(input_list=item["pages_sentences_spacy"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/404 [00:00<?, ?it/s]

In [27]:
pages_and_text[12]

{'page_number': 5,
 'page_char_count': 2617,
 'number_of_tokens': 654.25,
 'pages_sentence_count': 16,
 'page_text': 'motionless. would they not all fall together at some point? in a letter in 1691 to richard bentley, another leading thinker of his day, newton argued that this would indeed happen if there were only a finite number of stars distributed over a finite region of space. but he reasoned that if, on the other hand, there were an infinite number of stars, distributed more or less uniformly over infinite space, this would not happen, because there would not be any central point for them to fall to. this argument is an instance of the pitfalls that you can encounter in talking about infinity. in an infinite universe, every point can be regarded as the center, because every point has an infinite number of stars on each side of it. the correct approach, it was realized only much later, is to consider the finite situation, in which the stars all fall in on each other, and then to a

In [28]:
import re

pages_and_chunks = []
for item in tqdm(pages_and_text):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

  0%|          | 0/404 [00:00<?, ?it/s]

697

In [29]:
import pandas as pd

df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,697.0,697.0,697.0,697.0
mean,92.46,837.68,146.74,209.42
std,56.7,488.64,81.69,122.16
min,-10.0,0.0,1.0,0.0
25%,43.0,404.0,72.0,101.0
50%,93.0,932.0,165.0,233.0
75%,140.0,1206.0,209.0,301.5
max,207.0,1936.0,322.0,484.0


In [30]:
%%time
from sentence_transformers import SentenceTransformer,util

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                     device='cuda')

CPU times: total: 1.17 s
Wall time: 4.32 s


In [31]:
embedding_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [32]:
for item in tqdm(pages_and_chunks):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"],batch_size=64)

  0%|          | 0/697 [00:00<?, ?it/s]

In [33]:
pages_and_chunks[111]

{'page_number': 54,
 'sentence_chunk': 'the phenomenon of interference between particles has been crucial to our understanding of the structure of atoms, the basic units of chemistry and biology and the building blocks out of which we, and everything around us, are made.at the beginning of this century it was thought that atoms were rather like the planets orbiting the sun, with electrons (particles of negative electricity) orbiting around a central nucleus, which carried positive electricity.the attraction between the positive and negative electricity was supposed to keep the electrons in their orbits in the same way that the gravitational attraction between the sun and the planets keeps the planets in their orbits.the trouble with this was that the laws of mechanics and electricity, before quantum mechanics, predicted that the electrons would lose energy and so spiral inward until they collided with the nucleus.this would mean that the atom, and indeed all matter, should rapidly coll

In [34]:
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [35]:
text_chunks_and_embeddings_df

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-6,also by stephen hawking a briefer history of t...,300,50,75.00,"[0.007761275, -0.043458454, 0.021624561, -0.04..."
1,-4,a brief history of time a bantam book publishi...,930,132,232.50,"[0.034367207, -0.040824458, -0.0057925847, -0...."
2,-4,"its trademark, consisting of the words “bantam...",331,48,82.75,"[0.06875393, 0.007447075, -0.031086687, 0.0038..."
3,-3,contents cover other books by this author titl...,607,103,151.75,"[0.0128484955, -0.019453801, -0.018313618, -0...."
4,-2,about the author,16,3,4.00,"[0.04096278, 0.057624783, -0.0049860245, 0.027..."
...,...,...,...,...,...,...
692,203,2 0 6 t h e u n i v e r s e i n a n u t s h e l l,49,25,12.25,"[-0.003972819, -0.019697886, -0.010792694, -0...."
693,204,2 0 7 g l o s s a r y,21,11,5.25,"[0.010606958, 0.018182505, 0.038448013, 0.0154..."
694,205,2 0 8 t h e u n i v e r s e i n a n u t s h e l l,49,25,12.25,"[-0.054821387, -0.0071016466, -0.0125736715, -..."
695,206,2 0 9 s u g g e s t e d f u r t h e r r e a d ...,53,27,13.25,"[-0.05764687, -0.027608752, 0.0080152005, -0.0..."


In [36]:
import torch
pages_and_chunks = text_chunks_and_embeddings_df.to_dict(orient="records")
embeddings = text_chunks_and_embeddings_df["embedding"].to_list()
tensor_embeddings = [torch.Tensor(embedding) for embedding in embeddings]
embedding_tensor = torch.stack(tensor_embeddings).to(device)

In [37]:
embedding_tensor

tensor([[ 0.0078, -0.0435,  0.0216,  ...,  0.0420, -0.0227, -0.0199],
        [ 0.0344, -0.0408, -0.0058,  ...,  0.0550, -0.0617, -0.0125],
        [ 0.0688,  0.0074, -0.0311,  ..., -0.0414, -0.0021, -0.0064],
        ...,
        [-0.0548, -0.0071, -0.0126,  ...,  0.0578, -0.0208, -0.0249],
        [-0.0576, -0.0276,  0.0080,  ...,  0.0491, -0.0014, -0.0075],
        [-0.0539, -0.0090, -0.0119,  ...,  0.0675,  0.0009, -0.0161]],
       device='cuda:0')

In [38]:
query = "what are black holes"
print(f"Query: {query}")

query_embedding = embedding_model.encode(query, convert_to_tensor=True)

from time import perf_counter as timer

start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=embedding_tensor)[0]
end_time = timer()

print(f"Time take to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product

Query: what are black holes
Time take to get scores on 697 embeddings: 0.00020 seconds.


torch.return_types.topk(
values=tensor([0.6323, 0.5818, 0.5733, 0.5266, 0.5155], device='cuda:0'),
indices=tensor([148, 182, 544, 542, 173], device='cuda:0'))

In [39]:
print(pages_and_chunks[148]['sentence_chunk'])

t chapter 6 black holes he term black hole is of very recent origin.it was coined in 1969 by the american scientist john wheeler as a graphic description of an idea that goes back at least two hundred years, to a time when there were two theories about light: one, which newton favored, was that it was composed of particles; the other was that it was made of waves.we now know that really both theories are correct.by the wave/particle duality of quantum mechanics, light can be regarded as both a wave and a particle.under the theory that light is made up of waves, it was not clear how it would respond to gravity.but if light is composed of particles, one might expect them to be affected by gravity in the same way that cannonballs, rockets, and planets are.at first people thought that particles of light traveled infinitely fast, so gravity would not have been able to slow them down, but the discovery by roemer that light travels at a finite speed meant that gravity might have an important 

In [40]:
query = "what happens to a human when he/she enters a black hole"
print(f"Query: {query}")


query_embedding = embedding_model.encode(query, convert_to_tensor=True)

from time import perf_counter as timer

start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=embedding_tensor)[0]
end_time = timer()

print(f"Time take to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product

Query: what happens to a human when he/she enters a black hole
Time take to get scores on 697 embeddings: 0.00016 seconds.


torch.return_types.topk(
values=tensor([0.5993, 0.5880, 0.5787, 0.5642, 0.5560], device='cuda:0'),
indices=tensor([164, 166, 561, 163, 204], device='cuda:0'))

In [41]:
pages_and_chunks[164]

{'page_number': 79,
 'sentence_chunk': 'the laws of science and our ability to predict the future would break down.however, any observer who remained outside the black hole would not be affected by this failure of predictability, because neither light nor any other signal could reach him from the singularity.this remarkable fact led roger penrose to propose the cosmic censorship hypothesis, which might be paraphrased as “god abhors a naked singularity.”in other words, the singularities produced by gravitational collapse occur only in places, like black holes, where they are decently hidden from outside view by an event horizon.strictly, this is what is known as the weak cosmic censorship hypothesis: it protects observers who remain outside the black hole from the consequences of the breakdown of predictability that occurs at the singularity, but it does nothing at all for the poor unfortunate astronaut who falls into the hole.there are some solutions of the equations of general relativ

In [42]:
def retrival_query_resources(query:str,
                             embeddings: torch.tensor,
                             model: SentenceTransformer=embedding_model,
                             indices_to_return:int = 5):



        query_embedding = model.encode(query,convert_to_tensor=True)
        start_time = timer()
        dot_scores = util.dot_score(query_embedding,embeddings)[0]
        end_time = timer()
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings :{end_time - start_time}")
        scores,indices = torch.topk(dot_scores,k=indices_to_return)
        return scores,indices

def print_query_results(query:str,embeddings,pages_and_chunks=pages_and_chunks):
    
    scores,indices = retrival_query_resources(query=query,
                                              embeddings=embeddings)

    for score,idx in zip(scores,indices):
        print(f"Score:{score*100:.2f}")
        print("Text:")
        print(pages_and_chunks[idx]['sentence_chunk'])
        print("\n")

In [43]:
print_query_results("what is Black Hole",embedding_tensor)

[INFO] Time taken to get scores on 697 embeddings :6.34000000445667e-05
Score:59.09
Text:
t chapter 6 black holes he term black hole is of very recent origin.it was coined in 1969 by the american scientist john wheeler as a graphic description of an idea that goes back at least two hundred years, to a time when there were two theories about light: one, which newton favored, was that it was composed of particles; the other was that it was made of waves.we now know that really both theories are correct.by the wave/particle duality of quantum mechanics, light can be regarded as both a wave and a particle.under the theory that light is made up of waves, it was not clear how it would respond to gravity.but if light is composed of particles, one might expect them to be affected by gravity in the same way that cannonballs, rockets, and planets are.at first people thought that particles of light traveled infinitely fast, so gravity would not have been able to slow them down, but the discovery 

In [1]:
#Building the LLM
import torch
from transformers import AutoTokenizer,AutoModelForCausalLM
from transformers import BitsAndBytesConfig


In [2]:
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                        bnb_4bit_compute_dtype=torch.float16)

In [51]:
model_id = "meta-llama/Llama-2-7b-chat-hf"

In [63]:

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                            torch_dtype = torch.float16,
                                            quantization_config =quantization_config,
                                           )

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [64]:
import gc
torch.cuda.empty_cache()
gc.collect()


12632

In [65]:
print("[INFO] model paramters",sum([param.numel() for param in model.parameters()]))

[INFO] model paramters 3500412928


In [66]:
questions = [
    "What is the concept of spacetime and how does it relate to our understanding of the universe?",
    "How do black holes form, and what are the key characteristics that define them?",
    "What is the event horizon of a black hole, and what significance does it hold in the context of general relativity?",
    "Can anything escape from the gravitational pull of a black hole? If so, under what conditions?",
    "What is Hawking radiation, and how does it challenge our understanding of black holes?",
    "How do black holes affect the fabric of spacetime around them, and what implications does this have for the universe?",
    "What role do black holes play in the formation and evolution of galaxies?",
    "What are some proposed theories for what lies beyond the event horizon of a black hole?",
    "How do black holes distort space and time around them, and how is this effect observed?",
    "What are some potential applications or consequences of harnessing the power of black holes or manipulating spacetime?"
]


In [67]:
def prompt_fomatting(
    query:str, context_dict : list[dict]) -> str:
    context = "- " +  "\n- ".join([item['sentence_chunk'] for item in context_dict])
    
    base_prompt = """"Based on the following context , please answer the query.
    query: {query}
    context: {context} """
    base_prompt = base_prompt.format(context=context, query=query)

    dialogue_template = [
    {"role":"user",
    "content":base_prompt}]

    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                          tokenize=False,
                                          add_generation_prompt=True)

 
    
    return base_prompt

In [68]:
import random

query = random.choice(questions)
print(f"{query}")

scores,indicies = retrival_query_resources(query,embedding_tensor)
scores,indicies

How do black holes distort space and time around them, and how is this effect observed?
[INFO] Time taken to get scores on 697 embeddings :6.259999963731389e-05


(tensor([0.5609, 0.5092, 0.4989, 0.4869, 0.4739], device='cuda:0'),
 tensor([420, 468, 399, 166, 551], device='cuda:0'))

In [69]:
context_items = [pages_and_chunks[i] for i in indicies]

prompt = prompt_fomatting(query,context_items)

In [70]:
%%time

input_ids = tokenizer(prompt,return_tensors='pt').to('cuda')

CPU times: total: 0 ns
Wall time: 4.01 ms


In [71]:
%%time

output = model.generate(**input_ids,
                        temperature=0.7,
                        do_sample=True,
                        max_new_tokens=256)

output_text = tokenizer.decode(output[0])

print(f"Query: {query}")
print(f"Rag Answer")

Query: How do black holes distort space and time around them, and how is this effect observed?
Rag Answer
CPU times: total: 5.45 s
Wall time: 30.5 s


In [75]:
output_text.replace(prompt," ").replace(prompt, "").replace("<s>", "").replace("</s>", "").replace("\n"," ")

'   - the singularity of a black hole is an interesting and important place.the singularity is a point where the curvature of spacetime becomes infinite, and the laws of physics as we know them break down.at the singularity, the gravitational pull is so strong that not even light can escape, and the space around the singularity is warped in a way that makes it impossible to predict what will happen next.the singularity is surrounded by an event horizon, which marks the boundary beyond which nothing, not even light, can escape.the event horizon is the point of no return, and anything that crosses it will be pulled into the singularity.the singularity is a fascinating and mysterious place, and scientists are still trying to understand what happens there.'

In [73]:
print_query_results(query,embedding_tensor)

[INFO] Time taken to get scores on 697 embeddings :8.200000002034358e-05
Score:56.09
Text:
35 t h e s h a p e o f t i m e general relativity combines the time dimension with the three dimensions of space to form what is called spacetime (see page 3 3 , fig.2 .3 ) .the theory incorporates the effect of gravity by saying that the distribution of matter and energy in the universe warps and distorts spacetime, so that it is not flat.objects in this spacetime try to move in straight lines, but because spacetime is curved, their paths appear bent.they move as if affected by a gravitational field. as a rough analogy, not to be taken too literally, imagine a sheet of rubber.one can place a large ball on the sheet to represent the sun.the weight of the ball will depress the sheet and cause it to be curved near the sun.if one now rolls little ball bearings on the sheet, they won't roll straight across to the other side but instead will go around the heavy weight, like planets orbiting the sun (f