## Document Preprocessing and Embedding Creation

#### Using PDF:


In [8]:
import os 

pdf_path = "cuda_book.pdf"

if not os.path.exists(pdf_path):
    print(f'[INFO] File{pdf_path} doesn\'t exist')


In [14]:
import fitz 
from tqdm.auto import tqdm 

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
def text_formatter(text:str)->str:
    cleaned_text = text.replace("\n", " ").strip()

    return cleaned_text

In [22]:
def open_read_pdf(pdf_path:str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_texts = []

    for page_number, content in tqdm(enumerate(doc)):
        text = content.get_text()
        text = text_formatter(text=text)
        pages_texts.append({"page no":page_number-20,
                           "page_char_count": len(text),
                           "page_word_count" : len(text.split(" ")), 
                           "page_sentence_count_raw" : len(text.split(". ")), 
                           "page_token_count" : len(text)/4 , # 1 token = 4 characters
                           "text" : text })

    return pages_texts
        

In [23]:
pages_and_texts = open_read_pdf(pdf_path=pdf_path)
pages_and_texts[:3]

311it [00:00, 487.38it/s]


[{'page no': -20,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page no': -19,
  'page_char_count': 15,
  'page_word_count': 3,
  'page_sentence_count_raw': 1,
  'page_token_count': 3.75,
  'text': 'CUDA by Example'},
 {'page no': -18,
  'page_char_count': 34,
  'page_word_count': 5,
  'page_sentence_count_raw': 1,
  'page_token_count': 8.5,
  'text': 'This page intentionally left blank'}]

In [24]:
import random 

random.sample(pages_and_texts, k=2)

[{'page no': 189,
  'page_char_count': 1314,
  'page_word_count': 431,
  'page_sentence_count_raw': 4,
  'page_token_count': 328.5,
  'text': 'EEE LLL KKK DDD HHH SSSTTTM MM emory 189 emory After the 100 copies, clean up by freeing the host and GPU buffers as well as  destroying our timing events.     free( a );     HANDLE_ERROR( cudaFree( dev_a ) );      HANDLE_ERROR( cudaEventDestroy( start ) );     HANDLE_ERROR( cudaEventDestroy( stop ) );     return elapsedTime;  } If you didn’t notice, the function cuda_malloc_test() allocated pageable host  memory with the standard C malloc() routine. The pinned memory version  uses cudaHostAlloc() to allocate a page-locked buffer. float cuda_host_alloc_test( int size, bool up ) {     cudaEvent_t     start, stop;     int             *a, *dev_a;     float           elapsedTime;     HANDLE_ERROR( cudaEventCreate( &start ) );     HANDLE_ERROR( cudaEventCreate( &stop ) );     HANDLE_ERROR( cudaHostAlloc( (void**)&a,                                  s

In [25]:
import pandas as pd 

In [27]:
df = pd.DataFrame(pages_and_texts)
df.sample(4)

Unnamed: 0,page no,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
219,199,1450,268,8,362.5,usInG multIPle cudA streAms 199 tream s broke ...
223,203,1641,797,2,410.25,usInG multIPle cudA streAms 203 tream s //...
301,281,3052,439,5,763.0,"ndex 281 copy_constant_kernel(), computing te..."
299,279,2134,316,2,533.5,"279 Index A add() function, CPU vector sums, 4..."


In [29]:
df.describe().round(2)

Unnamed: 0,page no,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,311.0,311.0,311.0,311.0,311.0
mean,135.0,1549.69,332.66,12.72,387.42
std,89.92,666.5,135.34,35.82,166.63
min,-20.0,0.0,1.0,1.0,0.0
25%,57.5,1189.0,255.5,5.0,297.25
50%,135.0,1531.0,357.0,8.0,382.75
75%,212.5,1977.0,415.5,13.0,494.25
max,290.0,3329.0,798.0,499.0,832.25


## Why token counts?

- Embedding models don't deal with infinite tokens
- LLMs need finite tokens in their context window  

In [36]:
# get sentences count 
from spacy.lang.en import English

In [40]:
nlp = English()
nlp 

<spacy.lang.en.English at 0x1ad592a5810>

In [41]:
nlp.add_pipe("sentencizer") # to split sentences

<spacy.pipeline.sentencizer.Sentencizer at 0x1ad598911d0>

In [44]:
sentences = nlp("A dog eats bone. A cat eats rat. A rat eats grains.")
list(sentences.sents)

[A dog eats bone., A cat eats rat., A rat eats grains.]

In [46]:
for text in tqdm(pages_and_texts):
    text["sentences"] = list(nlp(text["text"]).sents)

    text["sentences"] = [str(sentence) for sentence in text["sentences"]]

    text["sentences_count_spacy"] = len(text["sentences"])

100%|█████████████████████████| 311/311 [00:00<00:00, 338.68it/s]


In [50]:
random.sample(pages_and_texts, k=1)

[{'page no': 271,
  'page_char_count': 1747,
  'page_word_count': 501,
  'page_sentence_count_raw': 9,
  'page_token_count': 436.75,
  'text': '271 able     if (count != ELEMENTS)         printf( “%d elements found in hash table.  Should be %ld\\n”,                 count, ELEMENTS );     else         printf( “All %d elements found in hash table.\\n”, count );     free( table.pool );     free( table.entries );  } Since we chose to reuse our CPU implementation of verify_table(), we need a  function to copy the table from GPU memory to host memory. There are three steps  to this function, two relatively obvious steps and a third, trickier step. The first two  steps involve allocating host memory for the hash table data and performing a copy  of the GPU data structures into this memory with cudaMemcpy(). We have done  this many times previously, so this should come as no surprise. void copy_table_to_host( const Table &table, Table &hostTable) {     hostTable.count = table.count;     hostTa

In [51]:
df = pd.DataFrame(pages_and_texts)
df.describe()

Unnamed: 0,page no,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,sentences_count_spacy
count,311.0,311.0,311.0,311.0,311.0,311.0
mean,135.0,1549.691318,332.655949,12.717042,387.42283,9.691318
std,89.922189,666.503734,135.338884,35.824532,166.625933,9.624627
min,-20.0,0.0,1.0,1.0,0.0,0.0
25%,57.5,1189.0,255.5,5.0,297.25,5.0
50%,135.0,1531.0,357.0,8.0,382.75,8.0
75%,212.5,1977.0,415.5,13.0,494.25,13.0
max,290.0,3329.0,798.0,499.0,832.25,113.0


#### Chunking

- splitting larger texts into smaller ones

In [71]:
# split size to turn group of sentences into chunks
chunk_size = 9

def split_text(input_list: list, 
              slice_size: int = chunk_size) -> list[list[str]]:
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

In [72]:
test_list = list(range(25))
split_text(test_list)

[[0, 1, 2, 3, 4, 5, 6, 7, 8],
 [9, 10, 11, 12, 13, 14, 15, 16, 17],
 [18, 19, 20, 21, 22, 23, 24]]

In [73]:
# chunking our pages
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_text(input_list=item["sentences"],
                                        slice_size = chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

100%|██████████████████████| 311/311 [00:00<00:00, 154169.55it/s]


In [74]:
random.sample(pages_and_texts, 1)

[{'page no': 190,
  'page_char_count': 1611,
  'page_word_count': 408,
  'page_sentence_count_raw': 8,
  'page_token_count': 402.75,
  'text': 'streAms 190             HANDLE_ERROR( cudaMemcpy( a, dev_a,                                   size * sizeof( *a ),                                   cudaMemcpyDeviceToHost ) );     }     HANDLE_ERROR( cudaEventRecord( stop, 0 ) );     HANDLE_ERROR( cudaEventSynchronize( stop ) );     HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,                                         start, stop ) );     HANDLE_ERROR( cudaFreeHost( a ) );     HANDLE_ERROR( cudaFree( dev_a ) );     HANDLE_ERROR( cudaEventDestroy( start ) );     HANDLE_ERROR( cudaEventDestroy( stop ) );     return elapsedTime;  } As you can see, the buffer allocated by cudaHostAlloc() is used in the same  way as a buffer allocated by malloc(). The other change from using malloc() lies in the last argument, the value cudaHostAllocDefault. This last argu- ment stores a collection of flags that

In [76]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(3)

Unnamed: 0,page no,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,sentences_count_spacy,num_chunks
count,311.0,311.0,311.0,311.0,311.0,311.0,311.0
mean,135.0,1549.691,332.656,12.717,387.423,9.691,1.55
std,89.922,666.504,135.339,35.825,166.626,9.625,1.064
min,-20.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,57.5,1189.0,255.5,5.0,297.25,5.0,1.0
50%,135.0,1531.0,357.0,8.0,382.75,8.0,1.0
75%,212.5,1977.0,415.5,13.0,494.25,13.0,2.0
max,290.0,3329.0,798.0,499.0,832.25,113.0,13.0


In [94]:
# splitting chunks into separate items and joining sentences together
import re

pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page no"]

        # join list of sentences into a paragraph
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" => ". A"

        
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        chunk_dict["chunk_character_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 


        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

100%|███████████████████████| 311/311 [00:00<00:00, 15065.82it/s]


482

In [97]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 261,
  'sentence_chunk': '261 able aardvark avocado aardvark avocado Figure A.3 Resolving the conflict when adding the word aardvark Armed with some background on the notions of a hash function and collision reso- lution, we’re ready to take a look at implementing our own hash table. A CPU HASH TABLE A.2.2 As described in the previous section, our hash table will consist of essentially two parts: a hash function and a data structure of buckets. Our buckets will be imple- mented exactly as before: We will allocate an array of length N, and each entry in the array holds a list of key/value pairs. Before concerning ourselves with a hash function, we will take a look at the data structures involved: #include "../common/book.h" struct Entry {   unsigned int  key;   void*      value;   Entry      *next; }; struct Table {   size_t count;   Entry  **entries;   Entry  *pool;   Entry  *firstFree; };',
  'chunk_character_count': 876,
  'chunk_word_count': 171,
  'chunk_token_coun

In [98]:
df = pd.DataFrame(pages_and_chunks)
df.describe()

Unnamed: 0,page_number,chunk_character_count,chunk_word_count,chunk_token_count
count,482.0,482.0,482.0,482.0
mean,122.460581,963.553942,178.643154,240.888485
std,93.459807,576.915757,103.398394,144.228939
min,-19.0,3.0,1.0,0.75
25%,39.25,578.0,111.25,144.5
50%,118.0,1055.0,187.0,263.75
75%,204.75,1309.5,247.75,327.375
max,290.0,3265.0,509.0,816.25


In [118]:
# chunks with lower tokens than 30
min_chunk_token = 30
for row in df[df['chunk_token_count'] <= min_chunk_token].sample(5).iterrows():
    print(f'Chunk token count:{row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count:8.5 | Text: This page intentionally left blank
Chunk token count:4.25 | Text: About the Authors
Chunk token count:8.5 | Text: This page intentionally left blank
Chunk token count:3.75 | Text: CUDA by Example
Chunk token count:4.5 | Text: . . . .11 Contents


In [119]:
pages_and_chunks_enough_tokens = df[df["chunk_token_count"] > min_chunk_token].to_dict(orient="records")
pages_and_chunks_enough_tokens[:2]

[{'page_number': -17,
  'sentence_chunk': 'CUDA by Example g JAson sAnders edwArd KAndrot Upper Saddle River, NJ • Boston • Indianapolis • San Francisco New York • Toronto • Montreal • London • Munich • Paris • Madrid Capetown • Sydney • Tokyo • Singapore • Mexico City',
  'chunk_character_count': 226,
  'chunk_word_count': 43,
  'chunk_token_count': 56.5},
 {'page_number': -16,
  'sentence_chunk': 'Many of the designations used by manufacturers and sellers to distinguish their products are claimed as trademarks. Where those designations appear in this book, and the publisher was aware of a trademark claim, the designations have been printed with initial capital letters or in all capitals. The authors and publisher have taken care in the preparation of this book, but make no expressed or implied warranty of any kind and assume no responsibility for errors or omissions. No liability is assumed for incidental or consequential damages in connection with or arising out of the use of the in

In [120]:
len(pages_and_chunks_enough_tokens)

426

### Embedding

- chunks to numbers

In [123]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path = 'all-mpnet-base-v2',
                                     device='cpu')

sentences = ["I am a person.",
            "I am from Nepal.",
            "I am from Earth."]

embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))





In [128]:
for sentence, embedding in embeddings_dict.items():
    print(f'Sentence: {sentence}')
    print(f'Embedding: {embedding.shape}')
    break 

Sentence: I am a person.
Embedding: (768,)


In [135]:
%%time

embedding_model.to('cpu')

for item in tqdm(pages_and_chunks_enough_tokens):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

100%|██████████████████████████| 426/426 [00:57<00:00,  7.36it/s]

CPU times: total: 7min 43s
Wall time: 58.2 s





In [136]:
%%time

embedding_model.to('cuda')

for item in tqdm(pages_and_chunks_enough_tokens):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

100%|██████████████████████████| 426/426 [00:07<00:00, 53.28it/s]

CPU times: total: 1min 4s
Wall time: 8.25 s





In [137]:
# creating batches
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_enough_tokens]

In [142]:
%%time
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                              batch_size=32,
                                              convert_to_tensor=True)

text_chunk_embeddings.shape

CPU times: total: 25.3 s
Wall time: 6.9 s


torch.Size([426, 768])

In [143]:
text_chunk_embeddings[412]

tensor([-6.5104e-04,  8.7808e-03, -2.3617e-02, -3.9672e-03, -4.7210e-02,
         2.3732e-02,  3.2628e-02,  1.6957e-02,  1.4056e-02,  4.8171e-03,
         3.8954e-02, -2.2313e-03,  1.6538e-02,  1.4528e-02, -2.6746e-02,
         3.1120e-03,  1.9372e-02,  8.3560e-03,  1.0736e-02, -5.7314e-03,
         3.8960e-02, -2.5745e-02,  2.3988e-02,  2.2226e-02, -1.6475e-02,
        -2.9316e-02, -2.3492e-02,  3.1946e-02,  4.3804e-02,  6.8989e-02,
        -6.8026e-02, -5.6173e-03,  1.9286e-03,  1.8211e-02,  2.1097e-06,
         5.0570e-02, -5.4959e-02,  2.4807e-02,  4.5095e-02, -2.7462e-02,
         4.5699e-02, -5.6143e-02, -1.8512e-02, -3.8574e-02, -1.4848e-02,
         2.0480e-02, -1.6504e-02,  1.6074e-02, -4.6710e-02, -6.5593e-02,
         8.8646e-03,  8.5863e-02, -1.9725e-02,  2.8828e-02, -2.8797e-02,
         3.2957e-02, -5.0934e-02,  1.3347e-02,  6.6375e-03, -6.0752e-02,
        -4.6561e-02,  5.3337e-02,  8.6391e-04, -3.8480e-02,  1.6685e-02,
         1.2802e-02, -1.1374e-02,  1.0626e-02,  6.6

#### Saving the embedding file

In [144]:
chunks_embeddings_df = pd.DataFrame(pages_and_chunks_enough_tokens)
df_path = "text_chunks_embeddings_df.csv"

chunks_embeddings_df.to_csv(df_path, index=False)