In [57]:
from colorama import Fore, Style

def print_message(message_type, message):
    if message_type == "INFO":
        print(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} {message}")
    elif message_type == "ERROR":
        print(f"{Fore.RED}[ERROR]{Style.RESET_ALL} {message}")
    elif message_type == "SUCCESS":
        print(f"{Fore.GREEN}[SUCESS]{Style.RESET_ALL} {message}")
    else:
        print(f"{message}")

print_message("INFO", "This is an info")
print_message("ERROR", "This is an error")

[33m[INFO][0m This is an info
[31m[ERROR][0m This is an error


In [58]:
import os
import requests
from tqdm import tqdm

In [59]:
def download_file_with_progress(url, output_path):
   
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        total_size = int(response.headers.get('content-length', 0))
        with open(output_path, 'wb') as file, tqdm(
            desc=output_path,
            total=total_size,
            unit='B',
            unit_scale=True,
            unit_divisor=1024,
        ) as progress_bar:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    file.write(chunk)
                    progress_bar.update(len(chunk))
        print_message("SUCCESS", "File has been successfully downloaded.")
    else:
        print_message("ERROR", f"Something went wrong. Status code: {response.status_code}")

In [60]:
def insert_pdf_file(pdf_file_name: str) -> str:
    if pdf_file_name[-4:] != ".pdf":
        pdf_file_name += ".pdf"
    if not os.path.exists(pdf_file_name):
        print_message("INFO", "File doesn't exist, Insert Url here")
        url = input(">")

        download_file_with_progress(url, pdf_file_name)
    else:
        print_message("SUCCESS", "The file already exists")
        return pdf_file_name

In [61]:
pdf_file_name = "Pattern Recognition and Machine - Christopher M. Bishop"
pdf_file_name = insert_pdf_file(pdf_file_name=pdf_file_name)

[32m[SUCESS][0m The file already exists


In [62]:
import fitz
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
    cleaned_text = text.replace('\n', ' ').strip()
    return cleaned_text

def open_and_read_pdf(pdf_file_name: str) -> list[dict]:
    doc = fitz.open(pdf_file_name)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text=text)
        pages_and_texts.append({
                "page_number": page_number + 1,
                "page_char_count": len(text),
                "page_word_count": len(text.split(' ')),
                "page_sentence_count_raw": len(text.split(". ")),
                "page_token_count": len(text) / 4,
                "text": text
        })
    return pages_and_texts

In [63]:
pages_and_texts = open_and_read_pdf(pdf_file_name=pdf_file_name)

0it [00:00, ?it/s]

In [65]:
def store_and_embed_pdf_file(pdf_file_name: str):
    pdf_file_name = insert_pdf_file(pdf_file_name=pdf_file_name)
    pages_and_texts = open_and_read_pdf(pdf_file_name_pdf_file_name)
    

In [66]:
import random 

random.sample(pages_and_texts, k=3)

[{'page_number': 580,
  'page_char_count': 3050,
  'page_word_count': 493,
  'page_sentence_count_raw': 27,
  'page_token_count': 762.5,
  'text': "560 12. CONTINUOUS LATENT VARIABLES Figure 12.1 A synthetic data sel obtained by taking one of the off-line digit images and creating multi- ple copies in each of which the digit has undergone a random displacement and rotation within some larger image field. The resulting images each have 100 )( 100 = 10.000 pixels. that the manifold will be nonlinear because. for instance. if we translate the digit past a particular pixel, that pixel value will go from zero (white) 10 one (black) and back to zero again. which is clearly a nonlinear function of the digit position. In this example. !.he lranslation and rotation parameters are latent variables because we observe only the image vectors and are not told which values of the translation or rotation variables were used to create them. For real digit image data, there will be a funher degree of fr

## Get some more info on the data of the book

In [67]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,1,0,1,1,0.0,
1,2,87,12,4,21.75,Information Science and Statistics Series Edit...
2,3,928,128,12,232.0,Information Science and Statistics Akaike and...
3,4,62,8,2,15.5,Christopher M. Bishop Pattern Recognition and ...
4,5,1468,212,10,367.0,Christopher M. Bishop F.R.Eng. Assistant Direc...


In [68]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,758.0,758.0,758.0,758.0,758.0
mean,379.5,2244.17,391.68,27.48,561.04
std,218.96,699.94,125.23,75.02,174.98
min,1.0,0.0,1.0,1.0,0.0
25%,190.25,1846.0,329.0,14.0,461.5
50%,379.5,2260.5,398.5,17.0,565.12
75%,568.75,2734.0,466.75,21.0,683.5
max,758.0,3828.0,1046.0,836.0,957.0


## Splitting pages into sentences

In [69]:
from spacy.lang.en import English

nlp = English()

nlp.add_pipe("sentencizer")

doc = nlp("This is a sentence. This is another sentence. I like this.")
assert len(list(doc.sents)) == 3

list(doc.sents)


[This is a sentence., This is another sentence., I like this.]

In [70]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item['text']).sents)

    item['sentences'] = [str(sentence) for sentence in item["sentences"]]
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/758 [00:00<?, ?it/s]

In [71]:
random.sample(pages_and_texts, k=1)

[{'page_number': 657,
  'page_char_count': 2328,
  'page_word_count': 379,
  'page_sentence_count_raw': 16,
  'page_token_count': 582.0,
  'text': '13.3. Linear Dynamical Systems 637 model for that particular observation. However, the latent variables {zn} are no longer treated as independent but now form a Markov chain. Because the model is represented by a tree-structured directed graph, inference problems can be solved efﬁciently using the sum-product algorithm. The forward re- cursions, analogous to the α messages of the hidden Markov model, are known as the Kalman ﬁlter equations (Kalman, 1960; Zarchan and Musoff, 2005), and the back- ward recursions, analogous to the β messages, are known as the Kalman smoother equations, or the Rauch-Tung-Striebel (RTS) equations (Rauch et al., 1965). The Kalman ﬁlter is widely used in many real-time tracking applications. Because the linear dynamical system is a linear-Gaussian model, the joint distri- bution over all variables, as well as all 

In [72]:
df = pd.DataFrame(pages_and_texts)
df.describe().round()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,758.0,758.0,758.0,758.0,758.0,758.0
mean,380.0,2244.0,392.0,27.0,561.0,17.0
std,219.0,700.0,125.0,75.0,175.0,12.0
min,1.0,0.0,1.0,1.0,0.0,0.0
25%,190.0,1846.0,329.0,14.0,462.0,12.0
50%,380.0,2260.0,398.0,17.0,565.0,16.0
75%,569.0,2734.0,467.0,21.0,684.0,19.0
max,758.0,3828.0,1046.0,836.0,957.0,94.0


### Making chunks from the text 

In [73]:
num_sentence_chunk_size = 10 

def split_list(input_list: list, slice_size: int =num_sentence_chunk_size) -> list[str]:
    return [input_list[i:i+slice_size + 1] for i in range(0, len(input_list), slice_size)]

In [74]:
tl = list(range(25))
split_list(tl)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
 [20, 21, 22, 23, 24]]

In [75]:
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                        slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/758 [00:00<?, ?it/s]

In [76]:
random.sample(pages_and_texts, k=2)

[{'page_number': 187,
  'page_char_count': 1732,
  'page_word_count': 330,
  'page_sentence_count_raw': 13,
  'page_token_count': 433.0,
  'text': '3.5. The Evidence Approximation 167 where M is the dimensionality of w, and we have deﬁned E(w) = βED(w) + αEW (w) = β 2 ∥t −Φw∥2 + α 2 wTw. (3.79) We recognize (3.79) as being equal, up to a constant of proportionality, to the reg- ularized sum-of-squares error function (3.27). We now complete the square over w Exercise 3.18 giving E(w) = E(mN) + 1 2(w −mN)TA(w −mN) (3.80) where we have introduced A = αI + βΦTΦ (3.81) together with E(mN) = β 2 ∥t −ΦmN∥2 + α 2 mT NmN. (3.82) Note that A corresponds to the matrix of second derivatives of the error function A = ∇∇E(w) (3.83) and is known as the Hessian matrix. Here we have also deﬁned mN given by mN = βA−1ΦTt. (3.84) Using (3.54), we see that A = S−1 N , and hence (3.84) is equivalent to the previous deﬁnition (3.53), and therefore represents the mean of the posterior distribution. The integr

In [77]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,758.0,758.0,758.0,758.0,758.0,758.0,758.0
mean,379.5,2244.17,391.68,27.48,561.04,17.08,2.17
std,218.96,699.94,125.23,75.02,174.98,12.09,1.25
min,1.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,190.25,1846.0,329.0,14.0,461.5,12.0,2.0
50%,379.5,2260.5,398.5,17.0,565.12,16.0,2.0
75%,568.75,2734.0,466.75,21.0,683.5,19.0,2.0
max,758.0,3828.0,1046.0,836.0,957.0,94.0,10.0


In [78]:
import re

pages_and_chunks = []
for i in tqdm(pages_and_texts):
    for sentence_chunk in i['sentence_chunks']:
        chunk_dict = {}
        chunk_dict["page_number"] = i["page_number"]
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)
        chunk_dict["sentence_chunk"] = joined_sentence_chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4
        pages_and_chunks.append(chunk_dict)
    
len(pages_and_chunks)

  0%|          | 0/758 [00:00<?, ?it/s]

1645

In [79]:
pages_and_chunks

[{'page_number': 2,
  'sentence_chunk': 'Information Science and Statistics Series Editors: M. Jordan J. Kleinberg B. Scho¨lkopf',
  'chunk_char_count': 87,
  'chunk_word_count': 12,
  'chunk_token_count': 21.75},
 {'page_number': 3,
  'sentence_chunk': 'Information Science and Statistics Akaike and Kitagawa: The Practice of Time Series Analysis. Bishop: Pattern Recognition and Machine Learning. Cowell, Dawid, Lauritzen, and Spiegelhalter: Probabilistic Networks and Expert Systems. Doucet, de Freitas, and Gordon: Sequential Monte Carlo Methods in Practice. Fine: Feedforward Neural Network Methodology. Hawkins and Olwell: Cumulative Sum Charts and Charting for Quality Improvement. Jensen: Bayesian Networks and Decision Graphs. Marchette: Computer Intrusion Detection and Network Monitoring: A Statistical Viewpoint. Rubinstein and Kroese: The Cross-Entropy Method: A Unified Approach to Combinatorial Optimization, Monte Carlo Simulation, and Machine Learning. Studený: Probabilistic Conditi

In [80]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 648,
  'sentence_chunk': '628 13. SEQUENTIAL DATA From the product rule, we then have p(x1, . . . ,xn) = n \x0e m=1 cm (13.57) and so α(zn) = p(zn|x1, . . . ,xn)p(x1, . . . ,xn) = \x16 n \x0e m=1 cm \x17 \x01α(zn). (13.58) We can then turn the recursion equation (13.36) for α into one for \x01α given by cn\x01α(zn) = p(xn|zn) \x02 zn−1 \x01α(zn−1)p(zn|zn−1). (13.59) Note that at each stage of the forward message passing phase, used to evaluate \x01α(zn), we have to evaluate and store cn, which is easily done because it is the coefﬁcient that normalizes the right-hand side of (13.59) to give \x01α(zn). We can similarly deﬁne re-scaled variables \x01β(zn) using β(zn) = \x16 N \x0e m=n+1 cm \x17 \x01β(zn) (13.60) which will again remain within machine precision because, from (13.35), the quan- tities \x01β(zn) are simply the ratio of two conditional probabilities \x01β(zn) = p(xn+1, . . . ,xN|zn) p(xn+1, . . . ,xN|x1, . . . ,xn). (',
  'chunk_char_count': 862,
  'chunk_wo

In [81]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1645.0,1645.0,1645.0,1645.0
mean,404.13,1104.57,191.93,276.14
std,228.61,580.9,101.72,145.22
min,2.0,3.0,1.0,0.75
25%,208.0,536.0,89.0,134.0
50%,412.0,1196.0,209.0,299.0
75%,605.0,1576.0,272.0,394.0
max,758.0,2596.0,446.0,649.0


## Filter out chunks with less than 20 tokens

In [82]:
min_token_length = 20
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f"Chunk token count: {row[1]['chunk_token_count']} | Text: {row[1]['sentence_chunk']}")

Chunk token count: 1.25 | Text: 8.45)
Chunk token count: 0.75 | Text: 173
Chunk token count: 10.25 | Text: C.3) Also we have AT−1 = A−1T (C.4) 695
Chunk token count: 16.25 | Text: N, as illustrated in Figure 2.17. We can average the vectors {xn}
Chunk token count: 13.25 | Text: 1. Choose an initial setting for the parameters θold.


In [85]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")

In [86]:
random.sample(pages_and_chunks_over_min_token_len, k=1)

[{'page_number': 207,
  'sentence_chunk': '4.1. Discriminant Functions 187 −6 −4 −2 0 2 4 6 −6 −4 −2 0 2 4 6 −6 −4 −2 0 2 4 6 −6 −4 −2 0 2 4 6 Figure 4.5 Example of a synthetic data set comprising three classes, with training data points denoted in red (×), green (+), and blue (◦). Lines denote the decision boundaries, and the background colours denote the respective classes of the decision regions. On the left is the result of using a least-squares discriminant. We see that the region of input space assigned to the green class is too small and so most of the points from this class are misclassiﬁed. On the right is the result of using logistic regressions as described in Section 4.3.2 showing correct classiﬁcation of the training data.dimensional input vector x and project it down to one dimension using y = wTx. (4.20) If we place a threshold on y and classify y ⩾−w0 as class C1, and otherwise class C2, then we obtain our standard linear classiﬁer discussed in the previous section. In 

## Embedding chunks

In [None]:
test_sentences = ["This is a test for the embedding model",
                 "this is a second sentence for the model",
                 "The sky is blue"]

In [None]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                     device="cuda")


embeddings = embedding_model.encode(test_sentences,
                                    batch_size=32,
                                    convert_to_tensor=True)


In [None]:
sentences_and_embeddings = dict(zip(test_sentences, embeddings))

In [None]:
sentences_and_embeddings

In [87]:
%%time
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/1608 [00:00<?, ?it/s]

CPU times: user 2min 30s, sys: 2.3 s, total: 2min 32s
Wall time: 38.4 s


# TODO
## Creating a chromadb client for storing embeddings

> **NOTE** maybe not
>
> [INFO] Time taken to get scores on 1765000 embeddings: 0.00286 seconds.

In [None]:
import chromadb
from chromadb.utils import embedding_functions
chroma_client = chromadb.Client()

In [None]:
collection = chroma_client.create_collection(name="embeddings")

In [None]:
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

In [None]:
# results = collection.query(
#     query_texts=["This is a query document about hawaii"], # Chroma will embed this for you
#     n_results=2 # how many results to return
# )
# print(results)

In [None]:
%%time

text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]
random.sample(text_chunks, k=3)

## adding the embeddings to the chunks

In [88]:
pages_and_chunks_over_min_token_len[:4]

[{'page_number': 2,
  'sentence_chunk': 'Information Science and Statistics Series Editors: M. Jordan J. Kleinberg B. Scho¨lkopf',
  'chunk_char_count': 87,
  'chunk_word_count': 12,
  'chunk_token_count': 21.75,
  'embedding': array([-3.09493840e-02, -1.05464682e-02, -4.61412333e-02, -7.72602903e-03,
         -5.76722100e-02, -8.91559571e-03,  7.67798573e-02,  1.60053223e-02,
         -4.95475419e-02, -1.26838880e-02,  3.45496796e-02,  1.07824635e-02,
          4.47062664e-02,  4.02125530e-02,  1.94900054e-02, -3.72747332e-02,
          1.50199560e-02, -9.41928010e-03, -4.97012213e-03,  9.32618044e-03,
         -5.36001883e-02, -2.54126098e-02,  9.80052445e-03,  4.50976230e-02,
         -1.16578154e-02,  9.92208254e-03,  1.06096398e-02, -2.87685916e-03,
          2.36121304e-02, -9.50062498e-02,  2.34247390e-02,  5.50590232e-02,
          1.83683969e-02,  2.60935607e-03,  2.28428007e-06, -3.59321870e-02,
         -8.93514324e-03,  2.49875784e-02,  6.09413907e-03,  4.37932424e-02,
    

In [91]:
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

In [93]:
# Embed all the text in batches
start_time = timer()
text_chunks_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32,
                                               convert_to_tensor=True)
end_time = timer()
print(end_time - start_time)
text_chunks_embeddings

18.21227113999339


tensor([[-0.0309, -0.0105, -0.0461,  ..., -0.0325, -0.0397, -0.0037],
        [-0.0338,  0.0253, -0.0452,  ...,  0.0210, -0.0232, -0.0210],
        [-0.0350,  0.0008, -0.0244,  ...,  0.0431, -0.0605, -0.0090],
        ...,
        [ 0.0285, -0.0569, -0.0206,  ..., -0.0556, -0.0315, -0.0363],
        [-0.0350, -0.0538, -0.0051,  ..., -0.0569, -0.0199, -0.0539],
        [-0.0362, -0.0237, -0.0083,  ..., -0.0173, -0.0344, -0.0443]],
       device='cuda:0')

### Saving embeddings into a file (Temp)

In [95]:
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = f"embeddings/{pdf_file_name}.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [None]:
# reading the csv 

text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

In [None]:
text_chunks_and_embedding_df_load["sentence_chunk"].iloc[357]

# Rag - Search and Answer

### Similarity search

In [1]:
from colorama import Fore, Style

def print_message(message_type, message):
    if message_type == "INFO":
        print(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} {message}")
    elif message_type == "ERROR":
        print(f"{Fore.RED}[ERROR]{Style.RESET_ALL} {message}")
    elif message_type == "SUCCESS":
        print(f"{Fore.GREEN}[SUCESS]{Style.RESET_ALL} {message}")
    else:
        print(f"{message}")

print_message("INFO", "This is an info")
print_message("ERROR", "This is an error")

[33m[INFO][0m This is an info
[31m[ERROR][0m This is an error


In [2]:
import random

import torch
import numpy as np
import pandas as pd
from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Import texts and embedding df
text_chunks_and_embedding_df = pd.read_csv("embeddings/Hands-On Machine Learning With - Aurelien Geron.pdf.csv")

# Convert embedding column back to np.array (it got converted to string when it got saved to CSV)
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# Convert texts and embedding df to list of dicts
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Convert embeddings to torch tensor and send to device (note: NumPy arrays are float64, torch tensors are float32 by default)
embeddings = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to(device)

In [4]:
embeddings

tensor([[-0.0279, -0.0017, -0.0482,  ...,  0.0007, -0.0182, -0.0057],
        [ 0.0249,  0.0614, -0.0523,  ...,  0.0169, -0.0166, -0.0051],
        [ 0.0097, -0.0224, -0.0410,  ..., -0.0104, -0.0108, -0.0204],
        ...,
        [ 0.0017,  0.0756, -0.0428,  ...,  0.0299,  0.0373, -0.0241],
        [-0.0071,  0.0351, -0.0043,  ...,  0.0399,  0.0262, -0.0321],
        [ 0.0198,  0.0750, -0.0211,  ...,  0.0131, -0.0042, -0.0147]],
       device='cuda:0')

In [5]:
# creeating the model, this is just used if you havent already run the model above

from sentence_transformers import util, SentenceTransformer


embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", 
                                      device=device)

## semantic search pipeline


1. Define a query string.
2. Turn the query string into an embedding
3. Perform a dot product or cosine similarity function between the text embedding and the query embedding
4. Sort the results from k in descending order

In [6]:
query = "What is ridge regression"
print(f"Query: {query}")

# embed query
query_embedding = embedding_model.encode(query, convert_to_tensor=True)


# Get similarity scores with dot product (use cosine similarity if outputs are not normalized)

from time import perf_counter as timer

start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()

print_message("INFO", f"Time taken to get scores on {len(embeddings)} embeddings: {end_time - start_time:.5f} seconds.")

# 4 get top-k results

top_results_dot_product = torch.topk(dot_scores, k=100)
top_results_dot_product

Query: What is ridge regression
[33m[INFO][0m Time taken to get scores on 1765 embeddings: 0.00046 seconds.


torch.return_types.topk(
values=tensor([0.7175, 0.6307, 0.6103, 0.5763, 0.5352, 0.5195, 0.5078, 0.4492, 0.4425,
        0.4396, 0.4374, 0.4062, 0.4033, 0.4011, 0.3949, 0.3927, 0.3879, 0.3842,
        0.3812, 0.3735, 0.3730, 0.3730, 0.3713, 0.3667, 0.3665, 0.3659, 0.3605,
        0.3584, 0.3571, 0.3552, 0.3549, 0.3542, 0.3535, 0.3526, 0.3486, 0.3475,
        0.3468, 0.3438, 0.3430, 0.3423, 0.3415, 0.3402, 0.3396, 0.3361, 0.3328,
        0.3327, 0.3326, 0.3321, 0.3318, 0.3262, 0.3256, 0.3254, 0.3240, 0.3233,
        0.3217, 0.3209, 0.3197, 0.3193, 0.3192, 0.3190, 0.3183, 0.3182, 0.3158,
        0.3149, 0.3141, 0.3110, 0.3094, 0.3072, 0.3068, 0.3060, 0.3059, 0.3057,
        0.3053, 0.3015, 0.3012, 0.3012, 0.3012, 0.2999, 0.2991, 0.2980, 0.2968,
        0.2956, 0.2940, 0.2934, 0.2933, 0.2925, 0.2922, 0.2919, 0.2918, 0.2905,
        0.2904, 0.2903, 0.2901, 0.2893, 0.2893, 0.2889, 0.2877, 0.2874, 0.2870,
        0.2868], device='cuda:0'),
indices=tensor([ 309,  311,  312,  310,  315,  313,  

In [7]:
text_chunks_and_embedding_df["sentence_chunk"].iloc[309]



In [8]:
dot = torch.dot(embeddings[309], query_embedding)
print(f"just dot prod {dot:.4f}")
dot = dot / (torch.sqrt(torch.sum(embeddings[309] ** 2)) *  torch.sqrt(torch.sum(query_embedding ** 2))) 
print(f"cosine similarity {dot:.4f}")


just dot prod 0.7175
cosine similarity 0.7175


In [9]:
 (torch.sqrt(torch.sum(embeddings[309] ** 2)) *  torch.sqrt(torch.sum(query_embedding ** 2))) 

tensor(1., device='cuda:0')

## Testing $ \times 1000 $ embeddings

In [12]:
larger_embeddings = torch.rand(1000*embeddings.shape[0], 768).to(device)
print(f'Embeddings shape {larger_embeddings.shape}')

Embeddings shape torch.Size([1765000, 768])


In [13]:
start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=larger_embeddings)[0]
end_time = timer()
# print(dot_scores.shape)

print_message("INFO", f"Time taken to get scores on {len(larger_embeddings)} embeddings: {end_time - start_time:.5f} seconds.")


top_results_dot_product = torch.topk(dot_scores, k=100)
top_results_dot_product

[33m[INFO][0m Time taken to get scores on 1765000 embeddings: 0.04798 seconds.


torch.return_types.topk(
values=tensor([1.5873, 1.5229, 1.5146, 1.4652, 1.4617, 1.4296, 1.4242, 1.4206, 1.4156,
        1.4123, 1.4111, 1.4101, 1.4052, 1.4035, 1.3984, 1.3937, 1.3915, 1.3890,
        1.3879, 1.3864, 1.3861, 1.3799, 1.3732, 1.3731, 1.3730, 1.3700, 1.3672,
        1.3601, 1.3565, 1.3551, 1.3544, 1.3531, 1.3508, 1.3486, 1.3478, 1.3467,
        1.3457, 1.3407, 1.3392, 1.3281, 1.3271, 1.3246, 1.3238, 1.3220, 1.3217,
        1.3217, 1.3212, 1.3209, 1.3201, 1.3179, 1.3173, 1.3154, 1.3139, 1.3117,
        1.3085, 1.3069, 1.3063, 1.3061, 1.3059, 1.3037, 1.3030, 1.3022, 1.2983,
        1.2965, 1.2956, 1.2928, 1.2925, 1.2925, 1.2913, 1.2913, 1.2913, 1.2902,
        1.2890, 1.2883, 1.2874, 1.2873, 1.2856, 1.2839, 1.2839, 1.2834, 1.2832,
        1.2826, 1.2826, 1.2824, 1.2824, 1.2823, 1.2821, 1.2820, 1.2815, 1.2805,
        1.2795, 1.2791, 1.2782, 1.2781, 1.2777, 1.2759, 1.2759, 1.2747, 1.2744,
        1.2744], device='cuda:0'),
indices=tensor([ 937417,  234117,  778668,  845525,  

### Implementing a Re-Ranker

- Re-rank the top k=100 results
- Select the top=5 results

In [10]:
top_results_dot_product[1]
top_k_chunks = [text_chunks_and_embedding_df["sentence_chunk"].iloc[int(i)] for i in top_results_dot_product[1]]
# top_k_chunks = [i for i in top_results_dot_product[1]]

In [11]:
top_k_chunks[:5]

 'PolynomialFeatures(degree=10), then it is scaled using a StandardScaler, andfinally the ridge models are applied to the resulting features: this ispolynomial regression with ridge regularization. Note how increasing α leadsto flatter (i.e., less extreme, more reasonable) predictions, thus reducing themodel’s variance but increasing its bias. Figure 4-17. Linear (left) and a polynomial (right) models, both with various levels of ridgeregularizationAs with linear regression, we can perform ridge regression either bycomputing a closed-form equation or by performing gradient descent. Thepros and cons are the same. Equation 4-9 shows the closed-form solution,where A is the (n + 1) × (n + 1) identity matrix,\u2060 except with a 0 in the top-left cell, corresponding to the bias term. Equation 4-9. Ridge regression closed-form solutionθ ^ = (X ⊺ X+αA) -1  X ⊺  yHere is how to perform ridge regression with Scikit-Learn using a closed-form solution (a variant of Equation 4-9 that uses a matrix

In [12]:
from sentence_transformers import CrossEncoder

# Load the model, here we use our base sized model
model = CrossEncoder("mixedbread-ai/mxbai-rerank-base-v1")

In [13]:
results = model.rank(query, top_k_chunks, return_documents=True, top_k=5)

In [14]:
results

[{'corpus_id': 0,
  'score': np.float32(0.9881362),
 {'corpus_id': 1,
  'score': np.float32(0.8694347),
  'text': 'PolynomialFeatures(degree=10), then it is scaled using a StandardScaler, andfinally the ridge models are applied to the resulting features: this ispolynomial regression with ridge regularization. Note how increasing α leadsto flatter (i.e., less extreme, more reasonable) predictions, thus reducing themodel’s variance but increasing its bias. Figure 4-17. Linear (left) and a polynomial (right) models, both with various levels of ridgeregularizationAs with linear regression, we can perform ridge regression either bycomputing a closed-form equation or by performing gradient descent. Thepros and cons are the same. Equation 4-9 shows the closed-form solution,where A is the (n + 1) × (n + 1) identity matrix,\u2060 except with a 0 in the top-left cell, corresponding to the bias term. Equation 4-9. Ridge regression closed-form solutionθ ^ = (X ⊺ X+αA) -1  X ⊺  yHere is how to perf

### Functionizing the semantic pipeline

In [15]:
def retrieve_relevant_resources(query: str,
                              embeddings: torch.tensor,
                              model: SentenceTransformer=embedding_model,
                              n_resources_to_return: int=5,
                              print_time: bool=True):
    """
    Embeds the query with a model and returns the top k scores and indices from the embeddings.
    """
    
    query_embedding = model.encode(query, convert_to_tensor=True)

    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()
    
    if print_time:
        print_message("INFO", f"Time taken to get scores on {len(embeddings)} embeddings: {end_time - start_time:.5f} seconds.")

    scores, indices = torch.topk(input=dot_scores,
                                k=n_resources_to_return)
    return scores, indices

def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict]=pages_and_chunks,
                                 n_resources_to_return: int=5):
    """
    Takes a query, retrieves most relevant resources and prints them out in descending order.

    Note: Requires pages_and_chunks to be formatted in a specific way (see above for reference).
    """
    
    scores, indices = retrieve_relevant_resources_torch(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)
    
    print(f"Query: {query}\n")
    print("Results:")
    # Loop through zipped together scores and indicies
    for score, index in zip(scores, indices):
        print(f"Score: {score:.4f}")
        # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
        print_wrapped(pages_and_chunks[index]["sentence_chunk"])
        # Print the page number too so we can reference the textbook further and check the results
        print(f"Page number: {pages_and_chunks[index]['page_number']}")
        print("\n")

In [16]:
retrieve_relevant_resources(query="ridge regression", embeddings=embeddings)

[33m[INFO][0m Time taken to get scores on 1765 embeddings: 0.00015 seconds.


(tensor([0.6800, 0.6272, 0.6144, 0.6123, 0.6080], device='cuda:0'),
 tensor([309, 312, 341, 311, 310], device='cuda:0'))

## Connecting to an LLM

In [17]:
# pip install bitsandbytes accelerate
from transformers.utils import is_flash_attn_2_available
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfigfrom transformers import TextStreamer

quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                        bnb_4bit_compute_dtype=torch.float16)
## Flash attention gpu

if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
    attn_implementation = "flash_attention_2"
else:
    attn_implementation = "sdpa"
print_message("INFO", f"Using attention implementation: {attn_implementation}")


tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-2b-it",
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
    attn_implementation=attn_implementation
)

[33m[INFO][0m Using attention implementation: sdpa


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [18]:
model

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2SdpaAttention(
          (q_proj): Linear4bit(in_features=2304, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2304, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=2304, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2304, bias=False)
          (rotary_emb): Gemma2RotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear4bit(in_features=2304, out_features=9216, bias=False)
          (up_proj): Linear4bit(in_features=2304, out_features=9216, bias=False)
          (down_proj): Linear4bit(in_features=9216, out_features=2304, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (pre_feedforward_layern

#### Getting numbers of parameters of the LLM

In [20]:
def get_model_num_params(model: torch.nn.Module):
    return sum([param.numel() for param in model.parameters()])

print(get_model_num_params(model) / 1000000000)

1.602203904


In [21]:
def get_model_mem_size(model: torch.nn.Module):
    """
    Get how much memory a PyTorch model takes up.

    See: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822
    """
    # Get model parameters and buffer sizes
    mem_params = sum([param.nelement() * param.element_size() for param in model.parameters()])
    mem_buffers = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])

    # Calculate various model sizes
    model_mem_bytes = mem_params + mem_buffers # in bytes
    model_mem_mb = model_mem_bytes / (1024**2) # in megabytes
    model_mem_gb = model_mem_bytes / (1024**3) # in gigabytes

    return {"model_mem_bytes": model_mem_bytes,
            "model_mem_mb": round(model_mem_mb, 2),
            "model_mem_gb": round(model_mem_gb, 2)}

get_model_mem_size(model)

{'model_mem_bytes': 2192283136, 'model_mem_mb': 2090.72, 'model_mem_gb': 2.04}

In [22]:
input_text = "What is ridge regression"

In [30]:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it")

In [32]:
chat = [
    { "role": "user", "content": input_text },
]

In [33]:
question = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

question = tokenizer(question, return_tensors="pt").to(device)

streamer = TextStreamer(tokenizer, skip_prompt=True)

In [52]:
question["input_ids"].shape

torch.Size([1, 14])

In [34]:
_ = model.generate(**question, streamer=streamer,
                            pad_token_id=tokenizer.eos_token_id,
                            temperature=0.1,
                            max_length=2048,
                            do_sample=True,
                            top_p=0.5,
                            repetition_penalty=1.25)


The 'max_batch_size' argument of HybridCache is deprecated and will be removed in v4.46. Use the more precisely named 'batch_size' argument instead.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Ridge Regression, also known as Ridge regularization, is a statistical method used to perform linear regression analysis. It's an extension of ordinary least squares (OLS) that adds a penalty term to the cost function during optimization. This penalty term helps prevent overfitting by adding some "regularization" and making it less likely for coefficients to become too large in magnitude.

Here's how it works:

**1. The Problem:** 
   - Linear regression aims to find the best fit line or curve through data points using parameters like slope and intercept.  
   - However, with complex datasets containing many variables, these models can be prone to high variance and overly sensitive to noise.

**2. Introducing Regularization:**
    - **Regularization**: Instead of just minimizing error directly, we add a penalty term proportional to the square of the coefficient values. This prevents extreme parameter estimates from being pushed towards zero due to small fluctuations in training data.

