In [27]:
question = "Who is my favorite dog?"
documents = {
    "I have a dog named Kuper and Browner",
    "I really like Kuper a lot since he is the first dog that I have"
}

import os
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [28]:
import tiktoken

# o200k_base for latest gpt models
# o200k_harmony for gpt-oss models

def get_num_tokens(input: str, encoding_name: str) -> int:
    encoder = tiktoken.get_encoding(encoding_name=encoding_name)
    num_tokens = (encoder.encode(input))
    print(num_tokens)
    num_tokens = len(num_tokens)
    return num_tokens


print(get_num_tokens(question, "o200k_base"))
for doc in documents:
    print(get_num_tokens(doc, "o200k_base"))

[20600, 382, 922, 8340, 6446, 30]
6
[40, 679, 261, 6446, 11484, 658, 5813, 326, 58223, 1247]
10
[40, 2715, 1299, 658, 5813, 261, 3261, 3630, 501, 382, 290, 1577, 6446, 484, 357, 679]
16


In [42]:
from langchain_openai import OpenAIEmbeddings
embed = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

In [55]:

import os
import numpy as np

question = "Who is the president of the Philippines?"

doc_vectors = [embed.embed_query(i) for i in documents]
query_vector = embed.embed_query(question)

def cosine_similarity(x: list[float], y: list[float]) -> float:
    """Calculate the cosine similarity between two vectors.

    Args:
        x (list[float]): The document vector
        y (list[float]): The query vector

    Returns:
        float: The cosine similarity between the document and query vectors
    """
    norm_x = x / np.linalg.norm(x)
    print("Norm x: ", norm_x)
    norm_y = y / np.linalg.norm(y)
    dot_product = np.dot(norm_x, norm_y)
    return float(dot_product)


for i in doc_vectors:
    print(cosine_similarity(i, query_vector))

Norm x:  [-5.98798752e-03 -3.43773172e-03  1.94650837e-02 ... -7.52210028e-03
 -9.57273826e-05 -1.86468907e-02]
0.684179821815596
Norm x:  [-0.00517023 -0.00369865 -0.0001241  ... -0.0016933   0.00098872
 -0.00243237]
0.6983064718768973


In [56]:
import numpy as np

def cosine_similarity(x: list[float], y: list[float]) -> float:
    dot_product = np.dot(x, y)
    norm_x = np.linalg.norm(x)
    norm_y = np.linalg.norm(y)
    return float(dot_product / (norm_x * norm_y))

print(len(doc_vectors))
for i in doc_vectors:
    print(cosine_similarity(i, query_vector))

2
0.6841798218155957
0.6983064718768973


In [65]:
from langchain_text_splitters import CharacterTextSplitter, TokenTextSplitter

chunker = TokenTextSplitter.from_tiktoken_encoder(
    encoding_name="o200k_base",
    chunk_size=3,
    chunk_overlap=2
)

document = []
for i, doc in enumerate(documents):
    results = {
        "document_number" : i,
        "chunks": chunker.split_text(doc)
    }
    document.append(results)
    
print(document)

[{'document_number': 0, 'text': ['I have a', ' have a dog', ' a dog named', ' dog named K', ' named Kuper', ' Kuper and', 'uper and Brow', ' and Browner']}, {'document_number': 1, 'text': ['I really like', ' really like K', ' like Kuper', ' Kuper a', 'uper a lot', ' a lot since', ' lot since he', ' since he is', ' he is the', ' is the first', ' the first dog', ' first dog that', ' dog that I', ' that I have']}]
