In [24]:
%pip install openai tiktoken faiss-cpu pypdf2 -q

Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import faiss
import tiktoken
from openai import OpenAI
import numpy as np
import dotenv
import openai

# Load variables from a .env file into the process environment; the OpenAI
# client created further down reads OPENAI_API_KEY through os.getenv.
dotenv.load_dotenv()

# Build a tokenizer
# Shared module-level encoder used by count_tokens and chunk_text below.
# NOTE(review): "cl100k_base" is presumably chosen to match the embedding model — confirm.
tokenizer = tiktoken.get_encoding("cl100k_base")


def count_tokens(text: str) -> int:
    """Return the number of tokens the module-level ``tokenizer`` yields for ``text``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)


# Chunk a document by tokens
def chunk_text(text: str, max_tokens=300, overlap=50):
    """Yield overlapping, token-bounded pieces of ``text``.

    The text is encoded with the module-level ``tokenizer``, cut into windows
    of at most ``max_tokens`` token ids, and each window is decoded back to a
    string. Consecutive windows share ``overlap`` tokens.

    Args:
        text: Document text to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < max_tokens``.

    Yields:
        Decoded chunk strings, in document order. Empty text yields nothing.

    Raises:
        ValueError: If ``max_tokens`` or ``overlap`` is out of range. Because
            this is a generator, the error surfaces on first iteration.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")
    # overlap == max_tokens would make the window step 0 (range() rejects it),
    # a larger overlap would make it negative (nothing yielded), and a negative
    # overlap would skip tokens between chunks.
    if not 0 <= overlap < max_tokens:
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    ids = tokenizer.encode(text=text)
    step = max_tokens - overlap

    for start in range(0, len(ids), step):
        yield tokenizer.decode(tokens=ids[start:start + max_tokens])
        # Once a window reaches the end of the document, any further window
        # would lie entirely inside the overlap already emitted; stop here so
        # no redundant tail chunk is produced.
        if start + max_tokens >= len(ids):
            break

In [26]:
import PyPDF2


def extract_text_from_pdf(pdf_file_path: str) -> str:
    """Extract the text of every page of a PDF, wrapped in per-page tags.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        One string in which each page's extracted text is enclosed as
        ``<Page-i>\\n...</Page-i>`` (``i`` is the zero-based page index) and
        followed by a blank line.
    """
    page_blocks = []

    # The reader is given the open file object, so all page extraction happens
    # inside the `with` block; the handle is closed even if extraction raises.
    with open(pdf_file_path, 'rb') as pdf_file_obj:
        pdf_reader = PyPDF2.PdfReader(pdf_file_obj)

        for i, page_obj in enumerate(pdf_reader.pages):
            page_blocks.append(f"<Page-{i}>\n{page_obj.extract_text()}</Page-{i}>\n\n")

    return "".join(page_blocks)


# Example doc --> knowledge base
# Read the sample PDF into one page-tagged string; the path is relative, so it
# resolves against the notebook's current working directory.
text = extract_text_from_pdf('./../data/raw/Vaibhav Sethia Senior Machine Learning Engineer.pdf')

In [34]:
# chunk_text returns a one-shot generator. Materialise it so that cells which
# iterate over `chunks` can be re-run without silently looping over nothing.
chunks = list(chunk_text(text))

In [35]:
dotenv.load_dotenv()

# Embed the chunks
# Length of each embedding vector; also used to size the FAISS index later.
embedding_dimensions = 1536
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Snapshot the chunks before calling the API. `chunks` may be a generator that
# was already consumed by an earlier run of this cell; failing here with a clear
# message beats building an empty index that breaks later.
chunks_list = list(chunks)
if not chunks_list:
    raise RuntimeError("No chunks to embed; re-run the chunking cell to rebuild `chunks`.")

# One embedding request per chunk; vectors[i] is the embedding of chunks_list[i].
vectors = []
for chunk in chunks_list:
    out = client.embeddings.create(model="text-embedding-ada-002", input=chunk)
    vectors.append(out.data[0].embedding)

In [36]:
# Convert the embeddings to a float32 array, the dtype FAISS expects, building
# it in one step instead of creating a float64 array and casting.
vectors = np.asarray(vectors, dtype="float32")

In [37]:
# Build an IndexFlatL2 index sized to the embedding dimension and add every
# chunk vector to it. Vectors are added in chunks_list order — presumably so the
# indices returned by a search map back to chunks_list positions; confirm.
index = faiss.IndexFlatL2(embedding_dimensions)
index.add(vectors)

In [38]:
# Prepare a query
question = "What work has been done by Vaibhav in Boltic Tables?"

# Embed the question with the same model that embedded the chunks.
question_vector = client.embeddings.create(model="text-embedding-ada-002", input=question).data[0].embedding

# Never ask for more neighbours than the index holds: FAISS pads the missing
# result slots with index -1. Keep k at least 1 so the search call stays valid.
top_k = max(1, min(3, index.ntotal))

# D - distances and I - indices
D, I = index.search(np.array([question_vector], dtype="float32"), k=top_k)

print(D, I)

[[0.37584132 0.37627792 0.4921595 ]] [[0 2 6]]


In [None]:
# FAISS reports -1 for result slots it could not fill. Skip those, otherwise
# chunks_list[-1] would silently inject the last chunk into the context.
context = "\n\n".join(chunks_list[i] for i in I[0] if i >= 0)

print(context)


# Construct prompt
# The fallback phrase is quoted on both sides (the closing quote was missing).
system = """You are a helpful assistant. Answer only from context. If unsure, say 'I don't know from the docs.'"""

# The system message sets the answering rules; the user message carries the
# retrieved context followed by the question.
messages = [{"role": "system", "content": system},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion:{question}\nAnswer:"}]

response = client.chat.completions.create(model="gpt-3.5-turbo", messages=messages, temperature=0.3)

In [None]:
# Show only the reply text of the first returned choice.
print(response.choices[0].message.content)