In [5]:
import time
import ollama
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
import tiktoken
from qdrant_client import QdrantClient
from qdrant_client.http import models
import os

CHUNK_SIZES = [500]
CHUNK_OVERLAPS = [50]
EMBEDDING_MODEL = "hkunlp/instructor-xl"
TOP_K = 3
QDRANT_HOST = "localhost"
QDRANT_PORT = 6333
COLLECTION_NAME = "notes"

qdrant_client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)

try:
    qdrant_client.get_collection(COLLECTION_NAME)
except Exception:
    qdrant_client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=models.VectorParams(
            size=768,  # Embedding size for hkunlp/instructor-xl
            distance=models.Distance.COSINE,
        ),
    )

embedding_model = SentenceTransformer(EMBEDDING_MODEL, device='cpu')

def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = "\n".join([page.extract_text() or "" for page in reader.pages])
    return text


def chunk_text(text, chunk_size, overlap):

    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)

    chunks = []
    for i in range(0, len(tokens), chunk_size - overlap):
        chunk = tokens[i:i + chunk_size]
        chunks.append(encoding.decode(chunk))

    return chunks


def embed_and_store_chunks(chunks, chunk_size, overlap):

    embeddings = embedding_model.encode(chunks, normalize_embeddings=True)


    points = [
        models.PointStruct(
            id=i,
            vector=embedding.tolist(),
            payload={
                "text": chunk,
                "chunk_size": chunk_size,
                "overlap": overlap,
            },
        )
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings))
    ]

    qdrant_client.upsert(
        collection_name=COLLECTION_NAME,
        points=points,
    )


def retrieve_relevant_chunks(query, top_k=TOP_K):

    query_embedding = embedding_model.encode([query], normalize_embeddings=True)[0]

    results = qdrant_client.search(
        collection_name=COLLECTION_NAME,
        query_vector=query_embedding.tolist(),
        limit=top_k,
    )

    return [hit.payload["text"] for hit in results] if results else []


def generate_response(model, context, query):

    prompt = f"Given the context:\n{context}\n\nAnswer the following question:\n{query}"

    response = ollama.chat(model=model, messages=[{"role": "user", "content": prompt}])

    return response["message"]["content"]



In [13]:
if __name__ == "__main__":
    for PDF_PATH in os.listdir("/Users/avneetsoni/Desktop/ds4300/43 midterm data"):
        print(PDF_PATH)
        if PDF_PATH.endswith(".pdf"):
            text = extract_text_from_pdf("/Users/avneetsoni/Desktop/ds4300/43 midterm data/"+PDF_PATH)
            print(f"Extracted {len(text)} characters from PDF\n")

        for chunk_size in CHUNK_SIZES:
            for overlap in CHUNK_OVERLAPS:
                chunks = chunk_text(text, chunk_size, overlap)

                start_time = time.time()
                embed_and_store_chunks(chunks, chunk_size, overlap)

B-trees.pdf
Extracted 4653 characters from PDF

04 - Data Replication.pdf
Extracted 6028 characters from PDF

.DS_Store
09 - Introduction to Graph Data Model.pdf
Extracted 3512 characters from PDF

module3-6.pdf
Extracted 31374 characters from PDF

02 - Foundations.pdf
Extracted 4458 characters from PDF

10 - Neo4j.pdf
Extracted 5097 characters from PDF

05 - NoSQL Intro + KV DBs.pdf
Extracted 10781 characters from PDF

_Modules 7-10 Notes.pdf
Extracted 14583 characters from PDF

07 - Document DBs and Mongo.pdf
Extracted 7038 characters from PDF

Data Structures.pdf
Extracted 13461 characters from PDF

ICS 46 Spring 2022, Notes and Examples_ AVL Trees.pdf
Extracted 23600 characters from PDF

04-B+Tree Walkthrough.pdf
Extracted 2088 characters from PDF

03 - Moving Beyond the Relational Model.pdf
Extracted 7867 characters from PDF

06 - Redis + Python.pdf
Extracted 3297 characters from PDF

C12-bst.pdf
Extracted 8820 characters from PDF

12.6. B-Trees — CS3 Data Structures & Algorithms.

In [14]:
while True:
    query = input("\nEnter your query (or type 'exit' to quit): ")
    if query.lower() == "exit":
        print("Exiting...")
        break

    retrieved_chunks = retrieve_relevant_chunks(query)

    if retrieved_chunks:

        response = generate_response("mistral", "\n".join(retrieved_chunks), query)
        print("\nMistral Response:\n", response)
    else:
        print("No relevant chunks found for the query.")


Enter your query (or type 'exit' to quit):  what is isolation property


  results = qdrant_client.search(



Mistral Response:
  The provided text does not discuss any specific "isolation property" in the context given. However, the term "Isolation Level" or simply "Isolation" is commonly used in databases to describe a property that prevents uncommitted data modifications from conflicting with other transactions. This is important in multi-user systems where multiple users can access and modify the same database concurrently.

In the context of this text, it seems there are no transactions or databases mentioned, so I would not infer that any isolation property is being discussed here. If you're referring to an "isolation property" from a different context, please provide more information for a better answer.



Enter your query (or type 'exit' to quit):  what is acid



Mistral Response:
  ACID stands for Atomicity, Consistency, Isolation, and Durability. It is a set of properties that ensure reliable database transactions by guaranteeing data integrity during and after database operations.

1. Atomicity - A transaction is an uninterruptible series of operations that are executed as a single, indivisible unit. Either all the operations in a transaction succeed or none of them do; if any part of the transaction fails, the entire transaction must be rolled back to its previous state before the transaction started.

2. Consistency - A database is consistent when it remains in a valid state after a transaction completes. A transaction should maintain and uphold all defined constraints and rules, ensuring that the data remains valid and accurate according to the business logic of the application.

3. Isolation - Each transaction should be isolated from other transactions so that one transaction does not interfere with the concurrent execution of another t


Enter your query (or type 'exit' to quit):  hortizontal vs vertical scalability



Mistral Response:
  Horizontal scalability refers to the ability of a system to handle more workload by adding more resources (such as servers or nodes) horizontally. In the context provided, horizontal scalability could refer to the ability of a database management system to scale up by adding more storage devices or machines to accommodate larger data sets and increased query rates.

Vertical scalability, on the other hand, refers to the addition of resources (such as memory, CPU, or storage capacity) to a single machine to increase its processing power. In the context of B-trees discussed in the passage, vertical scalability could be achieved by increasing the hardware specifications of the machine running the database management system to handle larger and more complex data structures like B+ trees, which require more computational resources for efficient search, insertion, and deletion operations.

However, it is important to note that, as mentioned in the passage, implementing m


Enter your query (or type 'exit' to quit):  what is a distributed system



Mistral Response:
  A Distributed System is a collection of independent computers that appear to its users as a single cohesive system. These computers communicate and coordinate with each other over a network to achieve a common goal or provide a specific service. In a distributed system, components located on the network interact with each other in order to deliver a service or perform a task that is beyond the capabilities of any single component. This concept allows for a more scalable, fault-tolerant, and high-performance system compared to a centralized system.

The key characteristics of distributed systems include:

1. Geographical distribution: Components are located in different geographical areas, often continents apart.
2. Heterogeneity: Components can be made up of different hardware, software, and network configurations.
3. Autonomy: Each component can make decisions independently based on the needs of the system as a whole.
4. Scalability: Distributed systems can handle


Enter your query (or type 'exit' to quit):  what is binary search tree



Mistral Response:
  A binary search tree (BST) is a type of tree data structure in which each node has at most two children, referred to as the left child and right child. The nodes are arranged in such a way that for any given node, all the keys in its left subtree are less than the key of the node itself, and all the keys in the right subtree are greater than the key of the node. This property allows for efficient search, insert, and delete operations.

In your provided context, it appears you're discussing a B+ tree, not a binary search tree (BST). A B+ tree is an extension of the B-tree where all non-leaf nodes are stored in ascending order. The keys store the range of data records in their subtrees, and only leaf nodes contain actual data records. This structure allows for efficient range queries.

In contrast, a binary search tree (BST) does not have this extra organization of keys in ascending order in non-leaf nodes. Instead, it uses the binary search algorithm to quickly find


Enter your query (or type 'exit' to quit):  exit


Exiting...
