In [1]:
import os
import re
import json
import chromadb
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import time
import psutil
import ollama



def load_documents(folder_path):
    documents = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt") or filename.endswith(".md"):
            with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as file:
                text = file.read()
                text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
                documents.append({"filename": filename, "text": text})
    return documents


def chunk_documents(documents, chunk_size=500, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = []
    for doc in documents:
        split_texts = text_splitter.split_text(doc["text"])
        for chunk in split_texts:
            chunks.append({"text": chunk, "source": doc["filename"]})
    return chunks

docs = load_documents("/Users/avneetsoni/Desktop/ds4300/data")
chunks = chunk_documents(docs, chunk_size=500, chunk_overlap=50)

In [2]:
print(chunks)

[{'text': "FAILURE. In March of 2013 I authored an article predicting the demise of Yahoo CEO Marissa Mayer. Here's the thing - not every talented, successful executive is CEO ready; Mayer is a textbook example of this. It was obvious from the early days, at least to those paying attention, that Mayer didn't have the leadership chops to pull off the admittedly tough assignment of turning around the once iconic Yahoo brand. A question worth asking is, why didn't the Yahoo board of directors recognize this?", 'source': 'case2.txt'}, {'text': "the Yahoo board of directors recognize this? Just this week, Miguel Helft authored a meaty article chronicling Mayer's tenure at Yahoo. Helft was fair in his analysis, pointing out that anyone who assumed the helm at Yahoo had an almost impossible task in front of them. Breathing life back into a company whose business model was built for an era that has long since passed is no easy task. That said, Helft was also fair in highlighting Mayer's dismal

In [3]:
client = chromadb.PersistentClient(path="chroma_db")
collection = client.get_or_create_collection("ds4300_notes")

embedding_model = SentenceTransformer("hkunlp/instructor-xl")

for idx, chunk in enumerate(chunks):
    embedding = embedding_model.encode(chunk["text"]).tolist()
    collection.add(
        ids=[str(idx)],
        embeddings=[embedding],
        metadatas=[{"source": chunk["source"]}],
        documents=[chunk["text"]]
    )

print("Documents indexed successfully!")


Insert of existing embedding ID: 0
Add of existing embedding ID: 0
Insert of existing embedding ID: 1
Add of existing embedding ID: 1
Insert of existing embedding ID: 2
Add of existing embedding ID: 2
Insert of existing embedding ID: 3
Add of existing embedding ID: 3
Insert of existing embedding ID: 4
Add of existing embedding ID: 4
Insert of existing embedding ID: 5
Add of existing embedding ID: 5
Insert of existing embedding ID: 6
Add of existing embedding ID: 6
Insert of existing embedding ID: 7
Add of existing embedding ID: 7
Insert of existing embedding ID: 8
Add of existing embedding ID: 8
Insert of existing embedding ID: 9
Add of existing embedding ID: 9
Insert of existing embedding ID: 10
Add of existing embedding ID: 10
Insert of existing embedding ID: 11
Add of existing embedding ID: 11
Insert of existing embedding ID: 12
Add of existing embedding ID: 12
Insert of existing embedding ID: 13
Add of existing embedding ID: 13
Insert of existing embedding ID: 14
Add of existing em

Documents indexed successfully!


In [4]:
def retrieve_context(query, top_k=5):
    query_embedding = embedding_model.encode(query).tolist()
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k
    )
    return results["documents"][0]  

query = "when were cartier watches gifted?"
retrieved_context = retrieve_context(query)
print("Retrieved Context:", retrieved_context)


Retrieved Context: ['the context you’re operating in and why it’s important to do things a certain way. Had she done the same thing in a different business, no one would have batted an eye. The second thing though, is board continuity and support. Now the gifts of the Cartier watches were given in 2018 under a different Chairman. And the Board had full knowledge of and had approved the gifting of the watches to those four executives. But when push came to shove, Holgate was the one that took the fall, and the', 'running a commercially competitive business, and after all Holgate is a commercial CEO, but, being owned by government, there’s this perception that you’re spending money on the public purse. Now the total value of the gift of the watches was immaterial like $20,000. And in a company that turns over about $7.5billion in revenue, it’s a drop in the ocean. It means nothing. So how does a commercial CEO keep commercial strength executives motivated and rewarded when they do really

In [5]:

def generate_response(query, context):
    prompt = f"Given the context:\n{context}\n\nAnswer the following question:\n{query}"
    
    response = ollama.chat(model="llama2", messages=[{"role": "user", "content": prompt}])
    
    return response["message"]["content"]

response = generate_response(query, retrieved_context)
print("Generated Response:", response)


Generated Response: According to the passage, Cartier watches were gifted in 2018 under a different Chairman of Australia Post.


In [6]:

start_time = time.time()
retrieve_context("Explain quicksort.")
end_time = time.time()

print(f"Retrieval Time: {end_time - start_time} seconds")
print(f"Memory Usage: {psutil.virtual_memory().percent}%")


Retrieval Time: 17.5263249874115 seconds
Memory Usage: 87.0%
