In [None]:
!pip install chromadb sentence_transformers pandas bs4

In [None]:
import chromadb
from sentence_transformers import SentenceTransformer
import pandas as pd
import time
import uuid
from bs4 import BeautifulSoup

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Open the on-disk Chroma store; data persists between kernel sessions.
client = chromadb.PersistentClient(path="./chroma_stackoverflow_db")

# Reuse the demo collection if it already exists, otherwise create it with
# an HNSW index configured for cosine distance.
collection_settings = {"hnsw:space": "cosine"}
collection = client.get_or_create_collection(
    name="stackoverflow_demo", metadata=collection_settings
)

In [3]:
# CSV files to index and the cap on how many questions are kept.
data = ['python_questions0.csv']
MAX_DOCS = 1000

# Read every file once and concatenate in a single call; concatenating inside
# a loop re-copies the accumulated frame on every iteration.
df = pd.concat([pd.read_csv(d) for d in data], ignore_index=True)

# Keep only the columns used downstream and at most MAX_DOCS rows.
# head() is an explicit row count, unlike the label-inclusive .loc slice.
df = df.loc[:, ["tags", "question_title", "question_body", "answer", "question_score"]].head(MAX_DOCS)
total_docs = len(df)
print(f"Loaded {total_docs} questions")

Loaded 1000 questions


Chunking:

In [4]:
chunks = []
# Code snippets must be longer than this many characters to get their own chunk.
min_code_block = 20


def _as_text(value):
    """Return ``value`` as a string, mapping missing values (NaN/None) to ''."""
    return "" if pd.isna(value) else str(value)


for _, content in df.iterrows():
    title = _as_text(content.loc["question_title"])
    body = _as_text(content.loc["question_body"])
    # A missing answer would otherwise reach BeautifulSoup as a float and raise.
    answer = _as_text(content.loc["answer"])
    tags = content.loc["tags"]
    score = content.loc["question_score"]

    # One prose chunk per question: title, body and answer, lower-cased.
    # (Previously the body was interpolated twice and the title was dropped,
    # and "\n\Answer" put a stray backslash into every chunk.)
    chunk = f"Question: {title}\n{body}\nAnswer: {answer}".lower()
    metadata = {"tags": tags,
                "score": score,
                "code": False
                }
    chunks.append({"chunk": chunk,
                   "metadata": metadata})

    # Additionally index each sufficiently long <code> snippet of the answer
    # as its own chunk, flagged with code=True in the metadata.
    soup = BeautifulSoup(answer, 'html.parser')
    code_blocks = [code.get_text() for code in soup.find_all('code')]
    for block in code_blocks:
        if len(block) > min_code_block:
            chunks.append({"chunk": block.lower(),
                           "metadata": {"tags": tags,
                                        "score": score,
                                        "code": True}})

chunks = pd.DataFrame(chunks)
total_chunks = len(chunks)
print(f"Prepared {total_chunks} chunks.")

Prepared 2282 chunks.


In [5]:
# Load the sentence-embedding model that later cells use to encode query text.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [6]:
def print_progress(current, total, start_time, operation="Processing"):
    """Print a one-line, in-place progress report.

    The line starts with a carriage return and has no trailing newline, so
    successive calls overwrite each other in the output area.

    Args:
        current: Number of items processed so far.
        total: Total number of items to process. May be 0, in which case the
            percentage is reported as 0.0%.
        start_time: ``time.time()`` timestamp taken when the work started.
        operation: Label printed at the start of the line.
    """
    elapsed = time.time() - start_time
    # Guard both divisions: total == 0 (nothing to do) and current == 0
    # (nothing done yet, so no rate to extrapolate from).
    percent = current / total if total else 0.0
    eta = (elapsed / current) * (total - current) if current > 0 else 0
    print(
        f"\r{operation}: {current}/{total} ({percent:.1%}) | "
        f"Elapsed: {elapsed:.1f}s | ETA: {eta:.1f}s",
        end="", flush=True
    )

In [7]:
BATCH_SIZE = 200
total_added = 0
start_time = time.time()

for batch_num in range(0, total_chunks, BATCH_SIZE):
    batch = chunks.iloc[batch_num:batch_num + BATCH_SIZE]

    documents = batch["chunk"].tolist()
    metadatas = batch["metadata"].tolist()
    # Deterministic IDs derived from each chunk's position. Combined with
    # upsert(), re-running this cell overwrites the same entries instead of
    # appending a fresh copy of every chunk under new random UUIDs.
    # NOTE: entries written by earlier runs under random UUIDs are not removed
    # by this; delete the collection once to clear those duplicates.
    ids = [
        str(uuid.uuid5(uuid.NAMESPACE_URL, f"stackoverflow_demo/{position}"))
        for position in range(batch_num, batch_num + len(batch))
    ]

    collection.upsert(
        documents=documents,
        metadatas=metadatas,
        ids=ids
    )
    total_added += len(documents)

    print_progress(min(batch_num + BATCH_SIZE, total_chunks), total_chunks, start_time)


print(f"\n\nSuccessfully added {total_added} documents")
print(f"Total documents in collection: {collection.count()}")
print(f"Total time: {time.time() - start_time:.2f} seconds")

Processing: 2282/2282 (100.0%) | Elapsed: 178.0s | ETA: 0.0s

Successfully added 2282 documents
Total documents in collection: 27164
Total time: 178.02 seconds


In [8]:

# Ask the collection for its size instead of loading every document into
# memory just to count them.
print(f"Total documents: {collection.count()}")

# Inspect first few items: fetch only the handful that are displayed.
preview = collection.get(limit=3)
for i, (doc_id, document, metadata) in enumerate(
    zip(preview['ids'], preview['documents'], preview['metadatas'])
):
    print(f"\nDocument {i+1}:")
    print(f"ID: {doc_id}")
    print(f"Content: {document[:200]}...")  # First 200 chars
    print(f"Metadata: {metadata}")

Total documents: 27164

Document 1:
ID: 66134e08-ed73-4f4b-a749-befb61095c94
Content: Deleting DataFrame row in Pandas based on column value
<p>I have the following DataFrame:</p>

<pre><code>             daysago  line_race rating        rw    wrating
 line_date                        ...
Metadata: {'score': 256, 'tags': 'python|pandas'}

Document 2:
ID: 45674e80-94ed-4da7-8fff-880a3e724906
Content: Deleting DataFrame row in Pandas based on column value
<p>I have the following DataFrame:</p>

<pre><code>             daysago  line_race rating        rw    wrating
 line_date                        ...
Metadata: {'score': 256, 'tags': 'python|pandas'}

Document 3:
ID: e86949ab-473b-426b-ae03-b2a55c57782c
Content: What are the differences between numpy arrays and matrices? Which one should I use?
<p>What are the advantages and disadvantages of each?</p>

<p>From what I've seen, either one can work as a replacem...
Metadata: {'tags': 'python|arrays|matrix|numpy', 'score': 256}


In [9]:
# Embed a lower-cased query and look up its nearest neighbours in the collection.
query_text = "how to parse json in python"
query_embedding = model.encode(query_text.lower()).tolist()

results = collection.query(query_embeddings=[query_embedding], n_results=3)

# Results are nested one list per query embedding; a single query was sent,
# so only the first entry of each field is used.
top_documents = results['documents'][0]
top_metadatas = results['metadatas'][0]
top_distances = results['distances'][0]

print("\nTop 3 similar questions:")
for rank, (doc, meta, distance) in enumerate(
    zip(top_documents, top_metadatas, top_distances), start=1
):
    print(f"\nResult {rank}:")
    # Displayed score is one minus the returned distance.
    print(f"Score: {1 - distance:.2f}")
    print(f"Content: {doc[:200]}...")
    print(f"Tags: {meta['tags']}")


Top 3 similar questions:

Result 1:
Score: 0.68
Content: import json
d =  {'test_0': {'status': 'false', 'test_id': 123453}, 
      'test_1': {'status': 'false', 'test_id': 123453}, 
      'test_2': {'status': 'false', 'test_id': 123453}}

with open('data.j...
Tags: python|json

Result 2:
Score: 0.68
Content: import json
d =  {'test_0': {'status': 'false', 'test_id': 123453}, 
      'test_1': {'status': 'false', 'test_id': 123453}, 
      'test_2': {'status': 'false', 'test_id': 123453}}

with open('data.j...
Tags: python|json

Result 3:
Score: 0.66
Content:   import json
  data = json.loads(datastring)
...
Tags: python|json|pdf


In [10]:
!pip install torch transformers --index-url https://download.pytorch.org/whl/cpu
! pip install accelerate

Looking in indexes: https://download.pytorch.org/whl/cpu


You should consider upgrading via the 'c:\users\38641\documents\faks\5.letnik\2.semester\nlp\ul-fri-nlp-course-project-2024-2025-1-6-3-musketeers\venv\scripts\python.exe -m pip install --upgrade pip' command.




You should consider upgrading via the 'c:\users\38641\documents\faks\5.letnik\2.semester\nlp\ul-fri-nlp-course-project-2024-2025-1-6-3-musketeers\venv\scripts\python.exe -m pip install --upgrade pip' command.


In [None]:
!pip install transformers accelerate

In [None]:
# for testing only
from transformers import AutoTokenizer, AutoModelForCausalLM

class RAG:
    """Minimal retrieval-augmented generation pipeline (for testing only).

    A query is embedded, the closest documents are fetched from the retriever
    collection, and a causal language model is prompted with the question plus
    the retrieved context.
    """

    def __init__(self, embedder, collection, retrieve_number=3):
        """Load the tokenizer and language model and store the retrieval parts.

        Args:
            embedder: Object with ``encode(text)`` whose result has ``tolist()``.
            collection: Object with ``query(query_embeddings=..., n_results=...)``.
            retrieve_number: Number of documents to retrieve per query.
        """
        model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.llm = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu")
        self.embedder = embedder
        self.retriever = collection
        self.retrieve_number = retrieve_number

    def generate(self, query):
        """Answer ``query`` using retrieved context.

        Returns:
            The decoded model output for the built prompt, as a string.
        """
        # The query is lower-cased before embedding.
        query_embedding = self.embedder.encode(query.lower()).tolist()
        results = self.retriever.query(query_embeddings=[query_embedding], n_results=self.retrieve_number)
        # build_prompt needs the question as well as the retrieved documents.
        prompt = self.build_prompt(query, results)
        inputs = self.tokenizer(prompt, return_tensors="pt").to("cpu")
        outputs = self.llm.generate(**inputs, max_new_tokens=100)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def context_from_results(self, results):
        """Join the retrieved documents of the first query into one context string.

        Args:
            results: Mapping whose ``"documents"`` entry holds one list of
                documents per query.

        Returns:
            The documents separated by blank lines, or '' when nothing was
            retrieved.
        """
        per_query_documents = results.get("documents") or [[]]
        documents = per_query_documents[0] or []
        return "\n\n".join(str(document) for document in documents)

    def build_prompt(self, query, results):
        """Build the LLM prompt from the question and the retrieved documents."""
        context = self.context_from_results(results)
        # The context is fenced with matching triple backticks on both sides.
        return f'''
            Answer the following code related question using the context provided inside triple backticks if it is useful.
            In the answer provide an example of code that is related to the question.
            If you do not know the answer, say that you do not know. Do not try to invent the solution.
            

            Question: {query}


            Context: ```{context}```

            
            Answer: 
            '''

Explain Python list comprehensions:

List comprehensions are a powerful way to create lists in Python. They allow you to create a list by iterating over a list of values, and then modifying the list in-place. Here's an example:

```python
my_list = [1, 2, 3, 4, 5]
my_list_comprehension = [x * 2 for x in my_list]
print(my_list_com
