In [30]:
import chromadb
from sentence_transformers import SentenceTransformer
import pandas as pd
import pandas as pd
import time
import uuid
from bs4 import BeautifulSoup

In [2]:
# Open the on-disk Chroma database and fetch the demo collection,
# creating it on first use.
CHROMA_DB_PATH = "./chroma_stackoverflow_db"
COLLECTION_NAME = "stackoverflow_demo"
# Collection metadata sent with the get-or-create call; "hnsw:space" is the
# Chroma setting for the index distance metric (cosine here).
COLLECTION_METADATA = {"hnsw:space": "cosine"}

client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    metadata=COLLECTION_METADATA,
)

In [None]:
# CSV exports to load; list more file names here to combine several dumps.
data = ['python_questions0.csv']

# Read every file first and concatenate once. Growing a frame with
# pd.concat inside the loop copies all previously loaded rows on each
# iteration, and seeding it with an empty DataFrame triggers a
# FutureWarning on recent pandas versions.
df = pd.concat([pd.read_csv(d) for d in data], ignore_index=True)

# Keep only the columns this notebook selects for later use.
df = df.loc[:, ["tags", "question_title", "question_body", "answer", "question_score"]]
total_docs = len(df)
print(f"Loaded {total_docs} questions")

Loaded 201467 questions


In [32]:
# Load the sentence-embedding model used to encode query text.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [33]:
def print_progress(current, total, start_time, operation="Processing"):
    """Print a single-line progress report that overwrites itself in place.

    Args:
        current: Number of items processed so far.
        total: Total number of items to process. May be 0, in which case
            the percentage is reported as 0.0% instead of raising
            ZeroDivisionError.
        start_time: ``time.time()`` timestamp taken when the work started.
        operation: Label printed at the start of the line.

    The line starts with a carriage return and has no trailing newline, so
    successive calls redraw the same output line.
    """
    elapsed = time.time() - start_time
    percent = current / total if total > 0 else 0.0
    # ETA extrapolates the average time per processed item to the remainder.
    eta = (elapsed / current) * (total - current) if current > 0 else 0
    line = (
        f"{operation}: {current}/{total} ({percent:.1%}) | "
        f"Elapsed: {elapsed:.1f}s | ETA: {eta:.1f}s"
    )
    # A carriage return does not erase the old line: when the new text is
    # shorter, the tail of the previous text stays visible (e.g. "0.0sss").
    # Pad with spaces up to the previous line's length to blank it out.
    previous_len = getattr(print_progress, "_last_len", 0)
    print_progress._last_len = len(line)
    print(f"\r{line.ljust(previous_len)}", end="", flush=True)

In [None]:
# Ingest the loaded questions into the Chroma collection in fixed-size batches.
BATCH_SIZE = 200
total_added = 0
start_time = time.time()

for batch_num in range(0, total_docs, BATCH_SIZE):
    batch = df.iloc[batch_num:batch_num + BATCH_SIZE]

    # The frame has no "chunk"/"metadata" columns, so build both from the
    # columns that were actually loaded: the document text is the title
    # followed by the (HTML) body, and the metadata carries tags and score.
    # NOTE(review): rows with missing title/body/tags are not special-cased
    # here - confirm the CSV has no NaNs in these columns.
    documents = [
        f"{title}\n{body}"
        for title, body in zip(batch["question_title"], batch["question_body"])
    ]
    metadatas = [
        {"tags": tags, "score": score}
        for tags, score in zip(batch["tags"], batch["question_score"])
    ]
    # Derive each id deterministically from the row's position in `df`
    # instead of a random uuid4, so re-running this cell targets the same
    # ids rather than inserting a second copy of every question.
    # Entries written by earlier runs under random ids are NOT removed.
    ids = [
        str(uuid.uuid5(uuid.NAMESPACE_URL, f"stackoverflow_demo/{ix}"))
        for ix in batch.index
    ]

    # upsert (rather than add) overwrites entries whose id already exists.
    collection.upsert(
        documents=documents,
        metadatas=metadatas,
        ids=ids
    )
    total_added += len(documents)

    # Progress is measured against total_docs (the row count of `df`).
    print_progress(min(batch_num + BATCH_SIZE, total_docs), total_docs, start_time)


print(f"\n\nSuccessfully added {total_added} documents")
print(f"Total documents in collection: {collection.count()}")
print(f"Total time: {time.time() - start_time:.2f} seconds")

Processing: 201467/201467 (100.0%) | Elapsed: 2227.0s | ETA: 0.0sss

Successfully added 201467 documents
Total documents in collection: 363069
Total time: 2227.01 seconds


In [None]:

# Report the collection size and preview a few stored items.
# count() and get(limit=...) avoid pulling every document and metadata
# record into memory just to print a total and three samples.
PREVIEW_LIMIT = 3
print(f"Total documents: {collection.count()}")

results = collection.get(limit=PREVIEW_LIMIT)

# Inspect first few items
for i in range(min(PREVIEW_LIMIT, len(results['ids']))):
    print(f"\nDocument {i+1}:")
    print(f"ID: {results['ids'][i]}")
    print(f"Content: {results['documents'][i][:200]}...")  # First 200 chars
    print(f"Metadata: {results['metadatas'][i]}")

Total documents: 363069

Document 1:
ID: 18172851
Content: Deleting DataFrame row in Pandas based on column value
<p>I have the following DataFrame:</p>

<pre><code>             daysago  line_race rating        rw    wrating
 line_date                        ...
Metadata: {'score': 256, 'tags': 'python|pandas'}

Document 2:
ID: 4151128
Content: What are the differences between numpy arrays and matrices? Which one should I use?
<p>What are the advantages and disadvantages of each?</p>

<p>From what I've seen, either one can work as a replacem...
Metadata: {'tags': 'python|arrays|matrix|numpy', 'score': 256}

Document 3:
ID: 4450592
Content: Is there a Python equivalent to Ruby's string interpolation?
<p>Ruby example:</p>

<pre><code>name = "Spongebob Squarepants"
puts "Who lives in a Pineapple under the sea? \n#{name}."
</code></pre>

<p...
Metadata: {'score': 256, 'tags': 'python|string-interpolation|language-comparisons'}


In [None]:
# Look up the stored questions closest to a free-text query.
query_text = "how to parse json in python"
normalized_query = query_text.lower()
query_embedding = model.encode(normalized_query).tolist()

results = collection.query(n_results=3, query_embeddings=[query_embedding])

# Only one query was sent, so each result field holds a single inner list.
matched_docs = results['documents'][0]
matched_metas = results['metadatas'][0]
matched_distances = results['distances'][0]

print("\nTop 3 similar questions:")
for rank, (doc, meta) in enumerate(zip(matched_docs, matched_metas), start=1):
    # The score is reported as one minus the returned distance.
    similarity = 1 - matched_distances[rank - 1]
    print(f"\nResult {rank}:")
    print(f"Score: {similarity:.2f}")
    print(f"Content: {doc[:200]}...")
    print(f"Tags: {meta['tags']}")


Top 3 similar questions:

Result 1:
Score: 0.72
Content: Parsing formatted JSON with Python
<p>I want to parse JSON. Its ok if I write JSON in one line</p>

<pre><code>json_input = '{ "rate_of_climbing": 18.4, "speed_factor": 520}'
</code></pre>

<p>But if ...
Tags: python|json|parsing|formatting

Result 2:
Score: 0.72
Content: Parsing formatted JSON with Python
<p>I want to parse JSON. Its ok if I write JSON in one line</p>

<pre><code>json_input = '{ "rate_of_climbing": 18.4, "speed_factor": 520}'
</code></pre>

<p>But if ...
Tags: python|json|parsing|formatting

Result 3:
Score: 0.71
Content: How to parse JSON file for a specific key and value?
<p>Currently, have multiple JSON files to parse. In each file, I would look for <code>“ID”: “1”</code>:</p>

<pre><code>{“TYPE”: “fire”, “ID”: “1”,...
Tags: python|json|parsing
