In [4]:
import pandas as pd
import os
import json

directory = "/Users/rsaran/extracted_ideas/"
files = os.listdir(directory)

for file in files:
    file_path = os.path.join(directory, file)
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            print(f.read())
            break
    except Exception as e:
        print(f"Error reading {file}: {e}")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/rsaran/extracted_ideas/'

In [2]:
import json
import pandas as pd
import os

items = []
directory = "/Users/rsaran/extracted_ideas/"
files = os.listdir(directory)

for file in files:
    try:
        with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
            content = json.load(f)
            items.append({
                "text_input": ", ".join(content[0]['materials_required']),
                "output": {
                    "description": content[0]['description'],
                    "steps": content[0]['steps']
                }
            })
    except Exception as e:
        print(f"Error processing {file}: {e}")
        continue

    break

df = pd.DataFrame(items)
df.to_csv('save.csv', index=False)

# Verify the output
if items:
    print("Sample output format:")
    print(df.iloc[0].to_dict())

Sample output format:
{'text_input': 'Clear nail polish, Black sharpie, Nail polish in a color of your choice, A needle', 'output': {'description': 'This project allows you to create custom Rebel Alliance earbuds to show your allegiance to the Rebel cause in the Star Wars universe.', 'steps': ['Draw the Rebel Alliance symbol on the back of the earbuds using a black sharpie, being as precise as possible.', 'Carefully apply nail polish (in your chosen color) over the sharpie design using a needle.', 'Allow the colored nail polish to dry completely, then cover the entire back of the earbud with a coat of clear nail polish to seal the design and prevent chipping.', 'Enjoy your custom Rebel Alliance earbuds by listening to Star Wars music.']}}


In [None]:
import chromadb
import json
import pandas as pd
from typing import List, Dict

def setup_chroma_collection(collection_name: str = "ideas-collection") -> chromadb.Collection:
    """Initialize ChromaDB client and create/get collection."""
    client = chromadb.Client()
    
    try:
        collection = client.get_collection(collection_name)
        print(f"Retrieved existing collection: {collection_name}")
    except:
        collection = client.create_collection(collection_name)
        print(f"Created new collection: {collection_name}")
    
    return collection

def load_data_from_csv(filepath: str = 'save.csv') -> pd.DataFrame:
    """Load data from CSV file."""
    return pd.DataFrame(pd.read_csv(filepath))

def prepare_documents(df: pd.DataFrame) -> tuple[List[str], List[Dict], List[str]]:
    """Prepare documents, metadata, and IDs for ChromaDB while skipping invalid rows."""
    documents = []
    metadatas = []
    ids = []

    for i, row in df.iterrows():
        text = row.get('text_input', '')
        output = row.get('output', {})

        # Skip invalid entries
        if not isinstance(text, str) or not text.strip():
            print(f"Skipping row {i} due to invalid text_input")
            continue
        
        try:
            metadata = json.loads(output) if isinstance(output, str) else output
            if not isinstance(metadata, dict):
                raise ValueError
        except (json.JSONDecodeError, ValueError):
            print(f"Skipping row {i} due to invalid output format")
            continue

        documents.append(text)
        metadatas.append(metadata)
        ids.append(f"doc_{len(ids)}")

    return documents, metadatas, ids

def add_to_chroma(collection: chromadb.Collection, 
                  documents: List[str], 
                  metadatas: List[Dict], 
                  ids: List[str]) -> None:
    """Add documents to ChromaDB collection."""
    collection.add(
        documents=documents,
        metadatas=metadatas,
        ids=ids
    )
    print(f"Added {len(documents)} documents to the collection")

def query_chroma(collection: chromadb.Collection, 
                 query_text: str, 
                 n_results: int = 2) -> Dict:
    """Query the ChromaDB collection."""
    results = collection.query(
        query_texts=[query_text],
        n_results=n_results
    )
    return results

def main():
    collection = setup_chroma_collection()
    
    df = load_data_from_csv()

    documents, metadatas, ids = prepare_documents(df)

    add_to_chroma(collection, documents, metadatas, ids)
    
if __name__ == "__main__":
    main()

Retrieved existing collection: ideas-collection
Skipping row 13 due to invalid text_input
Skipping row 21 due to invalid text_input
Skipping row 30 due to invalid text_input
Skipping row 45 due to invalid text_input
Skipping row 60 due to invalid text_input
Skipping row 63 due to invalid text_input
Skipping row 65 due to invalid text_input
Skipping row 120 due to invalid text_input
Skipping row 165 due to invalid text_input
Skipping row 169 due to invalid text_input
Skipping row 173 due to invalid text_input
Skipping row 174 due to invalid text_input
Skipping row 186 due to invalid text_input
Skipping row 205 due to invalid text_input
Skipping row 222 due to invalid text_input
Skipping row 226 due to invalid text_input
Skipping row 240 due to invalid text_input
Skipping row 269 due to invalid text_input
Skipping row 272 due to invalid text_input
Skipping row 277 due to invalid text_input
Skipping row 291 due to invalid text_input
Skipping row 293 due to invalid text_input
Skipping row 

In [None]:
query = "Sample query text"
results = query_chroma(collection, query)

print(results)