In [1]:
import weaviate
import weaviate.classes as wvc
import json
import os

# --- Step 1: CHOOSE YOUR CHUNK FILE HERE ---
# Simply change the file name in this variable to ingest a different set of chunks.
# Options: "output_chunks_semantic.json", "output_chunks.json", or any other chunk file you create.

json_file_to_ingest = "output_chunks_semantic.json"

# -------------------------------------------

# --- Step 2: Connect to Weaviate ---
client = None
try:
    client = weaviate.connect_to_local(port=8083, grpc_port=50052)
    print("✅ Successfully connected to Weaviate.")
except Exception as e:
    print(f"❌ Connection failed: {e}")

# --- Step 3: (Re)Create the Collection ---
if client:
    collection_name = "MyDocumentChunk"
    
    # Delete the collection if it already exists to ensure a fresh start
    if client.collections.exists(collection_name):
        client.collections.delete(collection_name)
        print(f"Collection '{collection_name}' deleted to prepare for new data.")

    # Create the new collection
    my_collection = client.collections.create(
        name=collection_name,
        vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_transformers()
    )
    print(f"✅ Collection '{collection_name}' created successfully.")

    # --- Step 4: Load Data and Ingest ---
    if not os.path.exists(json_file_to_ingest):
        print(f"❌ Error: Data file '{json_file_to_ingest}' not found. Please check the file name.")
    else:
        with open(json_file_to_ingest, "r", encoding="utf-8") as f:
            data = json.load(f)

        print(f"Starting ingestion of {len(data)} chunks from '{json_file_to_ingest}'...")
        
        # Ingest data in batches for efficiency
        with my_collection.batch.dynamic() as batch:
            for item in data:
                # This handles both old and new chunk formats
                properties = {
                    "text": item.get("text", ""),
                    "source_file": item.get("source_file", "unknown")
                }
                batch.add_object(properties=properties)
        
        print(f"✅ Ingestion complete. {my_collection.aggregate.over_all(total_count=True).total_count} objects in the collection.")

    client.close()
    print("Connection to Weaviate closed.")

✅ Successfully connected to Weaviate.
Collection 'MyDocumentChunk' deleted to prepare for new data.
✅ Collection 'MyDocumentChunk' created successfully.


            Use the `vector_config` argument instead.
            


Starting ingestion of 5403 chunks from 'output_chunks_semantic.json'...


{'message': 'Failed to send 8 in a batch of 192', 'errors': {"update inverted indices: put inverted indices props: no bucket for prop 'source_file' found", "update inverted indices: put inverted indices props: no bucket searchable for prop 'source_file' found"}}
{'message': 'Failed to send 8 objects in a batch of 192. Please inspect client.batch.failed_objects or collection.batch.failed_objects for the failed objects.'}
{'message': 'Failed to send 21 in a batch of 192', 'errors': {"update inverted indices: put inverted indices props: no bucket searchable for prop 'text' found", "update inverted indices: put inverted indices props: no bucket for prop 'source_file' found", "update inverted indices: put inverted indices props: no bucket searchable for prop 'source_file' found"}}
{'message': 'Failed to send 21 objects in a batch of 192. Please inspect client.batch.failed_objects or collection.batch.failed_objects for the failed objects.'}


✅ Ingestion complete. 1947 objects in the collection.
Connection to Weaviate closed.
