In [1]:
import weaviate
import weaviate.classes as wvc
import json
import os
import time

# --- 1. CONFIGURATION ---
files_to_ingest = [
    "chunks_with_embeddings.json",
    "chunks_with_embeddings2.json",
    "chunks_with_embeddings3.json",
    "chunks_with_embeddings4.json",
    "ICMRBreastCancer_noun_chunks_with_embeddings.json",
    "ICMRBuccalMucosaCancer_noun_chunks.json",
    "ICMRCervixCancer_noun_chunks_with_embeddings.json"
    
]

# --- 2. CONNECT AND RECREATE COLLECTION ---
client = None
try:
    client = weaviate.connect_to_local(port=8083, grpc_port=50052)
    print(" Successfully connected to Weaviate.")
    
    collection_name = "MyDocumentChunk"
    
    # Delete and recreate the collection to ensure the new schema is applied
    if client.collections.exists(collection_name):
        client.collections.delete(collection_name)
        print(f"Collection '{collection_name}' deleted for a fresh start.")

    # Configure the collection with the 'source_file' property
    my_collection = client.collections.create(
        name=collection_name,
        vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_transformers(),
        properties=[
            wvc.config.Property(name="text", data_type=wvc.config.DataType.TEXT),
            # CORRECTED: Schema defines 'source_file'
            wvc.config.Property(name="source_file", data_type=wvc.config.DataType.TEXT)
        ]
    )
    print(f" Collection created with 'text' and 'source_file' properties.")

    # --- 3. LOAD AND INGEST DATA ---
    all_chunks_to_ingest = []
    
    for file_name in files_to_ingest:
        try:
            with open(file_name, "r", encoding="utf-8") as f:
                data = json.load(f)
                all_chunks_to_ingest.extend(data)

        except Exception as e:
            print(f" Error reading '{file_name}': {e}")
    
    if all_chunks_to_ingest:
        print(f" Starting ingestion for {len(all_chunks_to_ingest)} chunks...")
        
        with my_collection.batch.dynamic() as batch_manager:
            for i, item in enumerate(all_chunks_to_ingest):
                if item.get("text") and item.get("source_file"):
                    batch_manager.add_object(
                        properties={
                            "text": item["text"],
                            # CORRECTED: Inserting the 'source_file' value
                            "source_file": item["source_file"] 
                        }
                    )
        
        total_objects = my_collection.aggregate.over_all(total_count=True).total_count
        print(f" Ingestion complete. Total objects: {total_objects}")

finally:
    if client and client.is_connected():
        client.close()
        print("\nConnection closed.")

 Successfully connected to Weaviate.
Collection 'MyDocumentChunk' deleted for a fresh start.


            Use the `vector_config` argument instead.
            


 Collection created with 'text' and 'source_file' properties.
 Starting ingestion for 22415 chunks...
 Ingestion complete. Total objects: 22415

Connection closed.
