# Weaviate Controller
This simple Weaviate controller, written as a Jupyter Notebook, is designed to streamline basic operations on the Weaviate database. It facilitates connecting to the database, creating and deleting collections, and populating these collections with objects sourced from a JSON file. Despite its straightforward nature, this Notebook efficiently covers the essential functionality required for operating a fully functional Weaviate vector database.


In the next cell, we import the necessary libraries for connecting to a Weaviate instance on our Power PC. We also set up the API key required for accessing OpenAI services (the user must supply their own key).

In [None]:
import json
import os

import weaviate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from weaviate.classes.config import Property, DataType, Configure
from weaviate.classes.init import Auth

# Address of the Weaviate instance; the HTTP and gRPC endpoints use the same host.
WEAVIATE_HOST = "147.175.146.250"

# Prefer the OPENAI_API_KEY environment variable so the real key does not have
# to be written into the notebook. The placeholder is kept as a fallback for
# users who still paste their key in directly.
openai_api_key = os.environ.get("OPENAI_API_KEY", 'your-api-key')

client = weaviate.connect_to_custom(
    http_host=WEAVIATE_HOST,
    http_port=8080,
    http_secure=False,
    grpc_host=WEAVIATE_HOST,
    grpc_port=50051,
    grpc_secure=False,
    headers={
        # Forwarded to Weaviate so the text2vec-openai vectorizer can call OpenAI.
        "X-OpenAI-Api-Key": openai_api_key,
    }
)


# Create Collection

Creates a collection suitable for our use case.

In [None]:
# Schema of a listing chunk as (property name, data type) pairs, in creation order.
listing_schema = [
    ("listing_id", DataType.TEXT),
    ("title", DataType.TEXT),
    ("address", DataType.TEXT),
    ("type_of_property", DataType.TEXT),
    ("size_of_property", DataType.NUMBER),
    ("price", DataType.NUMBER),
    ("price_per_m", DataType.NUMBER),
    ("price_per_month", DataType.NUMBER),
    ("price_per_m_per_month", DataType.NUMBER),
    ("description_text", DataType.TEXT),
    ("chunk_index", DataType.NUMBER),
    ("transaction_type", DataType.TEXT),
    ("clean_phone_number", DataType.TEXT),
    ("image", DataType.TEXT),
    ("url", DataType.TEXT),
]

# Create the collection with the OpenAI text vectorizer configured.
client.collections.create(
    name="RealEstateListings",
    properties=[
        Property(name=field_name, data_type=field_type)
        for field_name, field_type in listing_schema
    ],
    vectorizer_config=Configure.Vectorizer.text2vec_openai(),
)

# Retrieve Collection

Checks if the collection was successfully created.

In [None]:
# Get a handle to the collection, then print the UUID and properties of
# every object the iterator yields.
collection = client.collections.get("RealEstateListings")

stored_objects = collection.iterator()
for stored in stored_objects:
    print(stored.uuid, stored.properties)

# Retrieve number of objects in collection

Checks the number of objects that are currently loaded in our collection.

In [None]:
# Run an aggregation over the whole collection, requesting only the object count.
collection = client.collections.get("RealEstateListings")
response = collection.aggregate.over_all(total_count=True)

total_objects = response.total_count
print(total_objects)

# Fill collection with objects

In [None]:
import json
import time
import weaviate
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Text splitter shared by the cell: chunks are capped at 500 units
# (characters, assuming the splitter's default length function) and
# neighbouring chunks overlap by 50.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)

def add_listing_chunks(collection, data_rows, delay=0.5):
    """Insert prepared data rows into a collection through a dynamic batch.

    Progress and the final outcome are reported with ``print``; nothing is
    returned. Any exception raised during the import is caught and printed
    rather than propagated.

    Args:
        collection: Collection handle exposing ``batch.dynamic()`` and
            ``batch.failed_objects``.
        data_rows: Sequence of property dicts, one per object to insert.
        delay: Seconds to sleep after queuing each object.
    """
    total_rows = len(data_rows)
    print("Starting batch insert with", total_rows, "data rows.")
    try:
        with collection.batch.dynamic() as writer:
            for position, row in enumerate(data_rows, start=1):
                print(f"Adding object {position}/{total_rows}: Listing ID {row.get('listing_id')}")
                writer.add_object(properties=row)
                time.sleep(delay)  # Throttle the insertion rate

                # Keep going while the error count is tolerable (at most 10)
                if writer.number_errors <= 10:
                    continue
                print("Batch import stopped due to excessive errors:", writer.number_errors)
                break

        # Report objects the batch could not import
        failures = collection.batch.failed_objects
        if not failures:
            print("Batch insert completed successfully with no errors.")
        else:
            print(f"Number of failed imports: {len(failures)}")
            print("First failed object:", failures[0])
    except Exception as exc:
        print("Batch insert failed with error:", exc)


# Read the listings from Listings.json; on any failure a message is printed
# and `items` stays an empty list so the rest of the cell can still run.
items = []
try:
    with open('Listings.json', 'r', encoding='utf-8') as listings_file:
        loaded_items = json.load(listings_file)
    print("Loaded Listings.json successfully, total items:", len(loaded_items))
    items = loaded_items
except json.JSONDecodeError as decode_error:
    # The file was opened but its content is not valid JSON
    print("Failed to load JSON file due to decode error:", decode_error)
except Exception as load_error:
    # Anything else, e.g. the file is missing or unreadable
    print("Failed to load JSON file with error:", load_error)

# Get the Weaviate collection with debug message.
# collections.get() only builds a local handle and does not ask the server
# whether the collection exists, so existence is verified explicitly first.
# Without this check the fallback below would not fire for a missing collection.
try:
    if client.collections.exists("RealEstateListings"):
        collection = client.collections.get("RealEstateListings")
        print("Successfully retrieved collection 'RealEstateListings'.")
    else:
        print("Error retrieving collection 'RealEstateListings':", "collection does not exist")
        collection = None
except Exception as e:
    # Connection problems and similar failures end up here
    print("Error retrieving collection 'RealEstateListings':", e)
    collection = None

if collection is None:
    print("Exiting script as the collection could not be retrieved.")
else:
    # Process and collect chunks: every listing description is split into
    # chunks, and each chunk becomes one data row that repeats the listing's
    # metadata alongside the chunk text and its position.
    all_chunks = []
    for item in items:
        # A listing without description text yields no chunks, so skip it
        if not item.get("description_text"):
            print(f"Skipping item {item.get('id', 'unknown')} due to missing description.")
            continue

        description_chunks = text_splitter.split_text(item["description_text"])
        print(f"Item {item.get('id', 'unknown')} split into {len(description_chunks)} chunks.")

        for idx, chunk in enumerate(description_chunks):
            data_row = {
                "listing_id": str(item["id"]),
                "title": item.get("title", ""),
                "address": item.get("address", ""),
                "type_of_property": item.get("type_of_property", ""),
                # Numeric properties fall back to None (no value) when absent;
                # an empty string is not a valid value for a NUMBER property.
                "size_of_property": item.get("size_of_property", None),
                "price": item.get("price", None),
                "price_per_m": item.get("price_per_m", None),
                "price_per_month": item.get("price_per_month", None),
                "price_per_m_per_month": item.get("price_per_m_per_month", None),
                "description_text": chunk,
                "chunk_index": idx,
                "transaction_type": item.get("transaction_type", ""),
                "clean_phone_number": item.get("clean_phone_number", ""),
                "image": item.get("image", ""),
                "url": item.get("url", ""),
            }
            all_chunks.append(data_row)

    print("Total data rows prepared for batch insert:", len(all_chunks))

    # Call the batch insert function once with all collected chunks
    if all_chunks:
        add_listing_chunks(collection, all_chunks, delay=0)
    else:
        print("No valid data rows to insert.")


# Delete Collection

Deletes the already created collection.

In [None]:
# Delete the "RealEstateListings" collection through the connected client.
client.collections.delete("RealEstateListings")