In [None]:
# download the weaviate client
%pip install -U weaviate-client

In [None]:
import os

import weaviate
from weaviate.config import AdditionalConfig, Timeout
from dotenv import load_dotenv

# Load environment variables from the .env file (values in .env win over the shell's)
load_dotenv(override=True)

# Retrieve environment variables
CLUSTER_URL = os.getenv("CLUSTER_URL")
API_KEY = os.getenv("API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Fail early with a readable message instead of an opaque connection/auth error
missing_settings = [
    setting_name
    for setting_name, setting_value in (("CLUSTER_URL", CLUSTER_URL), ("API_KEY", API_KEY))
    if not setting_value
]
if missing_settings:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(missing_settings)}"
    )

# Only forward the provider keys that are actually set: os.getenv returns None for
# an unset variable, and None is not a valid HTTP header value.
provider_headers = {
    header_name: header_value
    for header_name, header_value in (
        ("X-OpenAI-Api-Key", OPENAI_API_KEY),
        ("X-Cohere-Api-Key", COHERE_API_KEY),
        ("X-Goog-Api-Key", GOOGLE_API_KEY),
    )
    if header_value
}

# Connect to Weaviate
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=CLUSTER_URL,
    auth_credentials=weaviate.auth.AuthApiKey(API_KEY),
    headers=provider_headers,
    additional_config=AdditionalConfig(
        timeout=Timeout(init=30, query=60, insert=120)
    ),
)

# Query each status once and reuse the values in the report below
ready = client.is_ready()
server_version = client.get_meta()["version"]
client_version = weaviate.__version__
live = client.is_live()
connected = client.is_connected()

print(f"Weaviate Ready: {ready}")
print(f"Weaviate Client Version: {client_version}")
print(f"Weaviate Server Version: {server_version}")
print(f"Weaviate Live: {live}")
print(f"Client Connected: {connected}")

In [None]:
# Create "ShahinCollection": one named OpenAI vector, an OpenAI generative
# module, and a single text property.
from weaviate.classes.config import Configure
from weaviate.classes.config import Property, DataType, Tokenization

# Named vector "shahinembedding" produced by the text2vec-openai module
shahin_vectors = Configure.Vectors.text2vec_openai(
    name="shahinembedding",
    model="text-embedding-3-small",
    dimensions=1536,
)

# The only property: word-tokenized text, indexed for both search and filtering
shahin_content_property = Property(
    name="content",
    data_type=DataType.TEXT,
    description="Description of the property",
    vectorize_property_name=False,
    tokenization=Tokenization.WORD,
    index_searchable=True,
    index_filterable=True,
)

result = client.collections.create(
    name="ShahinCollection",
    vector_config=shahin_vectors,
    generative_config=Configure.Generative.openai(),
    properties=[shahin_content_property],
)

print(f"Collection created successfully: {result.name}")

In [None]:
# Create "VectorOnlyCollection": vectors are supplied by the caller at insert
# time (self-provided), so no vectorizer module is configured.
from weaviate.classes.config import Configure
from weaviate.classes.config import Property, DataType, Tokenization

# Single word-tokenized text property, indexed for both search and filtering
vector_only_content_property = Property(
    name="content",
    data_type=DataType.TEXT,
    description="Description of the property",
    vectorize_property_name=False,
    tokenization=Tokenization.WORD,
    index_searchable=True,
    index_filterable=True,
)

result = client.collections.create(
    name="VectorOnlyCollection",
    vector_config=Configure.Vectors.self_provided(),
    properties=[vector_only_content_property],
)

print(f"Collection created successfully: {result.name}")

In [None]:
# Create a multi-tenant collection
# Every name this cell uses is imported here so the cell does not rely on an
# earlier cell having been run first.
from weaviate.classes.config import Configure
from weaviate.classes.config import Property, DataType, Tokenization

collection = client.collections.create(
    name="ShahinMultiTenantCollection",
    vector_config=Configure.Vectors.text2vec_openai(
        name="default",
        model="text-embedding-3-small",
        dimensions=1536
    ),
    generative_config=Configure.Generative.openai(),
    # Multi-tenancy is on, and auto_tenant_creation is enabled as well
    multi_tenancy_config=Configure.multi_tenancy(
        enabled=True,
        auto_tenant_creation=True,
    ),
    properties=[
        # Free-text fields: searchable and filterable, word-tokenized
        Property(
            name="title",
            data_type=DataType.TEXT,
            description="Title of the item",
            vectorize_property_name=False,
            tokenization=Tokenization.WORD,
            index_searchable=True,
            index_filterable=True
        ),
        Property(
            name="description",
            data_type=DataType.TEXT,
            description="Description of the item",
            vectorize_property_name=False,
            tokenization=Tokenization.WORD,
            index_searchable=True,
            index_filterable=True
        ),
        # Filter-only fields
        Property(
            name="value",
            data_type=DataType.NUMBER,
            description="Numeric value of the item",
            index_filterable=True
        ),
        Property(
            name="status",
            data_type=DataType.TEXT,
            description="Status of the item",
            index_filterable=True
        ),
        # NOTE(review): stored as TEXT, so filters compare strings rather than dates;
        # confirm this is intended before relying on range queries over it.
        Property(
            name="timestamp",
            data_type=DataType.TEXT,
            description="Timestamp of the item",
            index_filterable=True
        )
    ]
)

print(f"Collection created successfully: {collection.name}")

In [None]:
# Example "Product" collection whose inverted index is tuned per property and
# at collection level.
from weaviate.classes.config import (
    Configure,
    DataType,
    Property,
    Tokenization,
    StopwordsPreset
)

# --- Per-property index settings ---------------------------------------------
product_properties = [
    # name: BM25-searchable (e.g. "Smartphones") and filterable (e.g. Name == X)
    Property(
        name="name",
        data_type=DataType.TEXT,
        index_searchable=True,
        index_filterable=True,
        tokenization=Tokenization.WORD,
    ),
    # sku_id: filter-only; FIELD tokenization for exact matches, no BM25 on a SKU
    Property(
        name="sku_id",
        data_type=DataType.TEXT,
        index_searchable=False,
        index_filterable=True,
        tokenization=Tokenization.FIELD,
    ),
    # price: equality filters ("Price == 100") plus a range index ("Price < 50")
    Property(
        name="price",
        data_type=DataType.NUMBER,
        index_filterable=True,
        index_range_filters=True,
    ),
    # description: search-only; never filtered with "Where description == ..."
    Property(
        name="description",
        data_type=DataType.TEXT,
        index_searchable=True,
        index_filterable=False,
    ),
]

# --- Collection-level metadata and global search behavior --------------------
product_inverted_index = Configure.inverted_index(
    # Metadata indexes
    index_null_state=True,         # allows finding products with missing data
    index_timestamps=True,         # sort by latest update/creation
    index_property_length=False,   # disabled: no filtering by character count here

    # Maintenance: cleanup interval of one minute
    cleanup_interval_seconds=60,

    # Search relevance and stopwords
    stopwords_preset=StopwordsPreset.EN,                    # standard English list
    stopwords_additions=["promotion", "sale", "seasonal"],  # domain noise words
    stopwords_removals=["the"],                             # keep 'the' indexed
    bm25_k1=1.2,
    bm25_b=0.75,
)

result = client.collections.create(
    "Product",
    properties=product_properties,
    inverted_index_config=product_inverted_index,
)

print(f"Collection created successfully: {result.name}")

In [None]:
# Insert one object into "Product" using consistency level ALL, then list
# everything stored in the collection.
import weaviate.classes as wvc

# Collection handle bound to consistency level ALL
collection = client.collections.get("Product").with_consistency_level(
    wvc.config.ConsistencyLevel.ALL
)

# The object to store
new_product = {
    "name": "Smartphone abc",
    "sku_id": "sku_abc123",
    "price": 799.99,
    "description": "A cutting-edge smartphone with a sleek design",
}
uuid = collection.data.insert(new_product)

print(uuid)  # insert() returns the new object's UUID

# Walk the whole collection; total_objects ends up as the number of objects seen
total_objects = 0
for total_objects, item in enumerate(
    collection.iterator(cache_size=200, include_vector=True), start=1
):
    print(f"Object: {total_objects}")
    print(item.uuid, item.properties)
print(f"Total objects fetched: {total_objects}")

In [None]:
# Insert one object into "VectorOnlyCollection" together with a vector computed
# outside Weaviate, then list the collection including vectors.
import weaviate.classes as wvc

collection = client.collections.get("VectorOnlyCollection")

# Example pre-computed vector of dimension 1536
my_vector = [0.01] * 1536

object_properties = {
    "content": "Austria is a country in Europe with a rich cultural heritage."
}
uuid = collection.data.insert(
    properties=object_properties,
    vector=my_vector,  # the precomputed vector is passed alongside the properties
)

print(uuid)  # insert() returns the new object's UUID

# Walk the whole collection; total_objects ends up as the number of objects seen
total_objects = 0
for total_objects, item in enumerate(
    collection.iterator(cache_size=200, include_vector=True), start=1
):
    print(f"Object: {total_objects}")
    print(item.uuid, item.properties, item.vector)
print(f"Total objects fetched: {total_objects}")

In [None]:
# Extract a collection config, modify it, and recreate it under a new name.
#
# The placeholders are defined once here so the collection that is read and the
# collection that is deleted can never drift apart when they are filled in.
SOURCE_COLLECTION = "<Existing_Collection>"
NEW_COLLECTION = "<New_Collection_Name>"
PROPERTIES_TO_REMOVE = ["<PROP_NAME>", "<PROP_NAME>"]

# 1. Get the collection and extract its config
collection = client.collections.use(SOURCE_COLLECTION)
collection_config = collection.config.get()
print(collection_config)

# 2. Convert config to dictionary
config_dict = collection_config.to_dict()
print(config_dict)

# 3. Modify the dictionary (remove properties)
# Keep only the properties whose name is not listed in PROPERTIES_TO_REMOVE
config_dict["properties"] = [
    prop for prop in config_dict["properties"]
    if prop["name"] not in PROPERTIES_TO_REMOVE
]

# 4. Change the collection name
config_dict["class"] = NEW_COLLECTION

# 5. Delete old collection (if needed)
# WARNING: this removes the source collection before the new one is created;
# skip this step if the original must be kept.
client.collections.delete(SOURCE_COLLECTION)

# 6. Create new collection from modified config
new_collection = client.collections.create_from_dict(config_dict)

In [None]:
# Inspect the raw content that Weaviate passes to the vectorizer.
# The OpenAI vectorizer's base_url is pointed at a webhook.site URL, so the
# request made for vectorization can be viewed there.
from weaviate.classes.config import Configure
from weaviate.classes.config import Property, DataType, Tokenization

# Placeholders, defined once. The two properties need distinct names: a repeated
# name would declare the same property twice and collapse to a single key in the
# insert dict below.
COLLECTION_NAME = "<collection-name>"
WORD_PROPERTY = "<PROPERTY-NAME-1>"
FIELD_PROPERTY = "<PROPERTY-NAME-2>"

client.collections.create(
    name=COLLECTION_NAME,
    # `vector_config` takes a Configure.Vectors.* entry, as in the other cells
    vector_config=Configure.Vectors.text2vec_openai(
        base_url="https://webhook.site/<ID>" # Webhook URL from webhook.site
    ),
    generative_config=Configure.Generative.openai(),
    inverted_index_config=Configure.inverted_index(
        index_timestamps = True
    ),
    replication_config=Configure.replication(factor=3, async_enabled=True),
    properties=[
        Property(
            name=WORD_PROPERTY,
            data_type=DataType.TEXT,
            tokenization=Tokenization.WORD,
        ),
        Property(
            name=FIELD_PROPERTY,
            data_type=DataType.TEXT,
            tokenization=Tokenization.FIELD,
        )
    ]
)

# Fetch the collection that was just created (by collection name, not property name)
coll = client.collections.get(COLLECTION_NAME)
result = coll.data.insert(properties={WORD_PROPERTY: "<DATA>", FIELD_PROPERTY: "<DATA>"})
print(result)

# On webhook.site, open the captured request: the "Raw Content" part of the
# "Request Content" section shows the text exactly as it was sent to the vectorizer.

In [None]:
# generate_uuid5 derives a deterministic UUID from the value it is given, so the
# same source record always maps to the same object ID.
# With batch insertion this means an object whose UUID already exists is not added
# a second time, so no duplicate entries end up in the collection.
# For example, a first run might add 100 unique objects; running the same code
# again leaves the object count unchanged because the UUIDs are identical.
# NOTE(review): the batch endpoint appears to overwrite (upsert) an existing UUID
# rather than literally skip it - confirm against the Weaviate batch-import docs.

from tqdm import tqdm
from weaviate.util import generate_uuid5

sample_100 = data_2k[0:100] # Assuming data_2k is a list of 2000 items

coll = client.collections.get("<COLLECTION-NAME>")

with coll.batch.fixed_size(batch_size=20, concurrent_requests=2) as batch:
    for item in tqdm(sample_100):
        # Deterministic ID from the record's own identifier
        # (named object_uuid so the builtin `id` is not shadowed)
        object_uuid = generate_uuid5(item["PROPERTY_ID"])

        batch.add_object(
            item,
            uuid=object_uuid
        )

# Batch errors are collected rather than raised, so surface them explicitly
failed_objects = coll.batch.failed_objects
if failed_objects:
    print(f"Failed to import {len(failed_objects)} object(s)")
    print(f"First failure: {failed_objects[0].message}")

print(f"Object count: {len(coll)}")

In [None]:
# Insert one object into "MyCollection" using consistency level ALL, then list
# every object together with the length of one of its named vectors.
import weaviate.classes as wvc

# Collection handle bound to consistency level ALL
collection = client.collections.get("MyCollection").with_consistency_level(
    wvc.config.ConsistencyLevel.ALL
)

# The object to store
new_object = {"MyProperty": "PROPERTY_VALUE"}
uuid = collection.data.insert(new_object)

print(uuid)  # insert() returns the new object's UUID

# Walk the whole collection; total_objects ends up as the number of objects seen
total_objects = 0
for total_objects, item in enumerate(
    collection.iterator(cache_size=200, include_vector=True), start=1
):
    # item.vector is indexed by vector name; replace the key with your own
    vector = item.vector["VECTOR_EMB_VAR"]
    print(f"Object: {total_objects}")
    print(item.uuid, item.properties)
    print(f"Vector dimensionality: {len(vector)}")
print(f"Total objects fetched: {total_objects}")

In [None]:
# Create a collection whose vector index is "dynamic": it is configured with both
# a flat and an HNSW index definition plus a threshold for switching between them.
from weaviate.classes.config import Configure, Property, DataType, VectorDistances

# The index definition. This is a vector *index* config, so it cannot be passed as
# `vector_config` directly; it is attached to a Configure.Vectors.* entry below.
# NOTE(review): the Weaviate docs state that dynamic indexes need async indexing
# enabled on the server - confirm for the target cluster.
dynamic_index_config = Configure.VectorIndex.dynamic(
    distance_metric=VectorDistances.COSINE,
    threshold=25000, # Threshold for switching from flat to HNSW
    flat=Configure.VectorIndex.flat(
        distance_metric=VectorDistances.COSINE,
        quantizer=Configure.VectorIndex.Quantizer.bq(cache=True),
        vector_cache_max_objects=1000000,
    ),
    hnsw=Configure.VectorIndex.hnsw(
        distance_metric=VectorDistances.COSINE,
        ef_construction=256,
        max_connections=128,
        quantizer=Configure.VectorIndex.Quantizer.sq(training_limit=50000),
        ef=-1,
        dynamic_ef_factor=15,
        dynamic_ef_min=200,
        dynamic_ef_max=1000,
        vector_cache_max_objects=1000000,
    ),
)

result = client.collections.create(
    name="MyDynamicCollection",
    properties=[
        Property(name="title", data_type=DataType.TEXT),
        Property(name="body", data_type=DataType.TEXT),
    ],
    # No vectorizer was specified for this collection, so self-provided vectors are
    # used here; swap in e.g. Configure.Vectors.text2vec_openai(...) with the same
    # vector_index_config if Weaviate should compute the embeddings.
    vector_config=Configure.Vectors.self_provided(
        vector_index_config=dynamic_index_config,
    ),
)

print(f"Collection created: {result}")