In [None]:
# Install the Weaviate Python client, upgrading it to the latest release if it
# is already present (-U). The %pip magic is used so the package is installed
# into the environment of the running kernel.
%pip install -U weaviate-client

In [None]:
import weaviate, os
from weaviate.config import AdditionalConfig, Timeout
from dotenv import load_dotenv

# Load environment variables from a .env file (if one is present)
load_dotenv()

# Connection settings and model-provider keys, all read from the environment.
# Any of these is None when the corresponding variable is not set.
CLUSTER_URL = os.getenv("CLUSTER_URL")
API_KEY = os.getenv("API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Only forward provider keys that are actually configured: a header whose
# value is None is not a valid header, so unset providers are simply left out.
provider_headers = {
    header_name: header_value
    for header_name, header_value in (
        ("X-OpenAI-Api-Key", OPENAI_API_KEY),
        ("X-Cohere-Api-Key", COHERE_API_KEY),
        ("X-Goog-Api-Key", GOOGLE_API_KEY),
    )
    if header_value
}

# Connect to Weaviate
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=CLUSTER_URL,
    auth_credentials=weaviate.auth.AuthApiKey(API_KEY),
    headers=provider_headers,
    additional_config=AdditionalConfig(
        # Timeouts in seconds for the init, query and insert phases
        timeout=Timeout(init=30, query=60, insert=120)
    ),
)

# Collect the health/version facts once, then report them
ready = client.is_ready()
server_version = client.get_meta()["version"]
client_version = weaviate.__version__
live = client.is_live()
connected = client.is_connected()

print(f"Weaviate Ready: {ready}")
print(f"Weaviate Client Version: {client_version}")
print(f"Weaviate Server Version: {server_version}")
# Reuse the value captured above instead of issuing a second liveness request
print(f"Weaviate Live: {live}")
print(f"Client Connected: {connected}")

In [None]:
# --- Collection Count Check ---

# list_all() yields a mapping keyed by collection name; only the names are needed here.
_all = client.collections.list_all()
all_collections = [collection_name for collection_name in _all]
number_of_collections = len(all_collections)

print(f"Number of collections: {number_of_collections}")

# A very large number of collections can point to a schema-design anti-pattern,
# so anything above 100 triggers a multi-tenancy hint.
if number_of_collections <= 100:
    print("You have a reasonable number of collections.")
else:
    for warning_line in (
        "Warning: You have a lot of collections.",
        "Consider using Weaviate Multitenancy (multi-tenant collection) for better scalability,",
        "especially if many collections are configured identically but represent different users, groups etc... ",
    ):
        print(warning_line)

# --- End Collection Count Check ---

In [None]:
import requests

# Deletion strategies that the report below marks as OK.
ALLOWED_DELETE = {"TimeBasedResolution", "DeleteOnConflict"}

# requests needs an absolute URL, while CLUSTER_URL may be configured as a bare
# hostname or with a trailing slash. Normalize it before building the endpoint.
schema_base_url = (CLUSTER_URL or "").strip().rstrip("/")
if schema_base_url and not schema_base_url.lower().startswith(("http://", "https://")):
    schema_base_url = f"https://{schema_base_url}"

# Fetch the raw schema over REST; on any failure report it and fall back to an
# empty schema so the rest of the cell still runs.
try:
    resp = requests.get(
        f"{schema_base_url}/v1/schema",
        headers={"Authorization": f"Bearer {API_KEY}"},
        timeout=15,
    )
    resp.raise_for_status()
    schema = resp.json()
except Exception as e:
    print("Failed to fetch schema:", e)
    schema = {"classes": []}

# Build one summary line per collection from its replicationConfig, flagging
# async replication that is not enabled and deletion strategies outside ALLOWED_DELETE.
rows = []
for cls in schema.get("classes", []):
    replication = cls.get("replicationConfig", {}) or {}
    name = cls.get("class")
    factor = replication.get("factor")
    async_enabled = replication.get("asyncEnabled")
    deletion_strategy = replication.get("deletionStrategy") or replication.get("deletion_strategy")

    factor_text = 'N/A' if factor is None else factor
    async_hint = ' (OK)' if async_enabled is True else ' -> ENABLE True'
    if deletion_strategy in ALLOWED_DELETE:
        delete_hint = ' (OK)'
    else:
        delete_hint = ' -> SET TimeBasedResolution/DeleteOnConflict'

    rows.append(
        f"- {name}: factor={factor_text} | "
        f"async={str(async_enabled)}{async_hint} | "
        f"delete={deletion_strategy}{delete_hint}"
    )

# Emit the per-collection lines (nothing at all when there are no collections)
if rows:
    print("\n".join(rows))

# Static guidance printed after the report; empty strings produce blank lines
for guidance_line in (
    "",
    "Note: replication factor should match number of data nodes. 3 is common; if",
    "you run 5, 7, or 9 nodes, set factor to 5, 7, or 9 respectively.",
    "",
    "Deletion resolution strategies:",
    "- TimeBasedResolution: last write wins using timestamps (delete/create/update).",
    "- DeleteOnConflict: any replica reporting a delete deletes it cluster-wide.",
    "Use TimeBasedResolution for last-write-wins; use DeleteOnConflict to always honor deletes.",
):
    print(guidance_line)


In [None]:
# --- Configuration Similarity Check (REST schema) ---
import json
import requests

# requests needs an absolute URL, while CLUSTER_URL may be configured as a bare
# hostname or with a trailing slash. Normalize it before building the endpoint.
schema_base_url = (CLUSTER_URL or "").strip().rstrip("/")
if schema_base_url and not schema_base_url.lower().startswith(("http://", "https://")):
    schema_base_url = f"https://{schema_base_url}"

# Fetch the raw schema over REST; on any failure report it and fall back to an
# empty schema so the rest of the cell still runs.
try:
    resp = requests.get(
        f"{schema_base_url}/v1/schema",
        headers={"Authorization": f"Bearer {API_KEY}"},
        timeout=15,
    )
    resp.raise_for_status()
    schema = resp.json()
except Exception as e:
    print("Failed to fetch schema:", e)
    schema = {"classes": []}


def normalize_for_grouping(cls_obj):
    """Build a deterministic signature string from a collection's core settings.

    Two collections whose signatures are equal are treated as identically
    configured by the grouping code that calls this function.

    Included settings: replicationConfig, invertedIndexConfig, shardingConfig,
    the named vectors under vectorConfig (provider, selected vectorizer details,
    index type and index config), and the class-level vector settings
    (vectorizer, vectorIndexType, vectorIndexConfig and the moduleConfig entry
    of that vectorizer) so that collections without named vectors are also
    told apart by their vectorizer and index.

    Args:
        cls_obj: dict describing one collection (one entry of the schema's
            "classes" list). Missing or null sections are treated as empty.

    Returns:
        A compact JSON string with sorted keys.
    """
    rep = cls_obj.get("replicationConfig") or {}
    inv = cls_obj.get("invertedIndexConfig") or {}
    shard = cls_obj.get("shardingConfig") or {}
    vec = cls_obj.get("vectorConfig") or {}

    # Normalize each named vector, keeping the provider and its key details
    norm_vec = {}
    for name, cfg in sorted(vec.items()):
        cfg = cfg or {}  # tolerate a null entry for a named vector
        v = cfg.get("vectorizer") or {}
        if isinstance(v, dict) and v:
            # The vectorizer dict is keyed by provider name; use its first key
            provider = next(iter(v))
            details = v.get(provider)
            if not isinstance(details, dict):
                details = {}
            norm_vec[name] = {
                "provider": provider,
                "model": details.get("model"),
                "baseURL": details.get("baseURL"),
                "vectorizeClassName": details.get("vectorizeClassName"),
                "properties": details.get("properties", []),
                "vectorIndexType": cfg.get("vectorIndexType"),
                "vectorIndexConfig": cfg.get("vectorIndexConfig", {}),
            }
        else:
            norm_vec[name] = {
                "provider": v,
                "vectorIndexType": cfg.get("vectorIndexType"),
                "vectorIndexConfig": cfg.get("vectorIndexConfig", {}),
            }

    # Class-level vector settings. Without these, collections that do not use
    # named vectors would all get the same (empty) vector signature regardless
    # of their vectorizer or index settings.
    class_vectorizer = cls_obj.get("vectorizer")
    module_cfg = cls_obj.get("moduleConfig")
    if isinstance(module_cfg, dict) and isinstance(class_vectorizer, str):
        class_vectorizer_cfg = module_cfg.get(class_vectorizer) or {}
    else:
        class_vectorizer_cfg = {}

    norm = {
        "replicationConfig": rep,
        "invertedIndexConfig": inv,
        "shardingConfig": shard,
        "vectorConfig": norm_vec,
        "classVectorSettings": {
            "vectorizer": class_vectorizer,
            "vectorIndexType": cls_obj.get("vectorIndexType"),
            "vectorIndexConfig": cls_obj.get("vectorIndexConfig") or {},
            "moduleConfig": class_vectorizer_cfg,
        },
    }
    # Sorted keys and fixed separators make the signature order-independent
    return json.dumps(norm, sort_keys=True, separators=(",", ":"))

# Bucket collection names by configuration signature, then report the buckets.
classes = schema.get("classes", [])
if classes:
    groups = {}
    for class_def in classes:
        class_name = class_def.get("class")
        signature = normalize_for_grouping(class_def)
        if signature not in groups:
            groups[signature] = []
        groups[signature].append(class_name)

    if len(groups) != 1:
        # Several distinct configurations: list the members of each one
        print(f"Found {len(groups)} configuration groups:")
        group_number = 0
        for member_names in groups.values():
            group_number += 1
            print(f"  Group {group_number} ({len(member_names)}):", ", ".join(sorted(member_names)))
    else:
        # Exactly one signature: every collection shares the same core settings
        (only_group,) = groups.values()
        print("All collections share identical core settings (replication, inverted index, sharding, vector/vectorizer).")
        print("These look like candidates for a single multi-tenant collection.")
        print("Collections:", ", ".join(sorted(only_group)))
else:
    print("No collections found.")

# --- End Configuration Similarity Check ---


In [None]:
# --- Replication & Compression Check (REST schema) ---
import requests

# requests needs an absolute URL, while CLUSTER_URL may be configured as a bare
# hostname or with a trailing slash. Normalize it before building the endpoint.
schema_base_url = (CLUSTER_URL or "").strip().rstrip("/")
if schema_base_url and not schema_base_url.lower().startswith(("http://", "https://")):
    schema_base_url = f"https://{schema_base_url}"

# Fetch the raw schema over REST; on any failure report it and fall back to an
# empty schema so the rest of the cell still runs.
try:
    resp = requests.get(
        f"{schema_base_url}/v1/schema",
        headers={"Authorization": f"Bearer {API_KEY}"},
        timeout=15,
    )
    resp.raise_for_status()
    schema = resp.json()
except Exception as e:
    print("Failed to fetch schema:", e)
    schema = {"classes": []}

def _enabled_compression_keys(index_config):
    """Return the bq/pq/rq/sq keys whose sub-config has "enabled" set to True."""
    found = []
    for key in ("bq", "pq", "rq", "sq"):
        settings = index_config.get(key) or {}
        if settings.get("enabled") is True:
            found.append(key)
    return found


for cls in schema.get("classes", []):
    name = cls.get("class")
    factor = (cls.get("replicationConfig", {}) or {}).get("factor")

    # Compression can be configured on the class-level vectorIndexConfig ...
    which = _enabled_compression_keys(cls.get("vectorIndexConfig", {}) or {})
    # ... and on each named vector under vectorConfig
    for vinfo in (cls.get("vectorConfig", {}) or {}).values():
        named_index_config = (vinfo or {}).get("vectorIndexConfig", {}) or {}
        which.extend(_enabled_compression_keys(named_index_config))
    compression_enabled = bool(which)

    # Replication recommendation
    if factor in {5, 7, 9}:
        rep_rec = f"OK if cluster has {factor} data nodes; otherwise match node count"
    elif factor == 3:
        rep_rec = "OK for 3-node clusters; if >3 nodes, match node count"
    elif factor == 1:
        rep_rec = "use >= 3 and match number of data nodes (3/5/7/9)"
    else:
        rep_rec = "match replication factor to number of data nodes"

    # Compression recommendation
    if compression_enabled:
        comp_rec = "OK"
    else:
        comp_rec = "enable one of bq/pq/rq/sq for efficiency"

    # Distinct enabled methods, sorted, shown in parentheses when any were found
    methods_suffix = f" ({','.join(sorted(set(which)))})" if which else ""

    print(f"Collection {name}:")
    print(f"replication factor is {factor} -- recommendation is {rep_rec}")
    print(f"Compression is {compression_enabled}{methods_suffix} -- Recommendation is {comp_rec}")
    print()
# --- End Replication & Compression Check ---
