Getting Started with Weaviate

In [36]:
import weaviate
import os

In [37]:
client = weaviate.Client(
    url=os.getenv("WEAVIATE_CLUSTER_URL"),
    additional_headers={
        "Authorization": f"Bearer {os.getenv('WEAVIATE_API_KEY')}",
        "Content-Type": "application/json",
        "X-Cohere-Api-Key": os.getenv("COHERE_API_KEY"),
    },
)

            your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.

            For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
            For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration
            


In [39]:
# client = weaviate.connect_to_wcs(
#     # Replace with your Weaviate Cloud URL
#     cluster_url=os.getenv("WEAVIATE_CLUSTER_URL"),
#     auth_credentials=weaviate.auth.AuthApiKey(
#         os.getenv("WEAVIATE_API_KEY")
#     ),  # Replace with your Weaviate Cloud key
#     # Replace with your OpenAI API key
#     headers={"X-OpenAI-Api-key": os.getenv("OPEN_API_KEY")},
# )

To check if everything is set up correctly, run:



In [38]:
client.is_ready()

True

How to Create and Populate a Weaviate Vector Database

In [4]:
import pandas as pd

df = pd.read_csv("jeopardy_questions.csv", nrows=100)

In [5]:
df

Unnamed: 0,show_number,air_date,round,category,value,question,answer
0,4680,12/31/04,Jeopardy!,HISTORY,200.0,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,12/31/04,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,200.0,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,12/31/04,Jeopardy!,EVERYBODY TALKS ABOUT IT...,200.0,The city of Yuma in this state has a record av...,Arizona
3,4680,12/31/04,Jeopardy!,THE COMPANY LINE,200.0,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,12/31/04,Jeopardy!,EPITAPHS & TRIBUTES,200.0,"Signer of the Dec. of Indep., framer of the Co...",John Adams
...,...,...,...,...,...,...,...
95,5957,7/6/10,Double Jeopardy!,SEE & SAY,800.0,"Say <a href=""http://www.j-archive.com/media/20...",Oregon
96,5957,7/6/10,Double Jeopardy!,NEWS TO ME,800.0,This car company has been in the news for wide...,Toyota
97,5957,7/6/10,Double Jeopardy!,IN THE DICTIONARY,800.0,"As an adjective, it can mean proper; as a verb...",correct
98,5957,7/6/10,Double Jeopardy!,SCIENCE CLASS,1200.0,The wedge is an adaptation of the simple machi...,plane


Step 1: Create a Schema

In [10]:
class_obj = {
    # Class definition
    "class": "JeopardyQuestionCohere",
    # Property definitions
    "properties": [
        {
            "name": "category",
            "dataType": ["text"],
        },
        {
            "name": "question",
            "dataType": ["text"],
        },
        {
            "name": "answer",
            "dataType": ["text"],
        },
    ],
    # Specify a vectorizer
    "vectorizer": "text2vec-cohere",
    # Module settings
    "moduleConfig": {
        "embed-multilingual-v3.0": {
            "vectorizeClassName": False,
            "model": "ada",
            "modelVersion": "002",
            "type": "text",
        },
    },
}

 create the class with the create_class() method.

In [11]:
client.schema.create_class(class_obj)

In [12]:
client.schema.get("JeopardyQuestionCohere")

{'class': 'JeopardyQuestionCohere',
 'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
  'cleanupIntervalSeconds': 60,
  'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
 'moduleConfig': {'embed-multilingual-v3.0': {'model': 'ada',
   'modelVersion': '002',
   'type': 'text',
   'vectorizeClassName': False},
  'text2vec-cohere': {'vectorizeClassName': True}},
 'multiTenancyConfig': {'enabled': False},
 'properties': [{'dataType': ['text'],
   'indexFilterable': True,
   'indexSearchable': True,
   'moduleConfig': {'text2vec-cohere': {'skip': False,
     'vectorizePropertyName': False}},
   'name': 'category',
   'tokenization': 'word'},
  {'dataType': ['text'],
   'indexFilterable': True,
   'indexSearchable': True,
   'moduleConfig': {'text2vec-cohere': {'skip': False,
     'vectorizePropertyName': False}},
   'name': 'question',
   'tokenization': 'word'},
  {'dataType': ['text'],
   'indexFilterable': True,
   'indexSearchable': True,
   'moduleConfig': {'t

Step 2: Import data into Weaviate

At this stage, the vector database has a schema but is still empty. So, let’s populate it with our dataset. This process is also called “upserting”.

In [15]:
from weaviate.util import generate_uuid5

with client.batch(batch_size=200, num_workers=2) as batch:
    for _, row in list(df.iterrows())[:1]:
        question_object = {
            "category": row.category,
            "question": row.question,
            "answer": row.answer,
        }
        batch.add_data_object(
            question_object,
            class_name="JeopardyQuestionCohere",
            uuid=generate_uuid5(question_object),
        )

For a sanity check, you can review the number of imported objects with the following code snippet:

In [16]:
client.query.aggregate("JeopardyQuestionCohere").with_meta_count().do()

{'data': {'Aggregate': {'JeopardyQuestionCohere': [{'meta': {'count': 1}}]}}}

How to Query the Weaviate Vector Database

Let’s retrieve some entries from the JeopardyQuestion class with the get() function to see what they look like.

In [39]:
import json

res = (
    client.query.get("JeopardyQuestionCohere", ["question", "answer"])
    .with_additional(["id", "vector"])
    .with_limit(50)
    .do()
)

res_data = json.dumps(res, indent=4)
print(res_data)

{
    "data": {
        "Get": {
            "JeopardyQuestionCohere": [
                {
                    "_additional": {
                        "id": "d34a1d4d-5812-5180-b95b-460b69000c1d",
                        "vector": [
                            -0.0073928833,
                            0.049468994,
                            0.0061187744,
                            0.00881958,
                            -0.00029706955,
                            -0.008354187,
                            -0.011779785,
                            -0.05847168,
                            -0.0028362274,
                            -0.00869751,
                            -0.017227173,
                            -0.012428284,
                            -0.016677856,
                            -0.022247314,
                            -0.036834717,
                            -0.007369995,
                            -0.02659607,
                            0.024673462,
             

In [43]:
res_list = res.get("data").get("Get").get("JeopardyQuestionCohere")
print(len(res_list))

1


In [44]:
for res_dict in res_list:
    print("====> id ====>",res_dict.get("_additional").get("id"))
    print("===> Vector ====>",res_dict.get("_additional").get("vector"))
    print("===> answer ====>",res_dict.get("answer"))
    print("===> question ====>", res_dict.get("question"))
    print("===> category ====>", res_dict.get("category"))

====> id ====> d34a1d4d-5812-5180-b95b-460b69000c1d
===> Vector ====> [-0.0073928833, 0.049468994, 0.0061187744, 0.00881958, -0.00029706955, -0.008354187, -0.011779785, -0.05847168, -0.0028362274, -0.00869751, -0.017227173, -0.012428284, -0.016677856, -0.022247314, -0.036834717, -0.007369995, -0.02659607, 0.024673462, 0.0072784424, 0.05053711, 0.048553467, 0.004550934, -0.055664062, -0.02545166, -0.017654419, 0.026306152, 0.014755249, -0.02142334, 0.06530762, 0.012748718, 0.028686523, -0.030654907, -0.019332886, 0.02330017, -0.00856781, 0.015823364, 0.009757996, -0.02218628, -0.0006055832, 0.025680542, -0.009124756, 0.013771057, -0.040893555, 0.033569336, -0.05203247, 0.018814087, 0.03475952, 0.0546875, 0.0006942749, 0.05432129, 0.012702942, -0.0033187866, 0.014625549, 0.01675415, -0.004310608, 0.008979797, 0.029220581, 0.02999878, 0.030700684, 0.034698486, 0.0029354095, -0.01676941, -0.0032043457, 0.011436462, 0.016204834, 0.025054932, 0.036590576, 0.019699097, 0.016067505, 0.01751709