# Collection setup and data load

## Get keys and urls

In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

WEAVIATE_URL = os.getenv("WEAVIATE_URL")
WEAVIATE_KEY = os.getenv("WEAVIATE_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

print(WEAVIATE_URL)
print(WEAVIATE_KEY)
print(OPENAI_API_KEY)

if(WEAVIATE_URL == "UPDATE_ME_WEAVIATE_URL"):
    raise Exception("Please update .env and Restart the notebook (see Restart button, next to Run All)")

https://3gr2a6vetrmhdtkhud1ryg.c0.europe-west3.gcp.weaviate.cloud
vkZQeDJjZKaPKIOcQDssGPARPsAjzV9YpqsH
sk-cWzmxOAnwcNG3ii1cmm44maziAEovRo4vT5f-aGy3dT3BlbkFJ8QHwsLfRWkNVnXhpyCAuxsE3qRMBfAFkKAh5xKibUA


## Connect to Weaviate

You need to pass in your OpenAI key, which will be used to vectorise your data.

In [2]:
import weaviate
from weaviate.classes.init import Auth
# from weaviate.classes.init import AdditionalConfig, Timeout

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_KEY),

    headers = {
        "X-OpenAI-Api-Key": OPENAI_API_KEY
    },

    # additional_config=AdditionalConfig(
    #     timeout=Timeout(init=2, query=45, insert=120),  # Values in seconds
    # )
)

client.is_ready()



True

## Create a collection with a vectorizer

* [Weaviate Docs - collection creation and configuration](https://weaviate.io/developers/weaviate/manage-data/collections)
* [OpenAI integrated embedding models](https://weaviate.io/developers/weaviate/model-providers/openai/embeddings)

Examples of other embedding models:
* [Cohere](https://weaviate.io/developers/weaviate/model-providers/cohere/embeddings)
* [HuggingFace 🤗](https://weaviate.io/developers/weaviate/model-providers/huggingface/embeddings)
* [Ollama (self-hosted)](https://weaviate.io/developers/weaviate/model-providers/ollama/embeddings)

In [3]:
from weaviate.classes.config import Configure

if client.collections.exists("Jeopardy"):
    client.collections.delete("Jeopardy")

# Create a collection - with OpenAI vectorizer
client.collections.create(
    name="Jeopardy",
    # TODO: add text2vec_openai vectorizer - with:
    # * model - text-embedding-3-small
    vectorizer_config=Configure.Vectorizer.text2vec_openai(model="text-embedding-3-small"),  # Specify the OpenAI embedding model
)

<weaviate.collections.collection.sync.Collection at 0x7ad8a4222480>

## Import data
### Sample Data

In [4]:
import json

with open("./jeopardy_tiny.json") as file:
    data_10 = json.load(file)

print(json.dumps(data_10[0:2], indent=2))

[
  {
    "Category": "SCIENCE",
    "Question": "This organ removes excess glucose from the blood & stores it as glycogen",
    "Answer": "Liver"
  },
  {
    "Category": "ANIMALS",
    "Question": "It's the only living mammal in the order Proboseidea",
    "Answer": "Elephant"
  }
]


### Insert Many

> `insert_many` is only used for inserting small batches of data - must complete within the timeout.

[Weaviate Docs - insert many](https://weaviate.io/developers/weaviate/manage-data/import)

In [5]:
# Insert data

# TODO: get Jeopardy collection
# TODO: insert data_10

jeopardy = client.collections.get("Jeopardy")
jeopardy.data.insert_many(data_10)

BatchObjectReturn(_all_responses=[UUID('e3b2d3a8-3ed1-4d4e-b76b-d9b5dca68d18'), UUID('10b708fb-a63e-47d7-8b6e-7ae95dae54dd'), UUID('6df7f503-0701-455d-adae-32d8ed26ccab'), UUID('5d991570-f737-4040-81e3-c129d5e8de88'), UUID('b7178488-72f3-4977-85b1-6602219e2e4b'), UUID('e423a47e-3b2e-4aec-a97b-a8f26cb24bfb'), UUID('182fe2d6-7d28-4d11-987e-2aa880bd7a90'), UUID('f0c98346-b87a-4d36-9167-dadda4d2721e'), UUID('268b6240-48db-4816-a330-f43f2033545b'), UUID('b660cc77-dc20-4bc6-8967-9b96fb730bea')], elapsed_seconds=1.0024993419647217, errors={}, uuids={0: UUID('e3b2d3a8-3ed1-4d4e-b76b-d9b5dca68d18'), 1: UUID('10b708fb-a63e-47d7-8b6e-7ae95dae54dd'), 2: UUID('6df7f503-0701-455d-adae-32d8ed26ccab'), 3: UUID('5d991570-f737-4040-81e3-c129d5e8de88'), 4: UUID('b7178488-72f3-4977-85b1-6602219e2e4b'), 5: UUID('e423a47e-3b2e-4aec-a97b-a8f26cb24bfb'), 6: UUID('182fe2d6-7d28-4d11-987e-2aa880bd7a90'), 7: UUID('f0c98346-b87a-4d36-9167-dadda4d2721e'), 8: UUID('268b6240-48db-4816-a330-f43f2033545b'), 9: UUID('b

### Data preview

In [6]:
# Show data preview
jeopardy = client.collections.get("jeopardy")

# TODO: fetch 4 objects
response = jeopardy.query.fetch_objects(limit=4)

for item in response.objects:
    print(item.uuid, item.properties)

10b708fb-a63e-47d7-8b6e-7ae95dae54dd {'answer': 'Elephant', 'question': "It's the only living mammal in the order Proboseidea", 'category': 'ANIMALS'}
182fe2d6-7d28-4d11-987e-2aa880bd7a90 {'answer': 'wire', 'question': 'A metal that is ductile can be pulled into this while cold & under pressure', 'category': 'SCIENCE'}
268b6240-48db-4816-a330-f43f2033545b {'answer': 'the atmosphere', 'question': 'Changes in the tropospheric layer of this are what gives us weather', 'category': 'SCIENCE'}
5d991570-f737-4040-81e3-c129d5e8de88 {'answer': 'Antelope', 'question': 'Weighing around a ton, the eland is the largest species of this animal in Africa', 'category': 'ANIMALS'}


In [7]:
# Show data preview - with vectors
response = jeopardy.query.fetch_objects(
    limit=4, 
    include_vector=True
    # TODO: add include_vectors
)

for item in response.objects:
    print(item.properties)
    print(item.vector, '\n')

{'answer': 'Elephant', 'question': "It's the only living mammal in the order Proboseidea", 'category': 'ANIMALS'}
{'default': [0.08585705608129501, -0.00703099649399519, 0.026081981137394905, 0.04719817638397217, -0.05564465373754501, 0.07982385158538818, 0.014816142618656158, 0.01457249466329813, -0.011973578482866287, -0.010639313608407974, 0.02765989489853382, -0.024550477042794228, 0.01077273953706026, 0.004606114700436592, 0.0010463828220963478, -0.04993631690740585, 0.019190210849046707, 0.01842445880174637, 0.030305219814181328, 0.0497506819665432, 0.008980183862149715, 0.0018621698254719377, 0.022508470341563225, -0.00701359286904335, 0.044390417635440826, 0.012159215286374092, -0.08098408579826355, 0.0016214220086112618, 0.08478964120149612, -0.019712315872311592, 0.056294381618499756, -0.01949187181890011, 0.031140584498643875, 0.003208037232980132, -0.013899561017751694, -0.005685129202902317, 0.014514482580125332, 0.0044987937435507774, -0.00026703428011387587, 0.0185056757

### Super quick query example

In [8]:
# TODO: add near text query, search for African animals with limit 2
response = jeopardy.query.near_text(
    "Afrikansiche Tiere", 
    limit=2
)

for item in response.objects:
    print(item.properties)

{'answer': 'Antelope', 'question': 'Weighing around a ton, the eland is the largest species of this animal in Africa', 'category': 'ANIMALS'}
{'answer': 'Elephant', 'question': "It's the only living mammal in the order Proboseidea", 'category': 'ANIMALS'}


## A bit bigger example - 10k objects

### Load data

In [9]:
import json

with open("./wiki-10k.json") as file:
    data_10k = json.load(file)

print(json.dumps(data_10k[0:2], indent=2))

[
  {
    "text": "At this point in the siege, Lee's army had strengthened the Petersburg line. They dug breastworks out of rifle pits. At night, with pick and shovel, they then turned the breastworks into  deep trenches. Pointed stakes turned outwards were designed to break up any frontal attacks. The area between the two lines became a no man's land. The summer that year was hot and dry. Streams and springs were quickly drying up causing a water shortage on both sides. The siege was quickly becoming a stalemate.",
    "title": "Siege of Petersburg",
    "url": "https://simple.wikipedia.org/wiki/Siege%20of%20Petersburg",
    "wiki_id": "20231101.simple_550339_9"
  },
  {
    "text": "1944  Holocaust: Anne Frank and her family are placed on the last transport train from the Westerbork transit camp to Auschwitz.",
    "title": "September 3",
    "url": "https://simple.wikipedia.org/wiki/September%203",
    "wiki_id": "20231101.simple_8532_17"
  }
]


### Create a collection with Named Vectors and SourceProperties

In [10]:
from weaviate.classes.config import Configure, Property, DataType

def create_wiki_collection():
    if client.collections.exists("Wiki"):
        client.collections.delete("Wiki")

    # Create a collection here - with OpenAI vectorizer and define source properties
    client.collections.create(
        name="Wiki",

        vectorizer_config=[
            # NOTE: we are using NamedVectors here
            Configure.NamedVectors.text2vec_openai(
                name="main_vector",

                model="text-embedding-3-small",

                # TODO: set source properties to "title" and "text"
                source_properties=["title", "text"] # which properties should be used to generate a vector
            )
        ],

        # Example: how to define property schema (Optional)
        properties=[  
            Property(name="title", data_type=DataType.TEXT),
            Property(name="text", data_type=DataType.TEXT),
            Property(name="url", data_type=DataType.TEXT),
            Property(name="wiki_id", data_type=DataType.TEXT),
        ],
    )

create_wiki_collection()

### Import data - 10k objects with Batch

Batch speeds up the import process by grouping objects to be added in bigger batch groups.

Batch creates an internal buffer to collect objects to be added.<br>
Each time the buffer count reaches `batch_size`, batch sends the new objects to Weaviate.

Types of batch:
* `dynamic` - let batch calculate the optimal batch_size based on detected latency
* `fixed_size` - provide a fixed batch_size
* `rate_limit` - limit the number of requests (per minute), useful for working with models with a rate limit

### Take 1 – import sample 100

In [12]:
from tqdm import tqdm

sample_100 = data_10k[0:100]

wiki = client.collections.get("Wiki")

# TODO: setup dynamic batch
# loop through the sample_100 data
# add each object to the batch

with wiki.batch.fixed_size(100) as batch:
    for item in sample_100:
        print(item)
        batch.add_object(item)

print(f"Wiki count: {len(wiki)}")

{'text': "At this point in the siege, Lee's army had strengthened the Petersburg line. They dug breastworks out of rifle pits. At night, with pick and shovel, they then turned the breastworks into  deep trenches. Pointed stakes turned outwards were designed to break up any frontal attacks. The area between the two lines became a no man's land. The summer that year was hot and dry. Streams and springs were quickly drying up causing a water shortage on both sides. The siege was quickly becoming a stalemate.", 'title': 'Siege of Petersburg', 'url': 'https://simple.wikipedia.org/wiki/Siege%20of%20Petersburg', 'wiki_id': '20231101.simple_550339_9'}
{'text': '1944  Holocaust: Anne Frank and her family are placed on the last transport train from the Westerbork transit camp to Auschwitz.', 'title': 'September 3', 'url': 'https://simple.wikipedia.org/wiki/September%203', 'wiki_id': '20231101.simple_8532_17'}
{'text': 'A brownout is a drop of voltage in an electrical power supply. Brownouts may 

In [13]:
# check for errors
if(len(wiki.batch.failed_objects)>0):
    print("Import complete with errors")
    for err in wiki.batch.failed_objects:
        print(err)
else:
    print("Import complete with no errors")

Import complete with no errors


### Take 2 – import sample 100 – with UUID

To avoid inserting duplicates, you can generate a UUID based on the whole object or a unique property.

In [14]:
from weaviate.util import generate_uuid5

print(generate_uuid5("This UUID is always the same"))
print(generate_uuid5("This UUID is always the same"))
print(generate_uuid5("This UUID is always the same"))
print("====================================")

print(generate_uuid5("This UUID is different"))
print(generate_uuid5("This UUID is different"))
print("====================================")

obj1 = { "title": "this is an object", "count": 1 }
obj2 = { "title": "this is an object", "count": 2 }
print(generate_uuid5(obj1))
print(generate_uuid5(obj2))


8d3441c0-c1d1-5859-8a5e-efce9e7d3bd8
8d3441c0-c1d1-5859-8a5e-efce9e7d3bd8
8d3441c0-c1d1-5859-8a5e-efce9e7d3bd8
09f975a6-0e62-565a-982e-e6ce148eac86
09f975a6-0e62-565a-982e-e6ce148eac86
c3c3ad32-fa65-5944-a021-415f8fda02af
4d0b77d3-4862-59bc-bf9f-9fe2b9bf89f0


In [18]:
# recreate the collection to start again
create_wiki_collection()

> Rerun the import script multiple times.

> Starting from the second run, the script should finish a lot faster, and the wiki count shouldn't increase.

In [19]:
from tqdm import tqdm
from weaviate.util import generate_uuid5

sample_100 = data_10k[0:100]

wiki = client.collections.get("Wiki")

with wiki.batch.fixed_size(batch_size=20, concurrent_requests=2) as batch:
    for item in tqdm(sample_100):
        # TODO: generate an id from item["wiki_id"]
        id = generate_uuid5(item["wiki_id"])

        batch.add_object(
            item,
            # TODO: provide the new id here 
            uuid=id
        )

print(f"Wiki count: {len(wiki)}")

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:01<00:00, 77.76it/s]


Wiki count: 100


### Take 2 - import the rest of the data - but break if multiple errors

In [20]:
from tqdm import tqdm
from weaviate.util import generate_uuid5

wiki = client.collections.get("Wiki")

with wiki.batch.fixed_size(batch_size=2000, concurrent_requests=2) as batch:
    for item in tqdm(data_10k):
        id = generate_uuid5(item["wiki_id"])
        batch.add_object(item, uuid=id)

        # Check number of errors while running
        if(batch.number_errors > 10):
            print("Errors during batch import")
            break

100%|██████████| 10000/10000 [00:21<00:00, 472.59it/s] 


### Check for errors

In [21]:
if(len(wiki.batch.failed_objects)>0):
    print("Import complete with errors")
    for err in wiki.batch.failed_objects:
        print(err)
else:
    print("Import complete with no errors")

Import complete with no errors


## Bonus - iterate through all collection data

The client has a built-in function that allows you to iterate through all collection data.

In [22]:
wiki = client.collections.get("Wiki")

counter = 100

for item in wiki.iterator():
    print(item.properties)

    if (counter == 0): break
    
    counter -= 1

{'text': "On October 31, 2000 Stankonia was released. It entered the Billboard 200 at number two after selling over 530,000 copies in its first week. Outkast's first greatest hits album Big Boi and Dre Present... Outkast was released on December 4, 2001.", 'title': 'Outkast', 'wiki_id': '20231101.simple_431000_4', 'url': 'https://simple.wikipedia.org/wiki/Outkast'}
{'text': '3rd Class Cities - When a city incorporates, it becomes a 3rd class city. To incorporate, a city must generally have at least 300 people living there.', 'title': 'List of locations in Kansas', 'wiki_id': '20231101.simple_300080_2', 'url': 'https://simple.wikipedia.org/wiki/List%20of%20locations%20in%20Kansas'}
{'text': "Seventh Son (1987) is an alternate history and fantasy book written by Orson Scott Card. It is the first book in Card's The Tales of Alvin Maker series about Alvin Miller, the seventh son of a seventh son, who therefore has special powers. It was nominated for both the Hugo Award for Best Novel and 

You can also get `vector embeddings`, by using `include_vector`.

In [23]:
counter = 10

for item in wiki.iterator(include_vector=True):
    print(item.properties)
    print(item.vector)

    if (counter == 0): break
    
    counter -= 1

{'text': "On October 31, 2000 Stankonia was released. It entered the Billboard 200 at number two after selling over 530,000 copies in its first week. Outkast's first greatest hits album Big Boi and Dre Present... Outkast was released on December 4, 2001.", 'title': 'Outkast', 'wiki_id': '20231101.simple_431000_4', 'url': 'https://simple.wikipedia.org/wiki/Outkast'}
{'main_vector': [-0.012212410569190979, -0.03348527476191521, -0.017409181222319603, -0.04699688032269478, -0.05083797127008438, -0.04686130955815315, -0.001993978163227439, -0.012415762059390545, -0.03235554322600365, 0.023995522409677505, 0.07831306755542755, -0.00629825983196497, -0.04640941694378853, -0.0022848842199891806, -0.012076842598617077, 0.015884041786193848, -0.03219738230109215, -0.022131463512778282, -0.0599210225045681, -0.021001730114221573, 0.0007014934089966118, 0.0031801974400877953, -0.008715887553989887, 0.009964242577552795, -0.0339597649872303, 0.023408060893416405, -0.022820599377155304, 0.002500945

## Close the client

In [24]:
client.close()