In [1]:
import weaviate
from weaviate.auth import Auth
import weaviate.classes as wvc

import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Get API key and URL from environment variables
WEAVIATE_URL = os.getenv('WEAVIATE_URL')
WEAVIATE_API_KEY = os.getenv('WEAVIATE_API_KEY')

if not WEAVIATE_URL or not WEAVIATE_API_KEY:
    raise ValueError("WEAVIATE_URL or WEAVIATE_API_KEY not found in .env file")

collection_name = "HolocaustTestimonies"

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
)

if client.collections.exists(collection_name):
    client.collections.delete(collection_name)

holocaust_testimonies = client.collections.create(
    name=collection_name,
    properties=[
        wvc.config.Property(
            name="sentence_ids",
            data_type=wvc.config.DataType.TEXT_ARRAY
        ),
        wvc.config.Property(
            name="text",
            data_type=wvc.config.DataType.TEXT
        ),
        wvc.config.Property(
            name="category",
            data_type=wvc.config.DataType.TEXT
        ),
        wvc.config.Property(
            name="populated_place",
            data_type=wvc.config.DataType.INT
        ),
        wvc.config.Property(
            name="building",
            data_type=wvc.config.DataType.INT
        ),
        wvc.config.Property(
            name="country",
            data_type=wvc.config.DataType.INT
        ),
        wvc.config.Property(
            name="spatial_obj",
            data_type=wvc.config.DataType.INT
        ),
        wvc.config.Property(
            name="dlf",
            data_type=wvc.config.DataType.INT
        ),
        wvc.config.Property(
            name="int_space",
            data_type=wvc.config.DataType.INT
        ),
        wvc.config.Property(
            name="env_features",
            data_type=wvc.config.DataType.INT
        ),
        wvc.config.Property(
            name="region",
            data_type=wvc.config.DataType.INT
        ),
        wvc.config.Property(
            name="npip",
            data_type=wvc.config.DataType.INT
        ),
        wvc.config.Property(
            name="experience_group",
            data_type=wvc.config.DataType.TEXT
        ),
        wvc.config.Property(
            name="birth_country",
            data_type=wvc.config.DataType.TEXT
        ),
        wvc.config.Property(
            name="gender",
            data_type=wvc.config.DataType.TEXT
        ),
        wvc.config.Property(
            name="rg",
            data_type=wvc.config.DataType.TEXT
        ),
        wvc.config.Property(
            name="full_name",
            data_type=wvc.config.DataType.TEXT
        ),
        wvc.config.Property(
            name="birth_year",
            data_type=wvc.config.DataType.INT
        ),
    ]
)

print(f"Collection '{collection_name}' created successfully.")

Collection 'HolocaustTestimonies' created successfully.


In [2]:
import glob
import pandas as pd
from tqdm import tqdm
import numpy as np

# Get list of parquet files
parquet_files = glob.glob("../data/06_parquet/*.parquet")
parquet_files.sort()
parquet_files = parquet_files[:4]
print(f"Found {len(parquet_files)} parquet files.")

# Function to prepare a single row
def prepare_row(row):
    def safe_int(value):
        return int(value) if value is not None else 0

    return {
        "properties": {
            "sentence_ids": row['sentence_ids'].tolist() if isinstance(row['sentence_ids'], np.ndarray) else row['sentence_ids'],
            "text": row['text'],
            "category": row['category'],
            "populated_place": safe_int(row['populated_place']),
            "building": safe_int(row['building']),
            "country": safe_int(row['country']),
            "spatial_obj": safe_int(row['spatial_obj']),
            "dlf": safe_int(row['dlf']),
            "int_space": safe_int(row['int_space']),
            "env_features": safe_int(row['env_features']),
            "region": safe_int(row['region']),
            "npip": safe_int(row['npip']),
            "experience_group": row['experience_group'],
            "birth_country": row['birth_country'],
            "gender": row['gender'],
            "rg": row['rg'],
            "full_name": row['full_name'],
            "birth_year": safe_int(row['birth_year'])
        },
        "vector": row['embedding'].tolist() if isinstance(row['embedding'], np.ndarray) else row['embedding']
    }

data_rows = []
# Process each parquet file
for parquet_file in parquet_files:
    df = pd.read_parquet(parquet_file)
    
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Preparing data"):
        data_rows.append(prepare_row(row))
    
# Perform batch insertion
with holocaust_testimonies.batch.fixed_size(batch_size=100) as batch:
    for data_row in tqdm(data_rows, desc="Inserting data"):
        batch.add_object(
            properties=data_row['properties'],
            vector=data_row['vector']
        )

print("Data insertion complete.")

Found 4 parquet files.


Preparing data: 100%|██████████| 1050/1050 [00:00<00:00, 24404.95it/s]
Preparing data: 100%|██████████| 1824/1824 [00:00<00:00, 24502.95it/s]
Preparing data: 100%|██████████| 1625/1625 [00:00<00:00, 20787.76it/s]
Preparing data: 100%|██████████| 808/808 [00:00<00:00, 25767.14it/s]
Inserting data: 100%|██████████| 5307/5307 [01:44<00:00, 50.76it/s]


Data insertion complete.


In [3]:
from sentence_transformers import SentenceTransformer
from weaviate.classes.query import MetadataQuery

model = SentenceTransformer('sentence-transformers/LaBSE')

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

  torch.load(os.path.join(input_path, "pytorch_model.bin"), map_location=torch.device("cpu"))


In [4]:
import weaviate
from weaviate.auth import Auth
import weaviate.classes as wvc

WEAVIATE_URL = "https://pnl6qiwaq5oijvge8h8owg.c0.us-east1.gcp.weaviate.cloud/"
WEAVIATE_API_KEY = "wNIf2XunX2THHTK6y1aDEr0lyj0FFv4x6KqT"
collection_name = "HolocaustTestimonies"

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
)

holocaust_testimonies = client.collections.get(name=collection_name)

UnexpectedStatusCodeError: Meta endpoint! Unexpected status code: 404, with response body: None.

In [None]:
def find_similar(query, threshold):
    query_vector = model.encode([query])[0]
    response = holocaust_testimonies.query.near_vector(
        near_vector=query_vector,
        limit=10,
        return_metadata=MetadataQuery(distance=True)
    )
    print(response)

    for o in response.objects:
        if o.metadata.distance < threshold:
            print(o.properties["text"])
            print(o.metadata.distance)

In [None]:
query = "We were not ther"
find_similar(query, threshold=.5)

QueryReturn(objects=[Object(uuid=_WeaviateUUIDInt('b973968a-1e5e-4742-89f7-5dd1f855e749'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=0.46541130542755127, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'birth_year': 1925, 'sentence_ids': ['223', '224', '225'], 'int_space': 0, 'env_features': 0, 'region': 0, 'full_name': 'fred none bachner', 'spatial_obj': 0, 'text': 'We had no medications.  We had nothing.  Nothing! ', 'npip': 0, 'country': 0, 'experience_group': 'survivor', 'populated_place': 0, 'dlf': 0, 'gender': 'm', 'category': 'answer', 'rg': 'rg-50.030.0012', 'birth_country': 'germany', 'building': 0}, references=None, vector={}, collection='HolocaustTestimonies'), Object(uuid=_WeaviateUUIDInt('00646fa8-79a3-4954-a95a-f7e3ed56ca10'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=0.513043224811554, certainty=None, score=None, explain_score=None, is_consistent=None, 

KeyError: 'book'