In [13]:
import glob
import pandas as pd
from tqdm import tqdm
import numpy as np
from astrapy import DataAPIClient
import os
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()


True

In [2]:
# Astra credentials are read from the process environment (load_dotenv() has
# already merged the .env file into it). Each value is None when unset.
ASTRA_TOKEN = os.environ.get('ASTRA_TOKEN')
ASTRA_API_ENDPOINT = os.environ.get('ASTRA_API_ENDPOINT')

In [3]:
# Create the Astra Data API client from the token loaded from the environment.
# The database handle itself is obtained from this client in the next cell.
client = DataAPIClient(ASTRA_TOKEN)

In [4]:
# Obtain a handle to the database served at ASTRA_API_ENDPOINT; the collection
# listing and creation calls in later cells go through this object.
database = client.get_database(ASTRA_API_ENDPOINT)

In [5]:
# Sanity-check that the configuration was loaded. The token is a secret:
# notebook outputs are saved with the file, so only report whether it is set
# and never echo its value.
print(f"ASTRA_TOKEN: {'<set>' if ASTRA_TOKEN else '<missing>'}")
print(f"ASTRA_API_ENDPOINT: {ASTRA_API_ENDPOINT}")

ASTRA_TOKEN: <redacted>
ASTRA_API_ENDPOINT: https://2534174f-90b3-4e11-afb5-8ee29c3c090d-us-east-2.apps.astra.datastax.com


In [6]:
# Listing the collections doubles as a connectivity check against the database.
collection_names = database.list_collection_names()
print(f"Connected to Astra DB: {collection_names}")

Connected to Astra DB: []


In [19]:
collection_name = "HolocaustTestimonies"

# Vector dimension declared for the collection.
# NOTE(review): presumably the output size of the embedding model whose vectors
# are stored under "$vector" — confirm against the model used for encoding.
EMBEDDING_DIM = 768

# Create the collection on the first run and reuse it afterwards. astrapy's
# default existence check makes `create_collection` fail for a name that is
# already present, so without this guard the cell breaks on every re-run.
collection_existed = collection_name in database.list_collection_names()
if collection_existed:
    collection = database.get_collection(collection_name)
else:
    collection = database.create_collection(collection_name, dimension=EMBEDDING_DIM)

# Field layout of the documents stored in the collection, kept for reference.
# It is not passed to any call in this cell; the keys mirror what the row
# preparation step writes for each document.
schema = {
    "sentence_ids": "list<text>",
    "text": "text",
    "category": "text",
    "populated_place": "int",
    "building": "int",
    "country": "int",
    "spatial_obj": "int",
    "dlf": "int",
    "int_space": "int",
    "env_features": "int",
    "region": "int",
    "npip": "int",
    "experience_group": "text",
    "birth_country": "text",
    "gender": "text",
    "rg": "text",
    "full_name": "text",
    "birth_year": "int"
}

# Report what actually happened so a re-run does not claim a fresh creation.
if collection_existed:
    print(f"Collection '{collection_name}' already exists; reusing it.")
else:
    print(f"Collection '{collection_name}' created successfully.")

Collection 'HolocaustTestimonies' created successfully.


In [26]:
# Collect the parquet shards in name order and keep only the first four.
parquet_files = sorted(glob.glob("../data/06_parquet/*.parquet"))[:4]
print(f"Found {len(parquet_files)} parquet files.")

# Function to prepare a single row
def prepare_row(row):
    """Convert one DataFrame row into a document ready for insertion.

    Args:
        row: Mapping-like object indexable by column name (for example the
            Series yielded by ``DataFrame.iterrows()``). It must provide every
            column read below, including ``embedding``.

    Returns:
        dict: The document to insert. Integer columns are coerced to plain
        ``int`` (missing values become 0), ``sentence_ids`` and the embedding
        are converted from NumPy arrays to lists, and the embedding is stored
        under the ``"$vector"`` key. Key order matches the original column
        order.
    """
    def safe_int(value):
        # Missing numerics read from parquet arrive as NaN / pd.NA rather than
        # None, and int(nan) raises ValueError, so test with pd.isna.
        return 0 if pd.isna(value) else int(value)

    def as_list(value):
        # NumPy arrays are turned into plain lists; anything else passes through.
        return value.tolist() if isinstance(value, np.ndarray) else value

    return {
        "sentence_ids": as_list(row['sentence_ids']),
        "text": row['text'],
        "category": row['category'],
        "populated_place": safe_int(row['populated_place']),
        "building": safe_int(row['building']),
        "country": safe_int(row['country']),
        "spatial_obj": safe_int(row['spatial_obj']),
        "dlf": safe_int(row['dlf']),
        "int_space": safe_int(row['int_space']),
        "env_features": safe_int(row['env_features']),
        "region": safe_int(row['region']),
        "npip": safe_int(row['npip']),
        "experience_group": row['experience_group'],
        "birth_country": row['birth_country'],
        "gender": row['gender'],
        "rg": row['rg'],
        "full_name": row['full_name'],
        "birth_year": safe_int(row['birth_year']),
        "$vector": as_list(row['embedding'])
    }

# Load each parquet file, convert its rows to documents, and upload them.
# NOTE(review): the documents carry no explicit `_id`, so re-running this cell
# presumably inserts the same rows a second time — confirm before re-executing.
for parquet_file in parquet_files:
    df = pd.read_parquet(parquet_file)

    # Turn every DataFrame row into an insert-ready document.
    data_rows = [
        prepare_row(row)
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Preparing data")
    ]

    # Upload in slices of at most 100 documents per insert_many call.
    for start in tqdm(range(0, len(data_rows), 100), desc="Inserting data"):
        batch = data_rows[start:start + 100]
        collection.insert_many(documents=batch)

print("Data insertion complete.")

Found 4 parquet files.


Preparing data: 100%|██████████| 1050/1050 [00:00<00:00, 24661.32it/s]
Inserting data: 100%|██████████| 11/11 [00:09<00:00,  1.16it/s]
Preparing data: 100%|██████████| 1824/1824 [00:00<00:00, 24153.75it/s]
Inserting data: 100%|██████████| 19/19 [00:16<00:00,  1.13it/s]
Preparing data: 100%|██████████| 1625/1625 [00:00<00:00, 22100.05it/s]
Inserting data: 100%|██████████| 17/17 [00:14<00:00,  1.14it/s]
Preparing data: 100%|██████████| 808/808 [00:00<00:00, 24856.23it/s]
Inserting data: 100%|██████████| 9/9 [00:07<00:00,  1.22it/s]

Data insertion complete.





In [21]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used by the query cells to encode search text.
LABSE_MODEL_ID = 'sentence-transformers/LaBSE'
model = SentenceTransformer(LABSE_MODEL_ID)

  from tqdm.autonotebook import tqdm, trange
  torch.load(os.path.join(input_path, "pytorch_model.bin"), map_location=torch.device("cpu"))


In [27]:
# Vector search: embed the phrase "Hunger" and fetch the five closest documents.
query_text = "Hunger"
query_embedding = model.encode(query_text)

# The empty filter applies no metadata restriction; sorting on "$vector" is
# what makes this a vector search against the query embedding.
search_results = collection.find(filter={}, sort={"$vector": query_embedding}, limit=5)

print(f"Top 5 results for the query '{query_text}':")
for rank, doc in enumerate(search_results, start=1):
    print(f"{rank}. Text: {doc['text']}")
    # Remaining fields share one indented "Label: value" layout.
    for label, key in (("Category", "category"), ("Experience Group", "experience_group")):
        print(f"   {label}: {doc[key]}")
    print()



Top 5 results for the query 'Hunger':
1. Text: A: I very often was very hungry, but what I remember most is thirst, always being thirsty and never having enough to drink.  I think at one point I sort of lost my appetite for food except when you know you have a grumbling stomach, but I didn't know really.  I was scared. 
   Category: answer
   Experience Group: survivor

2. Text: A: Well, there was starvation, and the worst thing there was no water.  We were hiding in basements.  It was really mayhem. 
   Category: answer
   Experience Group: survivor

3. Text: A: Well, there was starvation, and the worst thing there was no water.  We were hiding in basements.  It was really mayhem. 
   Category: answer
   Experience Group: survivor

4. Text: I think at one point I sort of lost my appetite for food except when you know you have a grumbling stomach, but I didn't know really.  I was scared.  I was always full of fear and hunger doesn't always assert itself when you're full of fear. 
   Ca

In [23]:
# Display the complete document currently bound to `doc`, i.e. the last item
# yielded by the most recently executed result loop, to inspect its fields.
doc

{'_id': 'ef2dacf0-f88e-420f-adac-f0f88ee20f4a',
 'sentence_ids': ['2339', '2340'],
 'text': "Q: You're tired aren't you.  Do you want to stop? ",
 'category': 'question',
 'populated_place': 0,
 'building': 0,
 'country': 0,
 'spatial_obj': 0,
 'dlf': 0,
 'int_space': 0,
 'env_features': 0,
 'region': 0,
 'npip': 0,
 'experience_group': 'survivor',
 'birth_country': 'poland',
 'gender': 'm',
 'rg': 'rg-50.030.0001',
 'full_name': 'david a. kochalski',
 'birth_year': 1928}

In [32]:
# Vector search with a German query phrase; the metadata filters are optional
# and currently disabled, so the filter below is empty.
query_text = "Wir hatten keine essen"
query_embedding = model.encode(query_text)

# Uncomment entries in the filter to restrict the candidates by metadata
# before the "$vector" sort ranks them against the query embedding.
search_results = collection.find(
    filter={
        # "building": {"$gt": 0},
        # "birth_country": "poland"
    },
    sort={"$vector": query_embedding},
    limit=5,
)

print(f"Top 5 results for the query '{query_text}' with filters:")
for rank, doc in enumerate(search_results, start=1):
    print(f"{rank}. Text: {doc['text']}")
    # Remaining fields share one indented "Label: value" layout.
    for label, key in (
        ("Category", "category"),
        ("Experience Group", "experience_group"),
        ("Populated Place", "populated_place"),
        ("Birth Country", "birth_country"),
    ):
        print(f"   {label}: {doc[key]}")
    print()


Top 5 results for the query 'Wir hatten keine essen' with filters:
1. Text: I don't even know if there was anything to drink.  We all had something to eat, I mean whatever we grabbed.  As I said, we had an extra bread which we never allowed ourselves to eat the whole bread. 
   Category: answer
   Experience Group: survivor
   Populated Place: 0
   Birth Country: poland

2. Text: Food, they took away.  So we went three days and we didn't eat because there was no food in Germany no where.  We went through Nuremberg. 
   Category: answer
   Experience Group: survivor
   Populated Place: 0
   Birth Country: hungary

3. Text: So at least I had money.  Food, they took away.  So we went three days and we didn't eat because there was no food in Germany no where. 
   Category: answer
   Experience Group: survivor
   Populated Place: 0
   Birth Country: hungary

4. Text: We used to - - we didn't have any horses or anything, we used to have a little buggy, long buggy, one on the side and I on ot