In [2]:
import glob
import yaml
import spacy
import pandas as pd
import spacy
from sentence_transformers import SentenceTransformer
import torch
from tqdm import tqdm
import weaviate
from collections import defaultdict
import json
import re
from bs4 import BeautifulSoup

  from tqdm.autonotebook import tqdm, trange


In [3]:
model = SentenceTransformer('sentence-transformers/LaBSE')

  torch.load(os.path.join(input_path, "pytorch_model.bin"), map_location=torch.device("cpu"))


In [4]:
files = glob.glob("../data/05_final/*.html")
files.sort()

In [5]:
def process_testimony(data):
    soup = BeautifulSoup(data, 'html.parser')
    metadata_include = ["rg_number", "interviewee", "gender", "birth_year", "experience_group", "birth_country"]

    def get_metadata(soup):
        # Extract the YAML front matter from the HTML
        front_matter_text = soup.text.split('---\n', 2)[1]
        metadata = yaml.safe_load('---\n' + front_matter_text)

        # Only keep metadata found in metadata_include
        filtered_metadata = {k: metadata[k] for k in metadata_include if k in metadata}
        filtered_metadata["birth_country"] = metadata.get("country", "")
        return filtered_metadata

    def extract_windows(dialogue_tag, metadata):
        p_tag = dialogue_tag.find('p')
        sentences = p_tag.find_all("sentence")
        p_text = " ".join([sent.text for sent in sentences])
        windows = []
        window_size = 3
        sentence_ids = [sent['id'] for sent in sentences]
        
        if len(sentences) < window_size:
            window_texts = " ".join([sent.text for sent in sentences])
            window = {
                'sentence_ids': sentence_ids,
                'text': window_texts,
                'labels': count_labels(sentences)
            }
            window["category"] = "question" if dialogue_tag.get('class') == ['Question'] else "answer"
            window.update(metadata)
            windows.append(window)
        else:
            for i in range(len(sentences) - window_size + 1):
                window_sentences = sentences[i:i+window_size]
                window_texts = " ".join([sent.text for sent in window_sentences])
                window = {
                    'sentence_ids': sentence_ids[i:i+window_size],
                    'text': window_texts,
                    'labels': count_labels(window_sentences)
                }
                window["category"] = "question" if dialogue_tag.get('class') == ['Question'] else "answer"
                window.update(metadata)
                windows.append(window)

        return windows

    def count_labels(sentences):
        label_counters = {
            'populated_place': 0, 'building': 0, 'country': 0, 'spatial_obj': 0, 
            'dlf': 0, 'int_space': 0, 'env_features': 0, 'region': 0, 
            'npip': 0, "country": 0,
        }
        for sentence in sentences:
            for label in label_counters:
                label_counters[label] += len(sentence.find_all("span", {"class": label.lower()}))
        
        return label_counters

    metadata = get_metadata(soup)
    all_windows = []
    for dialogue_tag in soup.find_all('dialogue'):
        windows = extract_windows(dialogue_tag, metadata)
        all_windows.extend(windows)

    sentence_embeddings = model.encode([text["text"] for text in all_windows])

    combined_data = []

    for i, window in enumerate(all_windows):
        combined_dict = {
            "sentence_ids": window['sentence_ids'],
            "text": window['text'],
            "embedding": sentence_embeddings[i],
            "category": window["category"]
        }
        combined_dict.update(window['labels'])
        
        label_map = {"rg_number": "rg", "interviewee": "full_name"}
        for label in metadata_include:
            new_label = label_map.get(label, label)
            combined_dict[new_label] = window[label]
        combined_data.append(combined_dict)
    return combined_data

In [13]:
import os
from tqdm import tqdm
import pandas as pd

for file in tqdm(files):
    output_file = file.split("/")[-1].replace("_cleaned.html", ".parquet")
    output_path = f"../data/06_parquet/{output_file}"
    
    # Check if the output file already exists
    if os.path.exists(output_path):
        continue  # Skip this file and move to the next one
    
    with open(file, "r", encoding="utf-8") as f:
        data = f.read()
        data = data.replace("|", "I")
        result = process_testimony(data)
        df = pd.DataFrame(result)
        df['birth_year'] = df['birth_year'].replace('none', pd.NA)
        df.to_parquet(output_path)

100%|██████████| 979/979 [1:16:39<00:00,  4.70s/it]


In [34]:
import weaviate
from weaviate.auth import Auth
import weaviate.classes as wvc

WEAVIATE_URL = "https://1tnwq0iequmnhcrcultoq.c0.us-east1.gcp.weaviate.cloud"
WEAVIATE_API_KEY = "pV6haMawVUuFMWiu8RGRkbrpjo4YfGehXHmV"
collection_name = "HolocaustTestimonies"

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
)

if client.collections.exists(collection_name):
    client.collections.delete(collection_name)

holocaust_testimonies = client.collections.create(
    name=collection_name,
    properties=[
        wvc.config.Property(
            name="sentence_ids",
            data_type=wvc.config.DataType.TEXT_ARRAY
        ),
        wvc.config.Property(
            name="text",
            data_type=wvc.config.DataType.TEXT
        ),
        wvc.config.Property(
            name="category",
            data_type=wvc.config.DataType.TEXT
        ),
        wvc.config.Property(
            name="populated_place",
            data_type=wvc.config.DataType.INT
        ),
        wvc.config.Property(
            name="building",
            data_type=wvc.config.DataType.INT
        ),
        wvc.config.Property(
            name="country",
            data_type=wvc.config.DataType.INT
        ),
        wvc.config.Property(
            name="spatial_obj",
            data_type=wvc.config.DataType.INT
        ),
        wvc.config.Property(
            name="dlf",
            data_type=wvc.config.DataType.INT
        ),
        wvc.config.Property(
            name="int_space",
            data_type=wvc.config.DataType.INT
        ),
        wvc.config.Property(
            name="env_features",
            data_type=wvc.config.DataType.INT
        ),
        wvc.config.Property(
            name="region",
            data_type=wvc.config.DataType.INT
        ),
        wvc.config.Property(
            name="npip",
            data_type=wvc.config.DataType.INT
        ),
        wvc.config.Property(
            name="experience_group",
            data_type=wvc.config.DataType.TEXT
        ),
        wvc.config.Property(
            name="birth_country",
            data_type=wvc.config.DataType.TEXT
        ),
        wvc.config.Property(
            name="gender",
            data_type=wvc.config.DataType.TEXT
        ),
        wvc.config.Property(
            name="rg",
            data_type=wvc.config.DataType.TEXT
        ),
        wvc.config.Property(
            name="full_name",
            data_type=wvc.config.DataType.TEXT
        ),
        wvc.config.Property(
            name="birth_year",
            data_type=wvc.config.DataType.INT
        ),
    ]
)

print(f"Collection '{collection_name}' created successfully.")

Collection 'HolocaustTestimonies' created successfully.


In [64]:
print(client.is_live())

True


In [63]:
# Get list of parquet files
parquet_files = glob.glob("../data/06_parquet/*.parquet")
parquet_files.sort()
print(f"Found {len(parquet_files)} parquet files.")

# Function to prepare a single row
def prepare_row(row):
    return {
        "properties": {
            "sentence_ids": row['sentence_ids'],
            "text": row['text'],
            "category": row['category'],
            "populated_place": int(row['populated_place']),
            "building": int(row['building']),
            "country": int(row['country']),
            "spatial_obj": int(row['spatial_obj']),
            "dlf": int(row['dlf']),
            "int_space": int(row['int_space']),
            "env_features": int(row['env_features']),
            "region": int(row['region']),
            "npip": int(row['npip']),
            "experience_group": row['experience_group'],
            "birth_country": row['birth_country'],
            "gender": row['gender'],
            "rg": row['rg'],
            "full_name": row['full_name'],
            "birth_year": int(row['birth_year'])
        },
        "vector": row['embedding']
    }

# Process each parquet file
for parquet_file in parquet_files[:1]:
    df = pd.read_parquet(parquet_file)
    
    # Prepare all the data rows
    data_rows = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Preparing data"):
        data_rows.append(prepare_row(row))
    
    # Perform batch insertion
    with holocaust_testimonies.batch.fixed_size(batch_size=100) as batch:
        for data_row in tqdm(data_rows, desc="Inserting data"):
            batch.add_object(
                properties=data_row['properties'],
                vector=data_row['vector']
            )

print("Data insertion complete.")

Found 979 parquet files.


Preparing data:   0%|          | 0/1050 [00:00<?, ?it/s]

Preparing data: 100%|██████████| 1050/1050 [00:00<00:00, 30601.32it/s]
Inserting data: 100%|██████████| 1050/1050 [00:00<00:00, 8973.22it/s]


Data insertion complete.


In [55]:
(len(data_rows))

1050

In [61]:
holocaust_testimonies = client.collections.get("HolocaustTestimonies")

In [56]:
from weaviate.classes.query import MetadataQuery


In [57]:
def find_similar(query, threshold):
    query_vector = model.encode([query])[0]
    response = holocaust_testimonies.query.near_vector(
        near_vector=query_vector,
        limit=10,
        return_metadata=MetadataQuery(distance=True)
    )
    print(response)

    for o in response.objects:
        if o.metadata.distance < threshold:
            print(o.properties["book"], o.properties["chapter"], o.properties["verse"], ": ", o.properties["text"])
            print(o.metadata.distance)

In [59]:
query = "We were not ther"
find_similar(query, threshold=.5)

QueryReturn(objects=[])
