In [None]:
!pip install weaviate-client

In [4]:
# Load credentials from a local .env file (it must define OPENAI_API_KEY).
# Accepted line forms: KEY=value, export KEY=value, KEY="quoted value".
import os

with open('.env', 'r') as env_file:
    for line in env_file:
        line = line.strip()
        # Blank lines and comments carry no assignment; skip them instead of
        # failing to unpack them.
        if not line or line.startswith('#'):
            continue
        # Tolerate shell-style "export KEY=value" lines.
        if line.startswith('export '):
            line = line[len('export '):].lstrip()
        # Split on the first '=' only, so values that contain '=' stay intact.
        key, separator, value = line.partition('=')
        key = key.strip()
        if not separator or not key:
            continue
        value = value.strip()
        # Drop one pair of matching surrounding quotes, if present.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
            value = value[1:-1]
        os.environ[key] = value

# None when the .env file did not define OPENAI_API_KEY.
OPENAI_KEY = os.environ.get("OPENAI_API_KEY")

In [5]:
# fetch our partitioned documents

import os

def get_result_files(folder_path):
    """Recursively collect the paths of all ``.json`` files under *folder_path*.

    Args:
        folder_path: Directory to walk with ``os.walk``.

    Returns:
        A list of file paths, each built by joining the containing directory
        with the file name, in the order ``os.walk`` yields them. The list is
        empty when nothing matches.
    """
    return [
        os.path.join(parent_dir, name)
        for parent_dir, _subdirs, names in os.walk(folder_path)
        for name in names
        if name.endswith('.json')
    ]

# Gather every .json file found under results/; the bare name on the last line
# makes the notebook display the resulting list.
files = get_result_files('results/')
files

[]

In [None]:
# set up a weaviate client with document schema

import uuid

import weaviate
from weaviate.embedded import EmbeddedOptions
from weaviate.util import get_valid_uuid

def create_local_weaviate_client():
    """Create a ``weaviate.Client`` that runs an embedded Weaviate instance.

    The embedded instance is started with the ``text2vec-openai`` module
    enabled, and the module-level ``OPENAI_KEY`` is sent along as the
    ``X-OpenAI-Api-Key`` header.

    Returns:
        The configured ``weaviate.Client``.
    """
    embedded = EmbeddedOptions(
        additional_env_vars={"ENABLE_MODULES": "text2vec-openai"},
    )
    openai_headers = {"X-OpenAI-Api-Key": OPENAI_KEY}
    return weaviate.Client(
        embedded_options=embedded,
        additional_headers=openai_headers,
    )

def get_schema():
    """Return the schema dict describing the single ``Doc`` class.

    ``Doc`` is vectorized with ``text2vec-openai`` and has two text
    properties: ``last_modified`` (skipped by the vectorizer) and ``text``.

    Returns:
        A dict of the form ``{"classes": [<Doc class definition>]}``.
    """
    vectorizer = "text2vec-openai"

    # Stored with the object but excluded from vectorization.
    last_modified_property = {
        "name": "last_modified",
        "dataType": ["text"],
        "description": "Last modified date for the document",
        "moduleConfig": {vectorizer: {"skip": True}},
    }

    text_property = {
        "name": "text",
        "dataType": ["text"],
        "description": "Text content for the document",
    }

    doc_class = {
        "class": "Doc",
        "description": "A generic document class",
        "vectorizer": vectorizer,
        # NOTE(review): a reranker-cohere config is declared here, but only
        # text2vec-openai appears in ENABLE_MODULES elsewhere in this
        # notebook - confirm it is intended.
        "moduleConfig": {"reranker-cohere": {"model": "rerank-multilingual-v2.0"}},
        "properties": [last_modified_property, text_property],
    }

    return {"classes": [doc_class]}

def upload_schema(my_schema, weaviate):
    """Replace the schema held by a Weaviate client with *my_schema*.

    Args:
        my_schema: Schema dict to create (e.g. the value of ``get_schema()``).
        weaviate: A client object exposing ``.schema``. Inside this function
            the parameter name shadows the ``weaviate`` module.

    ``delete_all()`` runs before ``create()``, so whatever schema existed
    beforehand is discarded.
    """
    schema_api = weaviate.schema
    schema_api.delete_all()
    schema_api.create(my_schema)

# Build the schema dict first (pure data), then start the embedded Weaviate
# client and install the schema on it, replacing any existing schema.
my_schema = get_schema()
client = create_local_weaviate_client()
upload_schema(my_schema, weaviate=client)

In [None]:
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import DataSourceMetadata
from unstructured.partition.json import partition_json

def _discard_attr(obj, name):
    """Delete attribute *name* from *obj*; do nothing if it is not set."""
    try:
        delattr(obj, name)
    except AttributeError:
        pass


def get_chunks(elements, chunk_under_n_chars=500, chunk_new_after_n_chars=1500):
    """Chunk partitioned elements by title and reduce each chunk to a plain dict.

    Before chunking, each element's metadata is cleaned in place:
    ``data_source`` is removed unless it is exactly a ``DataSourceMetadata``
    instance, and ``coordinates`` is always removed.
    (Presumably these fields get in the way of chunking or upload - confirm.)

    Args:
        elements: Iterable of elements exposing ``.metadata`` and ``.text``.
            Their metadata objects are mutated.
        chunk_under_n_chars: Passed to ``chunk_by_title`` as
            ``combine_under_n_chars``.
        chunk_new_after_n_chars: Passed to ``chunk_by_title`` as
            ``new_after_n_chars``.

    Returns:
        A list of ``{"last_modified": ..., "text": ...}`` dicts, one per chunk.
    """
    for element in elements:
        metadata = element.metadata

        # Both removals tolerate an attribute that is not set on the instance;
        # a bare delattr() would raise AttributeError in that case.
        if type(getattr(metadata, "data_source", None)) is not DataSourceMetadata:
            _discard_attr(metadata, "data_source")
        _discard_attr(metadata, "coordinates")

    chunks = chunk_by_title(
        elements,
        combine_under_n_chars=chunk_under_n_chars,
        new_after_n_chars=chunk_new_after_n_chars,
    )

    return [
        {"last_modified": chunk.metadata.last_modified, "text": chunk.text}
        for chunk in chunks
    ]


def add_data_to_weaviate(files, weaviate, chunk_under_n_chars=500, chunk_new_after_n_chars=1500):
    """Partition, chunk and batch-upload every file in *files* as ``Doc`` objects.

    Args:
        files: Iterable of paths to partitioned JSON files.
        weaviate: A client object exposing ``.batch``. Inside this function
            the parameter name shadows the ``weaviate`` module.
        chunk_under_n_chars: Forwarded to ``get_chunks``.
        chunk_new_after_n_chars: Forwarded to ``get_chunks``.

    A file whose partitioning or chunking raises ``IndexError`` is reported
    and skipped; any other exception propagates. The batch is flushed once
    after all files have been queued.
    """
    for filename in files:
        try:
            elements = partition_json(filename=filename)
            chunks = get_chunks(elements, chunk_under_n_chars, chunk_new_after_n_chars)
        except IndexError as e:
            # Best effort: name the file that is skipped so the failure is
            # traceable, then carry on with the remaining files.
            print(f"Skipping {filename}: {e}")
            continue

        print(f"Uploading {len(chunks)} chunks for {str(filename)}.")
        for chunk in chunks:
            weaviate.batch.add_data_object(
                data_object=chunk,
                # Same class name the schema and the queries use.
                class_name="Doc",
                uuid=get_valid_uuid(uuid.uuid4()),
            )

    weaviate.batch.flush()

# Ingest the files gathered earlier into the local client, overriding the
# function's default chunk thresholds (500 / 1500 characters) with much
# smaller ones (75 / 200).
add_data_to_weaviate(
    files=files,
    weaviate=client,
    chunk_under_n_chars=75,
    chunk_new_after_n_chars=200
)

In [None]:
# query weaviate

def query_weaviate(client, query, limit: int = 10):
    """Run a near-text search for *query* against the ``Doc`` class.

    Args:
        client: Object exposing a ``.query`` builder.
        query: Text used as the single near-text concept.
        limit: Maximum number of results to request.

    Returns:
        Whatever the builder's ``do()`` returns. The request asks for the
        ``text`` and ``last_modified`` properties plus the additional
        ``distance`` field.
    """
    near_text_filter = {'concepts': [query]}

    builder = client.query.get('Doc', ['text', 'last_modified'])
    builder = builder.with_near_text(near_text_filter)
    builder = builder.with_additional("distance")
    builder = builder.with_limit(limit)
    return builder.do()

def print_query_and_results(query, results):
    """Print a header for *query* followed by the text of every result.

    Args:
        query: The query string, echoed in the header.
        results: Response dict; hits are read from
            ``results["data"]["Get"]["Doc"]`` and each hit's ``"text"`` is
            printed.

    If the response carries an ``"errors"`` entry, it is printed. A missing
    or null ``data`` / ``Get`` / ``Doc`` level is treated as "no results"
    instead of raising KeyError/TypeError, so a failed query shows its actual
    error message.
    """
    print("\n\n\n-------------------------")
    print(f"QUERY: {query}")
    print("-------------------------")

    errors = results.get("errors")
    if errors:
        print(f"\n\n-- ERRORS:\n\n{errors}")

    # Each level may be absent or None when the query failed.
    hits = ((results.get("data") or {}).get("Get") or {}).get("Doc") or []
    for index, result in enumerate(hits):
        print(f"\n\n-- RESULT {index}:\n")
        print(result["text"])

# Natural-language queries to run against the uploaded documents.
queries = [
    "Explore the significance of the Federal Reserve's inflation target and how it is explained.",
    "Find mentions of commercial interest in post-quantum cryptography.",

]

# Run each query (at most 5 hits) and print the text of every hit.
for query in queries:
    results = query_weaviate(client, query, limit=5)
    print_query_and_results(query, results)


