In [None]:
!pip install weaviate-client

In [1]:
# load creds from .env file
import os

with open('.env', 'r') as env_file:
    for line in env_file:
        line = line.strip()
        # Skip blank lines and `#` comments instead of failing to unpack them.
        if not line or line.startswith('#'):
            continue
        # Split on the first '=' only, so values may themselves contain '='.
        key, sep, value = line.partition('=')
        if not sep:
            continue  # not a KEY=VALUE line
        os.environ[key.strip()] = value.strip()

In [2]:
import os
import uuid

import weaviate
from weaviate.embedded import EmbeddedOptions
from weaviate.util import get_valid_uuid

from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import DataSourceMetadata
from unstructured.partition.json import partition_json

# OpenAI API key read from the environment; `.get` yields None when
# OPENAI_API_KEY is unset.
OPENAI_KEY = os.environ.get("OPENAI_API_KEY")

In [3]:
# fetch our partitioned documents

def get_result_files(folder_path):
    """Recursively collect the paths of all `.json` files under `folder_path`.

    Args:
        folder_path: Directory to walk.

    Returns:
        A list of paths, each built by joining the file name onto the directory
        it was found in, in the order `os.walk` yields them.
    """
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(folder_path)
        for name in names
        if name.endswith('.json')
    ]

# Collect every .json path under results/; the bare name on the last line
# displays the list as the cell output.
files = get_result_files('results/')
files

['results/1ZBXs78r2SSgYUZI_ZOwZm_yt7A4n6NL8-FOMC20170201tealbookb20170126.pdf.json',
 'results/1t2IWeYgHeq8IKUeBLgM9DQ_nwLCXq8Ch-FOMC_LongerRunGoals_201701.pdf.json',
 "results/1TwH2WPshvwueAHlDiMfe9STLtFgOx-uB-NASA's SpaceX Crew-7 Launches to International Space Station.pdf.json",
 'results/1hXnbJlGkJ86pJsbKVWhjkLRr4X17FK-J-Quantum Computing/1ppcP-AOkTB5ln7CqnwQy9l_Kvtftn4L0-NAE-report-on-quantum-computing.pdf.json',
 'results/1hXnbJlGkJ86pJsbKVWhjkLRr4X17FK-J-Quantum Computing/1ps9S0TaMC02NahJAfp3NyZo6NVmf7rpM-COVID-19 Accelerates Quantum Computing Adoption, Driving Robust Growth in North America, Europe, and Asia-Pacific.pdf.json',
 'results/1hXnbJlGkJ86pJsbKVWhjkLRr4X17FK-J-Quantum Computing/1AahuJfN_mmiqqP5E9byMj3cQBZJCanqmZW-YDSaCPC4-Quantum Photonics Market.docx.json',
 'results/1hXnbJlGkJ86pJsbKVWhjkLRr4X17FK-J-Quantum Computing/180MZG3uAnSsSUbWtTqm2Gmv4GYpe2dnY-honeywell-quantum-encryption.docx.json']

In [4]:
# set up a weaviate client with document schema

def create_local_weaviate_client():
    """Create a Weaviate client backed by an embedded instance.

    The embedded options set `ENABLE_MODULES` to `text2vec-openai`, and the
    client sends `OPENAI_KEY` as the `X-OpenAI-Api-Key` header.

    Returns:
        The constructed `weaviate.Client`.
    """
    embedded = EmbeddedOptions(
        additional_env_vars={"ENABLE_MODULES": "text2vec-openai"},
    )
    request_headers = {"X-OpenAI-Api-Key": OPENAI_KEY}
    return weaviate.Client(
        embedded_options=embedded,
        additional_headers=request_headers,
    )

def get_schema():
    """Build the schema dict describing the single `Doc` class.

    The class is vectorized with `text2vec-openai` and has two text
    properties: `last_modified` (excluded from vectorization) and `text`.

    NOTE(review): the class-level moduleConfig references `reranker-cohere`,
    while the embedded client in this notebook only lists `text2vec-openai`
    in ENABLE_MODULES; confirm the reranker config is intended.

    Returns:
        A dict of the form `{"classes": [...]}`.
    """
    vectorizer_module = "text2vec-openai"

    last_modified_property = {
        "name": "last_modified",
        "dataType": ["text"],
        "description": "Last modified date for the document",
        # Exclude this property from vectorization.
        "moduleConfig": {vectorizer_module: {"skip": True}},
    }
    text_property = {
        "name": "text",
        "dataType": ["text"],
        "description": "Text content for the document",
    }
    doc_class = {
        "class": "Doc",
        "description": "A generic document class",
        "vectorizer": vectorizer_module,
        "moduleConfig": {"reranker-cohere": {"model": "rerank-multilingual-v2.0"}},
        "properties": [last_modified_property, text_property],
    }
    return {"classes": [doc_class]}

def upload_schema(my_schema, weaviate):
    """Replace the instance's schema with `my_schema`.

    Calls `schema.delete_all()` before `schema.create(my_schema)`, so whatever
    schema was there before is dropped first.

    Args:
        my_schema: Schema dict to create.
        weaviate: Client object exposing `.schema`. Inside this function the
            name shadows the imported `weaviate` module.
    """
    schema_api = weaviate.schema
    schema_api.delete_all()
    schema_api.create(my_schema)

# Create the embedded client, then reset the instance's schema to the one
# built by get_schema() (upload_schema deletes the existing schema first).
client = create_local_weaviate_client()
my_schema = get_schema()
upload_schema(my_schema, weaviate=client)

Started /Users/ryannikolaidis/.cache/weaviate-embedded: process ID 81456


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2023-09-15T09:55:21-07:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2023-09-15T09:55:21-07:00"}
{"action":"grpc_startup","level":"info","msg":"grpc server listening at [::]:50051","time":"2023-09-15T09:55:21-07:00"}
{"action":"hnsw_vector_cache_prefill","count":3000,"index_id":"doc_x0l4Hg7GaXLV","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2023-09-15T09:55:21-07:00","took":14541875}
{"action":"restapi_management","level":"info","msg":"Serving weaviate at http://127.0.0.1:6666","time":"2023-09-15T09:55:21-07:00"}
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"doc_XQDATrLw1byO","level":"info","limit":1000000000000,"msg":"pref

In [5]:
def get_chunks(elements, chunk_under_n_chars=500, chunk_new_after_n_chars=1500):
    """Chunk elements by title and reduce each chunk to a plain dict.

    Before chunking, each element's metadata is modified in place: the
    `data_source` attribute is deleted unless it is exactly a
    `DataSourceMetadata`, and `coordinates` is deleted when present.

    NOTE(review): the keyword names passed to `chunk_by_title` may differ
    between `unstructured` releases; confirm against the installed version.

    Args:
        elements: Elements to chunk (their metadata is mutated).
        chunk_under_n_chars: Forwarded as `combine_under_n_chars`.
        chunk_new_after_n_chars: Forwarded as `new_after_n_chars`.

    Returns:
        A list of dicts with keys `last_modified` and `text`, one per chunk.
    """
    for el in elements:
        meta = el.metadata
        if type(meta.data_source) is not DataSourceMetadata:
            delattr(meta, "data_source")
        if hasattr(meta, "coordinates"):
            delattr(meta, "coordinates")

    chunks = chunk_by_title(
        elements,
        combine_under_n_chars=chunk_under_n_chars,
        new_after_n_chars=chunk_new_after_n_chars,
    )

    # Swap each chunk for its dict form, keeping the same list object.
    chunks[:] = [
        {"last_modified": piece.metadata.last_modified, "text": piece.text}
        for piece in chunks
    ]
    return chunks


def add_data_to_weaviate(files, weaviate, chunk_under_n_chars=500, chunk_new_after_n_chars=1500):
    """Partition, chunk and batch-upload each JSON file into the `Doc` class.

    Each file is partitioned with `partition_json`, chunked with `get_chunks`,
    and every chunk is queued on the client's batch under a fresh random UUID.
    The batch is flushed once after all files have been processed.

    Args:
        files: Iterable of JSON file paths to ingest.
        weaviate: Client object exposing `.batch`. Inside this function the
            name shadows the imported `weaviate` module.
        chunk_under_n_chars: Forwarded to `get_chunks`.
        chunk_new_after_n_chars: Forwarded to `get_chunks`.

    A file whose partitioning or chunking raises `IndexError` is reported and
    skipped; other exceptions propagate.
    """
    for filename in files:
        try:
            elements = partition_json(filename=filename)
            chunks = get_chunks(elements, chunk_under_n_chars, chunk_new_after_n_chars)
        except IndexError as e:
            # Name the file so a skipped document can be traced from the output.
            print(f"Skipping {filename}: {e}")
            continue

        print(f"Uploading {len(chunks)} chunks for {filename}.")
        for chunk in chunks:
            weaviate.batch.add_data_object(
                data_object=chunk,
                # Same spelling as the schema definition and the query code.
                class_name="Doc",
                uuid=get_valid_uuid(uuid.uuid4()),
            )

    weaviate.batch.flush()

# Ingest every discovered file, overriding the default chunking thresholds
# (500 / 1500) with much smaller values (75 / 200).
add_data_to_weaviate(
    files=files,
    weaviate=client,
    chunk_under_n_chars=75,
    chunk_new_after_n_chars=200
)

Uploading 295 chunks for results/1ZBXs78r2SSgYUZI_ZOwZm_yt7A4n6NL8-FOMC20170201tealbookb20170126.pdf.json.
Uploading 6 chunks for results/1t2IWeYgHeq8IKUeBLgM9DQ_nwLCXq8Ch-FOMC_LongerRunGoals_201701.pdf.json.
Uploading 38 chunks for results/1TwH2WPshvwueAHlDiMfe9STLtFgOx-uB-NASA's SpaceX Crew-7 Launches to International Space Station.pdf.json.
Uploading 58 chunks for results/1hXnbJlGkJ86pJsbKVWhjkLRr4X17FK-J-Quantum Computing/1ppcP-AOkTB5ln7CqnwQy9l_Kvtftn4L0-NAE-report-on-quantum-computing.pdf.json.
Uploading 31 chunks for results/1hXnbJlGkJ86pJsbKVWhjkLRr4X17FK-J-Quantum Computing/1ps9S0TaMC02NahJAfp3NyZo6NVmf7rpM-COVID-19 Accelerates Quantum Computing Adoption, Driving Robust Growth in North America, Europe, and Asia-Pacific.pdf.json.
Uploading 20 chunks for results/1hXnbJlGkJ86pJsbKVWhjkLRr4X17FK-J-Quantum Computing/1AahuJfN_mmiqqP5E9byMj3cQBZJCanqmZW-YDSaCPC4-Quantum Photonics Market.docx.json.
Uploading 11 chunks for results/1hXnbJlGkJ86pJsbKVWhjkLRr4X17FK-J-Quantum Computing/180

In [7]:
# query weaviate

def query_weaviate(client, query, limit: int = 10):
    """Run a near-text query against the `Doc` class.

    Args:
        client: Client object exposing `.query`.
        query: Text passed as the single entry of the near-text `concepts`.
        limit: Value passed to `with_limit`.

    Returns:
        Whatever the query builder's `.do()` returns; the request asks for the
        `text` and `last_modified` properties plus the additional `distance`.
    """
    near_text = {'concepts': [query]}
    builder = client.query.get('Doc', ['text', 'last_modified'])
    builder = builder.with_near_text(near_text)
    builder = builder.with_additional("distance")
    builder = builder.with_limit(limit)
    return builder.do()

def print_query_and_results(query, results):
    """Print a query banner followed by the text of each returned `Doc`.

    Args:
        query: The query string shown in the banner.
        results: Response dict; hits are read from
            `results["data"]["Get"]["Doc"]`.

    If the response carries an `errors` entry it is printed, and a missing or
    null hit list is treated as empty rather than raising while iterating.
    """
    print("\n\n\n-------------------------")
    print(f"QUERY: {query}")
    print("-------------------------")

    errors = results.get("errors")
    if errors:
        print("\n\n-- ERRORS:\n")
        print(errors)

    # Any level of the response may be absent or null when the query failed.
    hits = ((results.get("data") or {}).get("Get") or {}).get("Doc") or []
    for index, result in enumerate(hits):
        print(f"\n\n-- RESULT {index}:\n")
        print(result["text"])

# Natural-language questions to run against the indexed chunks.
queries = [
    "Explore the significance of the Federal Reserve's inflation target and how it is explained.",
    "Find mentions of commercial interest in post-quantum cryptography.",

]

# Fetch up to 5 matches per query and print them.
for query in queries:
    results = query_weaviate(client, query, limit=5)
    print_query_and_results(query, results)







-------------------------
QUERY: Explore the significance of the Federal Reserve's inflation target and how it is explained.
-------------------------


-- RESULT 0:

The inflation rate over the longer run is primarily determined by monetary policy, and hence the Committee has the ability to specify a longer-run goal for inflation. The Commit- tee reaffirms its judgment that inflation at the rate of 2 percent, as measured by the annual change in the price index for personal con- sumption expenditures, is most consistent over the longer run with the Federal Reserve’s statutory mandate. The Committee would be concerned if inflation were running persistent- ly above or below this objective. Communi- cating this symmetric inflation goal clearly to the public helps keep longer-term inflation expectations firmly anchored, thereby foster- ing price stability and moderate long-term interest rates and enhancing the Committee’s


-- RESULT 1:

4. In determining the timing and size of future a