In [None]:
!pip install weaviate-client

In [1]:
# load creds from .env file
# Each non-empty, non-comment line is expected to look like KEY=VALUE and is
# copied verbatim into os.environ.
import os
with open('.env', 'r') as env_file:
    for line in env_file:
        line = line.strip()
        # Blank lines and '#' comments carry no KEY=VALUE pair; skip them
        # instead of failing on the unpacking below.
        if not line or line.startswith('#'):
            continue
        # Split on the first '=' only: values (e.g. base64-padded tokens) may
        # themselves contain '='. A line with no '=' still raises ValueError.
        key, value = line.split('=', 1)
        os.environ[key] = value

In [2]:
import os
import uuid

import weaviate
from weaviate.embedded import EmbeddedOptions
from weaviate.util import get_valid_uuid

from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import DataSourceMetadata
from unstructured.partition.json import partition_json

# API keys read from the process environment; each is None when its variable is unset.
COHERE_KEY = os.getenv("COHERE_API_KEY")
OPENAI_KEY = os.getenv("OPENAI_API_KEY")

In [3]:
# fetch our partitioned documents

def get_result_files(folder_path):
    """Return the paths of every ``.json`` file found under ``folder_path``.

    The directory tree is walked recursively with ``os.walk``; each returned
    path is the walked directory joined with the file name. A folder that
    yields no matching files produces an empty list.
    """
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(folder_path)
        for name in names
        if name.endswith('.json')
    ]

# Gather every partitioned JSON output below results/ and display the list.
files = get_result_files(folder_path='results/')
files

['results/1ZBXs78r2SSgYUZI_ZOwZm_yt7A4n6NL8-FOMC20170201tealbookb20170126.pdf.json',
 'results/1ppcP-AOkTB5ln7CqnwQy9l_Kvtftn4L0-NAE-report-on-quantum-computing.pdf.json',
 'results/1ps9S0TaMC02NahJAfp3NyZo6NVmf7rpM-COVID-19 Accelerates Quantum Computing Adoption, Driving Robust Growth in North America, Europe, and Asia-Pacific.pdf.json',
 "results/1TwH2WPshvwueAHlDiMfe9STLtFgOx-uB-NASA's SpaceX Crew-7 Launches to International Space Station.pdf.json",
 'results/1AahuJfN_mmiqqP5E9byMj3cQBZJCanqmZW-YDSaCPC4-Quantum Photonics Market.docx.json',
 'results/180MZG3uAnSsSUbWtTqm2Gmv4GYpe2dnY-honeywell-quantum-encryption.docx.json',
 'results/EmailMessage/02sHn00001EwshAIAR.json',
 'results/EmailMessage/02sHn00001EwshBIAR.json',
 'results/EmailMessage/02sHn00001EwshFIAR.json',
 'results/EmailMessage/02sHn00001EwshtIAB.json',
 'results/Lead/00QHn000021npFsMAI.json',
 'results/Lead/00QHn000021npG6MAI.json',
 'results/Lead/00QHn000021npFzMAI.json',
 'results/Lead/00QHn000021npFoMAI.json',
 'resu

In [4]:
# Set up a weaviate client with document schema

def create_local_weaviate_client():
    """Build a ``weaviate.Client`` configured with ``EmbeddedOptions``.

    The embedded options enable the ``text2vec-openai`` and ``reranker-cohere``
    modules through the ``ENABLE_MODULES`` environment variable. The OpenAI and
    Cohere API keys (module-level ``OPENAI_KEY`` / ``COHERE_KEY``) are sent as
    additional request headers.
    """
    embedded = EmbeddedOptions(
        additional_env_vars={
            "ENABLE_MODULES": "text2vec-openai,reranker-cohere"
        },
    )
    api_key_headers = {
        "X-OpenAI-Api-Key": OPENAI_KEY,
        "X-Cohere-API-Key": COHERE_KEY,
    }
    return weaviate.Client(
        embedded_options=embedded,
        additional_headers=api_key_headers,
    )

def get_schema():
    """Return the schema dict describing the single ``Doc`` class.

    ``Doc`` uses the ``text2vec-openai`` vectorizer and carries a
    ``reranker-cohere`` module config. It has two text properties:
    ``last_modified`` (flagged to be skipped by the vectorizer) and ``text``.
    """
    vectorizer = "text2vec-openai"

    # Kept on the object, but marked "skip" so it is not part of the vectorization.
    last_modified_property = {
        "name": "last_modified",
        "dataType": ["text"],
        "description": "Last modified date for the document",
        "moduleConfig": {vectorizer: {"skip": True}},
    }
    text_property = {
        "name": "text",
        "dataType": ["text"],
        "description": "Text content for the document",
    }
    doc_class = {
        "class": "Doc",
        "description": "A generic document class",
        "vectorizer": vectorizer,
        "moduleConfig": {"reranker-cohere": {"model": "rerank-multilingual-v2.0"}},
        "properties": [last_modified_property, text_property],
    }
    return {"classes": [doc_class]}

def upload_schema(my_schema, weaviate):
    """Replace the schema on the given client with ``my_schema``.

    ``weaviate`` is a client object (the parameter name shadows the module
    inside this function). ``schema.delete_all()`` is called first, then
    ``schema.create(my_schema)`` -- NOTE(review): delete_all presumably also
    drops any objects stored under the old classes; confirm before reuse.
    """
    schema_api = weaviate.schema
    schema_api.delete_all()
    schema_api.create(my_schema)

# Create the client, then (re)create the Doc schema on it.
client = create_local_weaviate_client()
my_schema = get_schema()
upload_schema(my_schema=my_schema, weaviate=client)

Started /Users/ryannikolaidis/.cache/weaviate-embedded: process ID 54012


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2023-09-08T00:37:52-07:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2023-09-08T00:37:52-07:00"}
{"action":"grpc_startup","level":"info","msg":"grpc server listening at [::]:50051","time":"2023-09-08T00:37:52-07:00"}
{"action":"hnsw_vector_cache_prefill","count":5000,"index_id":"doc_ms1uMSeBBujF","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2023-09-08T00:37:52-07:00","took":11271709}
{"action":"restapi_management","level":"info","msg":"Serving weaviate at http://127.0.0.1:6666","time":"2023-09-08T00:37:52-07:00"}
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"doc_3f58oVecGI4K","level":"info","limit":1000000000000,"msg":"pref

In [5]:
def get_chunks(elements, chunk_under_n_chars=500, chunk_new_after_n_chars=1500):
    """Chunk ``elements`` by title and return them as plain dicts.

    Before chunking, each element's metadata is modified in place:
    ``data_source`` is removed unless it is exactly a ``DataSourceMetadata``
    instance, and ``coordinates`` is removed whenever it is present.

    Args:
        elements: Elements to chunk; their ``metadata`` objects are mutated.
        chunk_under_n_chars: Forwarded to ``chunk_by_title`` as
            ``combine_under_n_chars``.
        chunk_new_after_n_chars: Forwarded to ``chunk_by_title`` as
            ``new_after_n_chars``.

    Returns:
        A list of ``{"last_modified": ..., "text": ...}`` dicts, one per chunk.
    """
    for element in elements:
        metadata = element.metadata

        if type(metadata.data_source) is not DataSourceMetadata:
            # A data_source assigned as a dict (rather than a
            # DataSourceMetadata) breaks chunking, so drop it.
            del metadata.data_source

        if hasattr(metadata, "coordinates"):
            # Coordinates map each element to its own chunk, which yields
            # lots of small chunks, so drop them as well.
            del metadata.coordinates

    titled_chunks = chunk_by_title(
        elements,
        combine_under_n_chars=chunk_under_n_chars,
        new_after_n_chars=chunk_new_after_n_chars,
    )

    return [
        {"last_modified": piece.metadata.last_modified, "text": piece.text}
        for piece in titled_chunks
    ]


def add_data_to_weaviate(files, weaviate, chunk_under_n_chars=500, chunk_new_after_n_chars=1500):
    """Partition, chunk and batch-upload each JSON file as ``Doc`` objects.

    Args:
        files: Iterable of paths to partitioned ``.json`` files.
        weaviate: Client object exposing ``batch.add_data_object`` and
            ``batch.flush`` (the name shadows the module inside this function).
        chunk_under_n_chars: Passed through to ``get_chunks``.
        chunk_new_after_n_chars: Passed through to ``get_chunks``.

    A file whose partitioning or chunking raises ``IndexError`` is reported
    (with its filename) and skipped; other exceptions propagate. The batch is
    flushed once after all files have been queued.
    """
    for filename in files:
        try:
            elements = partition_json(filename=filename)
            chunks = get_chunks(elements, chunk_under_n_chars, chunk_new_after_n_chars)
        except IndexError as e:
            # Name the offending file; the bare exception text alone does not
            # say which input was skipped.
            print(f"Skipping {filename}: {e}")
            continue

        print(f"Uploading {len(chunks)} chunks for {str(filename)}.")
        for chunk in chunks:
            weaviate.batch.add_data_object(
                data_object=chunk,
                # Same spelling as the class in the schema and in the queries.
                class_name="Doc",
                uuid=get_valid_uuid(uuid.uuid4()),
            )

    weaviate.batch.flush()

# Upload every result file, overriding the default chunk sizes with
# smaller values (75 / 200 characters).
add_data_to_weaviate(files, client, chunk_under_n_chars=75, chunk_new_after_n_chars=200)

Uploading 295 chunks for results/1ZBXs78r2SSgYUZI_ZOwZm_yt7A4n6NL8-FOMC20170201tealbookb20170126.pdf.json.
Uploading 58 chunks for results/1ppcP-AOkTB5ln7CqnwQy9l_Kvtftn4L0-NAE-report-on-quantum-computing.pdf.json.
Uploading 31 chunks for results/1ps9S0TaMC02NahJAfp3NyZo6NVmf7rpM-COVID-19 Accelerates Quantum Computing Adoption, Driving Robust Growth in North America, Europe, and Asia-Pacific.pdf.json.
Uploading 38 chunks for results/1TwH2WPshvwueAHlDiMfe9STLtFgOx-uB-NASA's SpaceX Crew-7 Launches to International Space Station.pdf.json.
Uploading 20 chunks for results/1AahuJfN_mmiqqP5E9byMj3cQBZJCanqmZW-YDSaCPC4-Quantum Photonics Market.docx.json.
Uploading 11 chunks for results/180MZG3uAnSsSUbWtTqm2Gmv4GYpe2dnY-honeywell-quantum-encryption.docx.json.
Uploading 7 chunks for results/EmailMessage/02sHn00001EwshAIAR.json.
Uploading 7 chunks for results/EmailMessage/02sHn00001EwshBIAR.json.
Uploading 6 chunks for results/EmailMessage/02sHn00001EwshFIAR.json.
Uploading 7 chunks for results/E

In [6]:
# Query weaviate

def query_weaviate(client, query, limit: int = 10):
    """Run a near-text query for ``query`` against the ``Doc`` class.

    Requests the ``text`` and ``last_modified`` properties plus the
    ``distance`` additional field, capped at ``limit`` results, and returns
    whatever the query builder's ``do()`` call returns.
    """
    builder = client.query.get('Doc', ['text', 'last_modified'])
    builder = builder.with_near_text({'concepts': [query]})
    builder = builder.with_additional("distance")
    builder = builder.with_limit(limit)
    return builder.do()

def print_query_and_results(query, results):
    """Print ``query`` in a banner followed by the text of each result.

    ``results`` is indexed as ``results["data"]["Get"]["Doc"]``; every entry
    in that list must have a ``"text"`` key. Results are numbered from 0.
    """
    print(
        "\n\n\n-------------------------",
        f"query: {query}",
        "-------------------------",
        sep="\n",
    )
    hits = results["data"]["Get"]["Doc"]
    for position, hit in enumerate(hits):
        print(f"\n\n-- RESULT {position}:\n")
        print(hit["text"])

# Natural-language prompts to run against the uploaded chunks.
queries = [
    "Explore the significance of the Federal Reserve's inflation target and how it is explained.",
    "Look for information regarding the factors considered by the FOMC when it comes to maximum employment.",
    "Find mentions of commercial interest in post-quantum cryptography.",
    "Investigate any discussions or explanations regarding the significance and function of Shor's algorithm in quantum computing",
]

# Fetch the top five matches for each prompt and print them.
for query in queries:
    results = query_weaviate(client=client, query=query, limit=5)
    print_query_and_results(query=query, results=results)







-------------------------
query: Explore the significance of the Federal Reserve's inflation target and how it is explained.
-------------------------


-- RESULT 0:

One notable aspect is the FOMC's inflation target of 2 percent over the longer run. This symmetric inflation goal is communicated clearly to anchor longer-term inflation expectations, which can, in turn, influence price stability and long-term interest rates. Understanding how the Federal Reserve's inflation target aligns with our investment strategies could be crucial in navigating the financial markets.


-- RESULT 1:

It's essential that we stay informed about the Federal Reserve's monetary policy and how it might affect our investment strategies. I suggest we schedule a meeting to delve deeper into this topic and explore ways to adjust our investment approach accordingly.


-- RESULT 2:

I believe that staying informed about the Federal Reserve's monetary policy strategy and its implications for our investments is 