In [None]:
from tests.test_passage_encoding import get_passage_embedding_from_source 

In [None]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if present) into the process
# environment, so the API key never has to be written into the notebook.
load_dotenv()

# Stays None when WEAVIATE_APIKEY is not set anywhere.
weaviate_apikey = os.environ.get("WEAVIATE_APIKEY")

In [None]:
import weaviate

# Connect to the Weaviate instance on localhost, authenticating with the API
# key read from the environment.
api_key_auth = weaviate.auth.AuthApiKey(weaviate_apikey)
client = weaviate.Client("http://localhost:8080", auth_client_secret=api_key_auth)


In [None]:
# Display the server metadata; presumably used here as a quick check that the
# connection and API key work.
client.get_meta()

In [None]:
from askem.utils import generate_api_key

# Call the project helper and display whatever it returns.
# NOTE(review): if the return value is a real secret, clear this cell's output
# before committing the notebook.
generate_api_key()

In [None]:
# Sample sentence fed to the passage-embedding helper in the cells below.
test_passage = (
    "Where an aircraft passes through a cloud, it can disperse the cloud in its path."
)

In [None]:
# Run the imported helper on the sample passage and display what it returns
# (presumably the passage embedding — confirm against the helper).
get_passage_embedding_from_source(test_passage)

In [None]:
# Keep the helper's result for inspection in the following cells.
# `test_passage_encoding` is never imported or defined in this notebook, so
# calling it raised NameError on a fresh kernel; the helper that IS imported at
# the top of the notebook is used instead.
# NOTE(review): later cells call `.detach().numpy()` on `test_y`, which assumes
# the helper returns a torch tensor — confirm.
test_y = get_passage_embedding_from_source(test_passage)

In [None]:
# Show the Python type of the stored result.
type(test_y)

In [None]:
# Display the stored result itself.
test_y

In [None]:
# Length of the first row after converting to NumPy — presumably the embedding
# dimensionality. Assumes test_y is a torch tensor with a leading batch axis
# (TODO confirm).
len(test_y.detach().numpy()[0])

In [None]:
from pathlib import Path

# Lazily walk the debug data directory for every .txt file, at any depth.
input_dir = "/askem/data/debug_data"
files = Path(input_dir).rglob("*.txt")


In [None]:
import weaviate

# Second connection cell: builds the same client (same endpoint, same API key)
# as the connection cell near the top of the notebook.
client = weaviate.Client(
    "http://localhost:8080",
    auth_client_secret=weaviate.auth.AuthApiKey(api_key=weaviate_apikey),
)


In [None]:
# NOTE(review): the weaviate Python client does not appear to document a
# `vectorizer` attribute on Client, so this line likely raises AttributeError —
# confirm against the installed client version, or drop the cell.
client.vectorizer.create("")

In [None]:
# Display the server metadata for the freshly created client.
client.get_meta()


### Preprocessing

In [None]:
from haystack import Pipeline
from haystack.nodes import PreProcessor, TextConverter

# Two-stage Haystack pipeline: text file -> converted text -> preprocessed chunks.
pipeline = Pipeline()

# Stage 1: conversion node, fed directly from the pipeline's "File" input.
text_converter = TextConverter()
pipeline.add_node(text_converter, name="text_converter", inputs=["File"])

# Stage 2: cleaning plus word-based splitting into 200-word pieces with a
# 5-word overlap; sentence boundaries are not respected when splitting.
preprocessor_options = dict(
    clean_whitespace=True,
    clean_header_footer=True,
    clean_empty_lines=True,
    split_by="word",
    split_length=200,
    split_respect_sentence_boundary=False,
    split_overlap=5,
)
preprocessor = PreProcessor(**preprocessor_options)
pipeline.add_node(preprocessor, name="preprocessor", inputs=["text_converter"])


#### Run preprocessing

In [None]:
# Collect every .txt file under the debug data directory and push the paths
# (as plain strings) through the preprocessing pipeline.
input_dir = "/askem/data/debug_data"
files = Path(input_dir).glob("**/*.txt")
docs = pipeline.run(file_paths=list(map(str, files)))

### Upload data to Weaviate

In [None]:
# Wipe the whole schema before the Passage class is created in the next cell.
# NOTE: destructive — per the Weaviate client docs, delete_all removes every
# class together with the objects stored in it.
client.schema.delete_all()

In [None]:
import json

# Properties stored on every passage; `paper_id` is marked to be skipped by the
# text2vec-transformers module.
passage_properties = [
    {"name": "content", "dataType": ["text"]},
    {
        "name": "paper_id",
        "dataType": ["text"],
        "moduleConfig": {"text2vec-transformers": {"skip": True}},
    },
]

# Create passage schema
passage_schema = {
    "class": "Passage",
    "description": "Paragraph chunk of a document",
    "vectorizer": "text2vec-transformers",
    "moduleConfig": {"text2vec-transformers": {"vectorizeClassName": False}},
    # DPR is designed for dot-product similarity
    "vectorIndexConfig": {"distance": "dot"},
    "properties": passage_properties,
}

# Create class in Weaviate
client.schema.create_class(passage_schema)

# Dump the class definition, as reported back by Weaviate, to file
with open("askem/schema/passage.json", "w") as schema_file:
    schema_file.write(json.dumps(client.schema.get("passage"), indent=2))

## Load data into Weaviate

In [None]:
# One hand-written object, used to check that an import works end to end.
data_obj = {
    "content": "Where an aircraft passes through a cloud, it can disperse the cloud in its path.",
    "paper_id": "Title of the document",
}

# The batch context manager sends whatever is still queued when the block exits.
with client.batch as passage_batch:
    passage_batch.batch_size = 1
    passage_batch.dynamic = True
    passage_batch.add_data_object(data_object=data_obj, class_name="passage")

In [None]:
# Aggregate query with a meta count: shows how many objects the class holds.
client.query.aggregate("passage").with_meta_count().do()

In [None]:
# Near-text search for the concept "animal", returning each hit's content
# together with its vector, distance and id.
near_text_query = client.query.get("passage", ["content"])
near_text_query = near_text_query.with_near_text({"concepts": ["animal"]})
near_text_query = near_text_query.with_additional(["vector", "distance", "id"])
y = near_text_query.do()

In [None]:
# Display the raw query response.
y

In [None]:
# Length of the vector attached to the first hit, i.e. the dimensionality of
# the embeddings stored for the Passage class.
len(y["data"]["Get"]["Passage"][0]["_additional"]["vector"])

In [None]:
# Import a second, throw-away sentence through the batch API.
text = "I am not sure about this."

with client.batch as batch:
    batch.batch_size = 50
    batch.dynamic = True

    # The Passage class declares `content` and `paper_id` only; there is no
    # `passage` property, so the text is stored under `content` to match the
    # schema (and to be the field the near-text queries read).
    batch.add_data_object({"content": text}, class_name="passage")


In [None]:
# Fetch stored passages together with their vectors. `vector` is not a class
# property — it is only available through `_additional` — and `content` is the
# text property declared in the Passage schema.
client.query.get("passage", ["content"]).with_additional(["vector"]).do()


In [None]:
# Clean-up: wipe the whole schema again.
# NOTE: destructive — per the Weaviate client docs, delete_all removes every
# class together with the objects stored in it.
client.schema.delete_all()

In [None]:
# Look up the Passage class definition.
# NOTE(review): in notebook order this runs right after delete_all(), so the
# class presumably no longer exists and the lookup is expected to fail —
# confirm whether that is the intended check.
client.schema.get("passage")


In [None]:
# Aggregate query with a meta count for the Passage class.
# NOTE(review): if the schema was just wiped, the response presumably carries
# an error rather than a count — confirm.
client.query.aggregate("Passage").with_meta_count().do()


In [None]:
# List the class definitions currently present in the schema.
client.schema.get()["classes"]


In [None]:
# Show which fields the server metadata reports for the text2vec-transformers
# module.
client.get_meta()["modules"]["text2vec-transformers"].keys()