In [50]:
from txtai.embeddings import Embeddings
from txtai.pipeline import Extractor, LLM

In [51]:
# Example corpus for retrieval: each entry is a two-item list of
#   [free-form user request, canonical comma-separated step list].
# The whole list is passed to embeddings.index(QA) in a later cell.
QA = [
    # NOTE(review): "count cluster" (singular) differs from the "count clusters"
    # used by the other clustering entries; it is kept verbatim because the
    # recorded outputs further down quote this exact text.
    ["Given the objects dataset, how many types of objects can be found, according to their volume?",
     "Load dataset, calculate volume, cluster, count cluster"],

    ["Train a model that can estimate density based on surface and mass given the objects dataset.",
     "Load dataset, calculate density, split dataset, perform regression density using surface and mass, calculate MSE"],

    ["Calculate the regression of mass using height in the objects dataset, and test the model.",
     "Load dataset, split dataset, perform regression mass using height, calculate MSE"],

    ["Train a model to estimate density using surface and mass data from the object dataset.",
     "Load dataset, calculate density, split dataset, perform regression density using surface and mass, calculate MSE"],

    ["Determine the variety of objects in the dataset based on their volume.",
     "Load dataset, calculate volume, cluster, count clusters"],

    ["Calculate the regression of mass using height within the dataset of objects.",
     "Load dataset, split dataset, perform regression mass using height, calculate MSE"],

    ["How many clusters can be identified within the dataset based on height?",
     "Load dataset, cluster using height, count clusters"],

    ["Calculate the density of objects using their volume and mass from the dataset.",
     "Load dataset, calculate density"],

    ["Identify different types of objects based on volume and surface area.",
     "Load dataset, calculate volume, cluster using volume and surface"],

    ["What is the volume for all objects within the dataset?",
     "Load dataset, calculate volume"],

    ["How many objects in the dataset have a specific ID?",
     "Load dataset, count using ID"],

    ["Predict volume based on height through regression analysis.",
     "Load dataset, calculate volume, split dataset, perform regression volume using height, calculate MSE"],

    ["Cluster objects based on height and mass within the dataset.",
     "Load dataset, cluster using height and mass"],

    # Fixed copy-paste error: the steps previously read "perform regression
    # density using surface and mass", which contradicted the request
    # (target is volume, predictors are surface and height).
    ["Estimate volume using surface and height through regression.",
     "Load dataset, calculate volume, split dataset, perform regression volume using surface and height, calculate MSE"],

    ["How many clusters can be identified within the dataset based on volume?",
     "Load dataset, calculate volume, cluster, count clusters"],
]


In [52]:
# Embeddings database used for retrieval.
# - autoid="uuid5": the ids seen in the recorded outputs are UUID strings.
# - content=True: presumably what allows the later "select id, text ..." queries
#   to return the stored text -- confirm against the txtai documentation.
embeddings = Embeddings(
    autoid="uuid5",
    content=True,
)

In [53]:
# Build the index over the QA pairs. The recorded REFERENCE outputs further down
# show each [request, steps] pair stored as a single text row, with the request
# and the steps joined by a space.
embeddings.index(QA)

In [54]:
# Language model used to produce the final answer: a quantized (Q4_K_M) GGUF build
# of Mistral-7B-OpenOrca, with verbose output switched off.
llm = LLM(
    "TheBloke/Mistral-7B-OpenOrca-GGUF/mistral-7b-openorca.Q4_K_M.gguf",
    verbose=False,
)

In [55]:
# Prompt with system / user / assistant turns delimited by <|im_start|> and <|im_end|>.
# {question} and {context} are placeholders; the string is handed to the Extractor
# through its template= argument, which presumably fills them in per query.
# NOTE(review): the instructions mention an "original" and a "rewrited" part of the
# context, but the indexed rows shown in the recorded REFERENCE outputs carry no such
# labels -- confirm the model can tell the two halves apart. "rewrited" is also a typo
# for "rewritten"; it is left untouched here because the text is part of the prompt.
# NOTE(review): one recorded ANSWER starts with "<|im_end|>" followed by several context
# rows, which suggests the trailing "Text:" section may confuse the model -- verify.
template = """<|im_start|>system
You are an assistant that will rewrite sentences from users using a specific vocabulary.<|im_end|>
<|im_start|>user
Find the best matching text in the context for the input, try to find something similar in the "original" part of the context. 
The response should be the corresponding "rewrited" part from the context, you can slightly adapt if needed (attribute names).

Question:
{question}

Context:
{context}

Text:
<|im_end|>
<|im_start|>assistant
"""

In [56]:
# Retrieval + generation pipeline: looks up context in `embeddings`, then prompts `llm`
# using `template`. With output="reference" the results are read below through the
# "answer" and "reference" keys.
extractor = Extractor(
    embeddings,
    llm,
    template=template,
    separator="\n",
    output="reference",
)

In [57]:
import os

# Set KMP_DUPLICATE_LIB_OK for this process before the first extractor call.
# NOTE(review): presumably a workaround for a duplicate OpenMP runtime error; this cell
# runs after the LLM has already been constructed -- consider moving it (and the import)
# to the top of the notebook so it is in effect before any native library is loaded.
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

In [58]:
# Query 1: request with predictor and target swapped relative to an indexed example.
result = extractor(
    "Predict height based on volume through regression."
)

In [59]:
# Show the generated answer, then look up the indexed row whose id the extractor
# reported as the reference.
print("ANSWER:", result["answer"])
print(
    "REFERENCE:",
    embeddings.search(
        "select id, text from txtai where id = :id",
        parameters={"id": result["reference"]},
    ),
)


ANSWER: Predict volume based on height through regression analysis. Load dataset, calculate volume, split dataset, perform regression volume using height, calculate MSE
REFERENCE: [{'id': '3de91233-a56b-5184-8163-fbef86296785', 'text': 'Predict volume based on height through regression analysis. Load dataset, calculate volume, split dataset, perform regression volume using height, calculate MSE'}]


In [60]:
# Query 2: clustering-style request phrased as a question.
result2 = extractor(
    "How many different types of objects based on volume and height."
)

In [61]:
# Show the generated answer, then look up the indexed row whose id the extractor
# reported as the reference.
print("ANSWER:", result2["answer"])
print(
    "REFERENCE:",
    embeddings.search(
        "select id, text from txtai where id = :id",
        parameters={"id": result2["reference"]},
    ),
)


ANSWER: <|im_end|>
Identify different types of objects based on volume and surface area. Load dataset, calculate volume, cluster using volume and surface area
Given the objects dataset, how many types of objects can be found, according to their volume? Load dataset, calculate volume, cluster, count cluster
Determine the variety of objects in the dataset based on their volume. Load dataset, calculate volume, cluster, count clusters
REFERENCE: [{'id': '684fe487-98c6-52bf-bf21-1ff89a7c403c', 'text': 'Given the objects dataset, how many types of objects can be found, according to their volume? Load dataset, calculate volume, cluster, count cluster'}]


In [62]:
# Query 7: regression request with a different target (volume) and a single predictor.
result7 = extractor(
    "Train a model that can estimate volume based on surface given the objects dataset."
)
print("ANSWER:", result7["answer"])
print(
    "REFERENCE:",
    embeddings.search(
        "select id, text from txtai where id = :id",
        parameters={"id": result7["reference"]},
    ),
)

ANSWER: Train a model that can estimate volume based on surface given the objects dataset.
REFERENCE: [{'id': '87d65422-c215-549e-94a5-f2afb0115ff4', 'text': 'Train a model that can estimate density based on surface and mass given the objects dataset. Load dataset, calculate density, split dataset, perform regression density using surface and mass, calculate MSE'}]


In [63]:
# Query 3: same wording as an indexed example, with "surface area" replaced by "height".
result3 = extractor(
    "Identify different types of objects based on volume and height."
)
print("ANSWER:", result3["answer"])
print(
    "REFERENCE:",
    embeddings.search(
        "select id, text from txtai where id = :id",
        parameters={"id": result3["reference"]},
    ),
)

ANSWER: Given the objects dataset, identify different types of objects based on volume and height. Load dataset, calculate volume, cluster using volume and height, determine variety of objects in dataset.
REFERENCE: [{'id': 'c9fcc043-9fb6-576b-bf20-f7cc32de740c', 'text': 'Identify different types of objects based on volume and surface area. Load dataset, calculate volume, cluster using volume and surface'}]


In [64]:
# Query 4: paraphrase of the indexed density-calculation request.
result4 = extractor(
    "Determine the density of objects using volume and mass data."
)
print("ANSWER:", result4["answer"])
print(
    "REFERENCE:",
    embeddings.search(
        "select id, text from txtai where id = :id",
        parameters={"id": result4["reference"]},
    ),
)

ANSWER: Calculate the density of objects using their volume and mass from the dataset. Load dataset, calculate density
REFERENCE: [{'id': '35e603b9-eae8-56d5-9f37-6df82bc62837', 'text': 'Calculate the density of objects using their volume and mass from the dataset. Load dataset, calculate density'}]


In [65]:
# Query 5: clustering request using the attributes of the indexed "identify types" example.
result5 = extractor(
    "Cluster objects based on volume and surface area within the dataset."
)
print("ANSWER:", result5["answer"])
print(
    "REFERENCE:",
    embeddings.search(
        "select id, text from txtai where id = :id",
        parameters={"id": result5["reference"]},
    ),
)

ANSWER: Cluster objects based on volume and surface area within the dataset.
REFERENCE: [{'id': 'c9fcc043-9fb6-576b-bf20-f7cc32de740c', 'text': 'Identify different types of objects based on volume and surface area. Load dataset, calculate volume, cluster using volume and surface'}]


In [66]:
# Query 6: regression request with height as the target and two predictors.
result6 = extractor(
    "Estimate height using volume and mass through regression analysis."
)
print("ANSWER:", result6["answer"])
print(
    "REFERENCE:",
    embeddings.search(
        "select id, text from txtai where id = :id",
        parameters={"id": result6["reference"]},
    ),
)

ANSWER: Estimate volume based on height through regression analysis. Load dataset, calculate volume, split dataset, perform regression volume using height, and calculate MSE
REFERENCE: [{'id': '3de91233-a56b-5184-8163-fbef86296785', 'text': 'Predict volume based on height through regression analysis. Load dataset, calculate volume, split dataset, perform regression volume using height, calculate MSE'}]
