In [1]:
from haystack.pipeline import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.fetchers import LinkContentFetcher
from haystack.components.converters import HTMLToDocument
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy



# In-memory store that the indexing pipeline writes into.
document_store = InMemoryDocumentStore()

# One instance per indexing stage. The names stay module-level so other cells
# can keep referring to them.
link_fetcher = LinkContentFetcher()
converter = HTMLToDocument()
splitter = DocumentSplitter(split_by="sentence", split_length=150, split_overlap=5)
embedder = SentenceTransformersDocumentEmbedder()
writer = DocumentWriter(
    document_store=document_store,
    policy=DuplicatePolicy.OVERWRITE,
)

# Stages in execution order: fetch -> convert -> split -> embed -> write.
indexing_stages = [
    ("link_fetcher", link_fetcher),
    ("converter", converter),
    ("splitter", splitter),
    ("embedder", embedder),
    ("writer", writer),
]

indexing_pipeline = Pipeline()
for stage_name, stage in indexing_stages:
    indexing_pipeline.add_component(stage_name, stage)

# Wire every stage to the one that follows it.
stage_names = [stage_name for stage_name, _ in indexing_stages]
for upstream, downstream in zip(stage_names, stage_names[1:]):
    indexing_pipeline.connect(upstream, downstream)

# The original cell ended on a connect() call, which displayed the pipeline;
# echo it explicitly so the cell output stays the same.
indexing_pipeline

<haystack.pipeline.Pipeline at 0x7f807d31a020>

In [2]:
# Bytewax documentation pages to fetch and index, in the order they are
# handed to the fetcher.
bytewax_doc_urls = [
    # Getting-started guides
    "https://bytewax.io/docs/getting-started/overview",
    "https://bytewax.io/docs/getting-started/simple-example",
    "https://bytewax.io/docs/getting-started/window-collect-example",
    "https://bytewax.io/docs/getting-started/join-example",
    "https://bytewax.io/docs/getting-started/wordcount-example",
    "https://bytewax.io/docs/getting-started/polling-input-example",
    "https://bytewax.io/docs/getting-started/recovering-snapshot",
    # Concept pages
    "https://bytewax.io/docs/concepts/workers-and-parallelization",
    "https://bytewax.io/docs/concepts/dataflow-programming",
    "https://bytewax.io/docs/concepts/joins",
    "https://bytewax.io/docs/concepts/windowing",
    "https://bytewax.io/docs/concepts/recovery",
    "https://bytewax.io/docs/concepts/rescaling",
    "https://bytewax.io/docs/concepts/pubsub-to-polling",
    "https://bytewax.io/docs/getting-started/execution",
    # API reference
    "https://bytewax.io/apidocs/bytewax.operators/index",
    "https://bytewax.io/apidocs/bytewax.inputs",
    "https://bytewax.io/apidocs/bytewax.outputs",
    "https://bytewax.io/apidocs/bytewax.connectors/index",
    "https://bytewax.io/apidocs/bytewax.connectors/demo",
    "https://bytewax.io/apidocs/bytewax.connectors/files",
    "https://bytewax.io/apidocs/bytewax.connectors/kafka/index",
    "https://bytewax.io/apidocs/bytewax.connectors/stdio",
]

# Only the first stage needs explicit input; the run summary is left as the
# cell's last expression so it is displayed.
indexing_pipeline.run(data={"link_fetcher": {"urls": bytewax_doc_urls}})

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

{'writer': {'documents_written': 46}}

In [3]:
from dotenv import load_dotenv
import os

# Load variables from the .env file two directories above this notebook
# into the process environment.
load_dotenv("./../../.env")

# None when OPENAI_API_KEY is not defined in the environment.
open_ai_key = os.environ.get("OPENAI_API_KEY")

In [8]:
import torch

from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.generators import GPTGenerator

# Jinja template for the answer-generation step.
#   - `documents` is supplied by the retriever and `query` by the caller of
#     pipeline.run().
#   - This is a regular (non-raw) string, so each "\n" is already a real
#     newline by the time the template is rendered.
#   - The template deliberately ends with "As stated in" so the model continues
#     that sentence; replies therefore start right after those words.
# Fix: a stray ";" after `{% endfor %}` used to be rendered into every prompt.
prompt_template = """
You are an expert Python software engineer, you are asked to write code, 
explain code and you use the context provided to generate accurate and functional code along with clear explanations.
After you define a class, you also provide examples of using the class and its methods.
You must only use information from the given documents and cite the documents you used by mentioning their URL in the answer.
For example, begin your answer with ‘As stated in URL, ...’.
If the documents do not contain the answer to the question, say that ‘Answer is unknown.’
Context:
{% for doc in documents %}
    Document: {{ doc.content }} URL: {{ doc.meta['url'] }} \n
{% endfor %}
Question: {{query}}
\nAs stated in
"""
prompt_builder = PromptBuilder(prompt_template)

# Query-side components.
# NOTE(review): the query embedder is built with default settings; it must
# produce vectors compatible with the ones stored at indexing time - confirm
# both embedders use the same model.
query_embedder = SentenceTransformersTextEmbedder()
# Only the 2 best-matching chunks are passed to the prompt.
retriever = InMemoryEmbeddingRetriever(document_store=document_store, top_k=2)
# No API key is passed here.
# NOTE(review): presumably the generator picks up OPENAI_API_KEY from the
# environment - confirm for the installed Haystack version.
llm = GPTGenerator(model='gpt-4')

  instance = super().__call__(*args, **kwargs)


In [9]:
# Query pipeline: embed the question, retrieve matching chunks, build the
# prompt, and generate the answer.
query_components = {
    "query_embedder": query_embedder,
    "retriever": retriever,
    "prompt_builder": prompt_builder,
    "llm": llm,
}

# (sender, receiver) pairs; the last pair relies on Haystack matching the
# sockets implicitly.
query_connections = [
    ("query_embedder.embedding", "retriever.query_embedding"),
    ("retriever.documents", "prompt_builder.documents"),
    ("prompt_builder", "llm"),
]

pipeline = Pipeline()
for component_name, component in query_components.items():
    pipeline.add_component(instance=component, name=component_name)

for sender, receiver in query_connections:
    pipeline.connect(sender, receiver)

# The original cell ended on a connect() call, which displayed the pipeline;
# echo it explicitly so the cell output stays the same.
pipeline

<haystack.pipeline.Pipeline at 0x7f807e0ce290>

In [11]:
question = "Write a custom input connector to a datasource using the input API for reference. Use the simple examples provided in the documentation to guide you."

# The same text goes to the embedder (for retrieval) and to the prompt
# builder (for the `query` template variable).
question_inputs = {
    "query_embedder": {"text": question},
    "prompt_builder": {"query": question},
}
result = pipeline.run(data=question_inputs)

# Show only the first generated reply.
first_reply = result["llm"]["replies"][0]
print(first_reply)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

https://bytewax.io/apidocs/bytewax.inputs and https://bytewax.io/apidocs/bytewax.connectors/index, you would need to subclass from the relevant Source class to implement a custom connector. Let's create a simple converter that pulls data from a hypothetical REST API. 

We will use the `SimplePollingSource` since this case involves periodic data retrieval from an external system: 

```python
import requests
from datetime import timedelta
from bytewax.connectors import SimplePollingSource

class RestApiSource(SimplePollingSource):
    def __init__(self, url):
        super().__init__(timedelta(seconds=10))  # 10-second interval
        self.url = url

    def next_item(self):
        res = requests.get(self.url)
        if not res.ok:
            raise SimplePollingSource.Retry(timedelta(seconds=1))

        return res.json()  # Assuming the API returns a JSON response
```
In the above example, the `RestApiSource` class is a custom connector that retrieves data from a provided REST API `

In [13]:
# Adjacent literals are joined into one string. The leading spaces in the
# second literal reproduce the original text exactly.
question2 = (
    "Operators can be stateful (they keep track of what they have seen) or stateless (they have no concept of what they have seen). "
    "    Choose one of the stateful bytewax operators (windows, aggregations, joins, etc.) and describe how it works and when you would use it."
)

# The same text goes to the embedder (for retrieval) and to the prompt
# builder (for the `query` template variable).
question2_inputs = {
    "query_embedder": {"text": question2},
    "prompt_builder": {"query": question2},
}
result2 = pipeline.run(data=question2_inputs)

# Show only the first generated reply.
second_reply = result2["llm"]["replies"][0]
print(second_reply)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

https://bytewax.io/apidocs/bytewax.operators/index, one of the stateful Bytewax operators is the "join" operator. 

The join operator is used to combine two data streams based on a certain criterion - often an id or some other shared characteristic. Its purpose is to correlate data from different sources that share a common set of identifiers.

In Python's Bytewax, there isn't a direct equivalent of the "join" operator, but you can achieve similar functionality using the concept of branching with the "branch" operator along with the "merge" operator.

Consider a scenario where you have two streams of data, representing the orders made by customers, and customer details, respectively. If each order includes the customer_id, you can use the "branch" operator to separate the orders based on customer_id. Then, use the "merge" operator to combine the two branches of a particular customer_id, effectively creating a joined stream that includes both the customer's details and their orders.

Th