In [1]:
!pip install bytewax

Collecting bytewax
  Downloading bytewax-0.18.2-cp310-cp310-macosx_10_12_x86_64.whl (6.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.1/6.1 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hCollecting jsonpickle>=3 (from bytewax)
  Downloading jsonpickle-3.0.3-py3-none-any.whl (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jsonpickle, bytewax
Successfully installed bytewax-0.18.2 jsonpickle-3.0.3


In [19]:
import csv
from datetime import timedelta
from bytewax.inputs import SimplePollingSource

class CSVSource(SimplePollingSource):
    def __init__(self, filename):
        super().__init__(interval=timedelta(seconds=10))
        self.filename = filename

    def next_item(self):
        with open(self.filename, 'r') as file:
            reader = csv.reader(file)
            try:
                return next(reader)
            except StopIteration:
                raise SimplePollingSource.Retry(timedelta(seconds=1))



In [22]:
import pandas as pd

df = pd.read_csv("data.csv", encoding='latin1')

df.head(10).to_csv("data_10.csv", index=False)

df2 = pd.DataFrame()
df2.to_csv("data_empty.csv", index=False)

In [None]:
# Usage
source = CSVSource('data_empty.csv')
while True:
    print(source.next_item())

In [2]:
import requests
from datetime import timedelta
from bytewax.inputs import SimplePollingSource

class RestApiSource(SimplePollingSource):
    def __init__(self, url):
        super().__init__(timedelta(seconds=10))  # 10-second interval
        self.url = url

    def next_item(self):
        res = requests.get(self.url)
        if not res.ok:
            raise SimplePollingSource.Retry(timedelta(seconds=1))

        return res.json()

In [None]:
RestApiSource('https://api.github.com/search/repositories?q=llm+monitoring').next_item()

In [1]:
import requests
import logging
from datetime import timedelta
from bytewax.inputs import SimplePollingSource

class RestApiSource(SimplePollingSource):
    def __init__(self, url, poll_interval_seconds=10, retry_delay_seconds=1):
        super().__init__(timedelta(seconds=poll_interval_seconds))
        self.url = url
        self.retry_delay = timedelta(seconds=retry_delay_seconds)
        logging.basicConfig(level=logging.INFO)

    def next_item(self):
        try:
            res = requests.get(self.url)
            if res.ok:
                return res.json()
            else:
                logging.error(f"Failed to fetch data: {res.status_code} {res.text}")
                raise SimplePollingSource.Retry(self.retry_delay)
        except requests.RequestException as e:
            logging.error(f"Request failed: {e}")
            raise SimplePollingSource.Retry(self.retry_delay)


In [2]:
RestApiSource('https://api.github.com/search/reposiories?q=llm+monitoring').next_item()

ERROR:root:Failed to fetch data: 404 {"message":"Not Found","documentation_url":"https://docs.github.com/rest"}


Retry: 0:00:01

In [38]:
from bytewax.operators import join_named, map, input, inspect
from bytewax.testing import run_main, TestingSource
from bytewax.dataflow import Dataflow

flow = Dataflow("join_eg")

# Define sources of data - user ids, names, and emails
names_l = [
    {"user_id": 123, "name": "Bee"},
    {"user_id": 456, "name": "Hive"},
]

emails_l = [
    {"user_id": 123, "email": "bee@bytewax.io"},
    {"user_id": 456, "email": "hive@bytewax.io"},
    {"user_id": 789, "email": "queen@bytewax.io"},  
]

# Input streams
names = input("names", flow, TestingSource(names_l))
emails = input("emails", flow, TestingSource(emails_l))

# Convert each item into a (key, value) pair with string keys
keyed_names = map("key_names", names, lambda x: (str(x["user_id"]), {"name": x["name"]}))
keyed_emails = map("key_emails", emails, lambda x: (str(x["user_id"]), {"email": x["email"]}))

# Use the join_named operator with named streams
joined = join_named("join_names_emails", names=keyed_names, emails=keyed_emails)

# Inspect the joined data
inspect("inspect_joined", joined, lambda step_id, x: print(f"Joined Data: {x}"))

run_main(flow)


Joined Data: ('123', {'names': {'name': 'Bee'}, 'emails': {'email': 'bee@bytewax.io'}})
Joined Data: ('456', {'names': {'name': 'Hive'}, 'emails': {'email': 'hive@bytewax.io'}})


In [30]:
import bytewax.operators as op
from bytewax.testing import TestingSource, run_main
from bytewax.dataflow import Dataflow

# Define a dataflow
flow = Dataflow("count_final_eg")

# Define a source of data
inp = ["apple", "banana", "apple", "orange", "banana", "banana"]
s = op.input("inp", flow, TestingSource(inp))

# Define a function to convert each item into a string key
key_func = lambda x: x

# Use the count_final operator
s = op.count_final("count", s, key_func)

run_main(flow)

thread '<unnamed>' panicked at src/run.rs:116:17:
Box<dyn Any>



ValueError: (src/worker.rs:157:10) error building production dataflow
Caused by => ValueError: (src/worker.rs:531:24) Dataflow needs to contain at least one output or inspect step; add with `bytewax.operators.output` or `bytewax.operators.inspect`

In [43]:
import numpy as np
import random

from datetime import datetime, timedelta, timezone

from bytewax.dataflow import Dataflow
from bytewax.connectors.stdio import StdOutput

from bytewax.window import (
    EventClockConfig,
    SlidingWindow,
)

from bytewax.inputs import DynamicInput, StatelessSource

align_to = datetime(2023, 1, 1, tzinfo=timezone.utc)


class RandomNumpyData(StatelessSource):
    def __init__(self):
        self._it = enumerate(range(100))

    def next(self):
        i, item = next(self._it)
        if i % 5 == 0:
            return ("data", np.nan)
        else:
            return ("data", random.randint(0, 10))


class RandomNumpyInput(DynamicInput):
    def build(self, _worker_index, _worker_count):
        return RandomNumpyData()

ImportError: cannot import name 'StdOutput' from 'bytewax.connectors.stdio' (/Users/macpro/anaconda3/envs/llm-env/lib/python3.10/site-packages/bytewax/connectors/stdio.py)