# Simulator for Demonstrating Handling Late Data and Watermarking

This notebook lets you experience how late arrivals and output modes interact in windowed aggregation queries.

**NOTE**: Run the early part of this notebook first!

In [1]:
!pip install confluent-kafka==1.7.0

Collecting confluent-kafka==1.7.0
  Downloading confluent_kafka-1.7.0-cp38-cp38-manylinux2010_x86_64.whl (2.7 MB)
[K     |████████████████████████████████| 2.7 MB 3.9 MB/s eta 0:00:01
[?25hInstalling collected packages: confluent-kafka
Successfully installed confluent-kafka-1.7.0


In [2]:
from confluent_kafka.admin import AdminClient, NewTopic, NewPartitions
from confluent_kafka import KafkaException
import sys
from uuid import uuid4

In [3]:
bootstrap_server = "kafka:9092" # Brokers act as cluster entry points

In [4]:
# Minimal client configuration: only the broker address to bootstrap from.
conf = {'bootstrap.servers': bootstrap_server}

In [5]:
# Admin client; used below to list topics and, in the last cell, to delete the demo topic.
a = AdminClient(conf)

In [6]:
# Fetch the cluster metadata and print one line per topic.
md = a.list_topics(timeout=10)
print(" {} topics:".format(len(md.topics)))
for t in md.topics.values():
    # Append the per-topic error, if there is one, after the partition count.
    errstr = "" if t.error is None else ": {}".format(t.error)
    print("  \"{}\" with {} partition(s){}".format(t, len(t.partitions), errstr))

 1 topics:
  "_schemas" with 1 partition(s)


In [7]:
from confluent_kafka import SerializingProducer
from confluent_kafka.serialization import *
import time

# Name of the Kafka topic that the cells below produce to and the last cell deletes.
topic = "words"

def delivery_report(err, msg):
    """Delivery callback: print the outcome of a single produce request.

    Args:
        err: The delivery error, or None when the record was delivered.
        msg: The message the report refers to; on success its topic(),
            partition() and offset() are printed.
    """
    if err is None:
        print("Produced record to topic {} partition [{}] @ offset {}"
              .format(msg.topic(), msg.partition(), msg.offset()))
        return
    print("Failed to deliver message: {}".format(err))

In [8]:
# Producer settings: same broker as the admin client, with both the record
# key and the record value serialized as UTF-8 strings.
producer_conf = {
    'bootstrap.servers': bootstrap_server,
    'key.serializer': StringSerializer('utf_8'),
    'value.serializer': StringSerializer('utf_8'),
}

# Producer shared by all the "send" cells below.
producer = SerializingProducer(producer_conf)

## 1) send 12:07-12:08

In [9]:
import json
from random import gauss
from IPython.display import clear_output

# Records are produced without a message key; later cells reuse this `key`.
key = None

# Step 1: two records with timestamps 12:07 and 12:08, sent in timestamp order.
# Looping over the records replaces the copy-pasted produce/print/poll triple
# and avoids echoing poll()'s return value as the cell output.
for value in (
    {"word": "dog", "ts": "2024-03-24T12:07:00"},
    {"word": "owl", "ts": "2024-03-24T12:08:00"},
):
    producer.produce(topic=topic, value=json.dumps(value), key=key, on_delivery=delivery_report)
    print(value)
    # Serve the delivery callback so its report is printed right after the record.
    producer.poll(1)

{'word': 'dog', 'ts': '2024-03-24T12:07:00'}
Produced record to topic words partition [0] @ offset 0
{'word': 'owl', 'ts': '2024-03-24T12:08:00'}
Produced record to topic words partition [0] @ offset 1


1

## 2) send 12:14 and late arrival 12:09

In [10]:
# Step 2: 12:14 is sent first, then 12:09, whose timestamp is older than a
# record already sent (the late arrival).
# Looping over the records replaces the copy-pasted produce/print/poll triple
# and avoids echoing poll()'s return value as the cell output.
for value in (
    {"word": "dog", "ts": "2024-03-24T12:14:00"},
    {"word": "cat", "ts": "2024-03-24T12:09:00"},
):
    producer.produce(topic=topic, value=json.dumps(value), key=key, on_delivery=delivery_report)
    print(value)
    # Serve the delivery callback so its report is printed right after the record.
    producer.poll(1)

{'word': 'dog', 'ts': '2024-03-24T12:14:00'}
Produced record to topic words partition [0] @ offset 2
{'word': 'cat', 'ts': '2024-03-24T12:09:00'}
Produced record to topic words partition [0] @ offset 3


1

## 3) send 12:15 and late arrivals 12:08 and 12:13

In [11]:
key = None

# Step 3: 12:15 is sent first, followed by two records with older timestamps
# (12:08 and 12:13), i.e. the late arrivals.
# Looping over the records replaces the copy-pasted produce/print/poll triple
# and avoids echoing poll()'s return value as the cell output.
for value in (
    {"word": "cat", "ts": "2024-03-24T12:15:00"},
    {"word": "dog", "ts": "2024-03-24T12:08:00"},
    {"word": "owl", "ts": "2024-03-24T12:13:00"},
):
    producer.produce(topic=topic, value=json.dumps(value), key=key, on_delivery=delivery_report)
    print(value)
    # Serve the delivery callback so its report is printed right after the record.
    producer.poll(1)

{'word': 'cat', 'ts': '2024-03-24T12:15:00'}
Produced record to topic words partition [0] @ offset 4
{'word': 'dog', 'ts': '2024-03-24T12:08:00'}
Produced record to topic words partition [0] @ offset 5
{'word': 'owl', 'ts': '2024-03-24T12:13:00'}
Produced record to topic words partition [0] @ offset 6


1

## 4) send 12:26, a late arrival 12:17, and a too-late arrival 12:04

In [12]:
# Step 4: 12:26 is sent first, followed by two records with older timestamps:
# 12:17 and 12:04 (the one the section heading calls too late).
# Looping over the records replaces the copy-pasted produce/print/poll triple
# and avoids echoing poll()'s return value as the cell output.
for value in (
    {"word": "owl", "ts": "2024-03-24T12:26:00"},
    {"word": "owl", "ts": "2024-03-24T12:17:00"},
    {"word": "donkey", "ts": "2024-03-24T12:04:00"},
):
    producer.produce(topic=topic, value=json.dumps(value), key=key, on_delivery=delivery_report)
    print(value)
    # Serve the delivery callback so its report is printed right after the record.
    producer.poll(1)

{'word': 'owl', 'ts': '2024-03-24T12:26:00'}
Produced record to topic words partition [0] @ offset 7
{'word': 'owl', 'ts': '2024-03-24T12:17:00'}
Produced record to topic words partition [0] @ offset 8
{'word': 'donkey', 'ts': '2024-03-24T12:04:00'}
Produced record to topic words partition [0] @ offset 9


1

Remember to clean up!

In [None]:
# delete topic
# delete_topics() hands back a mapping of topic name -> future; wait on each one.
deletions = a.delete_topics([topic], operation_timeout=30)
for name, future in deletions.items():
    try:
        future.result()  # Nothing useful is returned; a failure surfaces as an exception
        print("Topic {} deleted".format(name))
    except Exception as exc:
        print("Failed to delete topic {}: {}".format(name, exc))