# Kafka Producer - Event Replay

Replay training ratings to Kafka for streaming processing.

In [None]:
import os
import json
import time
import threading
from pathlib import Path
from dataclasses import dataclass, asdict

import pandas as pd
from kafka import KafkaProducer, KafkaAdminClient
from kafka.admin import NewTopic
from kafka.errors import NoBrokersAvailable, TopicAlreadyExistsError
from tqdm import tqdm

In [None]:
# Broker list normally comes from the Docker environment; the literal default
# below is only a failsafe for when the variable is not set.
KAFKA_SERVERS = os.environ.get(
    "KAFKA_BOOTSTRAP_SERVERS",
    "kafka-broker-1:19092,kafka-broker-2:19092,kafka-broker-3:19092",
)
TOPIC = "movielens-ratings"
DATA_PATH = Path("../data/processed/train_ratings_session_ordered.csv")

# Replay settings
EVENTS_PER_SEC = 10_000  # adjust based on cluster capacity

# Echo the effective configuration, one setting per line.
print(
    f"Kafka: {KAFKA_SERVERS}",
    f"Topic: {TOPIC}",
    f"Data: {DATA_PATH}",
    sep="\n",
)

## Setup Kafka

In [None]:
def wait_for_kafka(servers, retries=30, delay=2):
    """Poll until a Kafka broker accepts an admin-client connection.

    Args:
        servers: Bootstrap server string passed to ``KafkaAdminClient``.
        retries: Maximum number of connection attempts.
        delay: Seconds to sleep between failed attempts.

    Returns:
        True as soon as one attempt connects; False if every attempt raised
        ``NoBrokersAvailable`` (or if ``retries`` is 0).
    """
    for attempt in range(1, retries + 1):
        try:
            admin = KafkaAdminClient(bootstrap_servers=servers)
        except NoBrokersAvailable:
            print(f"Waiting... ({attempt}/{retries})")
            # Sleeping after the final attempt would only delay the failure.
            if attempt < retries:
                time.sleep(delay)
        else:
            # The connection was only a liveness probe; release it right away.
            admin.close()
            print(f"Kafka ready (attempt {attempt})")
            return True
    return False

def create_topic(servers, topic, partitions=8):
    """Create ``topic`` on the cluster, tolerating the case where it exists.

    The replication factor is 2 when ``servers`` lists more than one broker
    (i.e. contains a comma) and 1 otherwise.

    Args:
        servers: Bootstrap server string passed to ``KafkaAdminClient``.
        topic: Name of the topic to create.
        partitions: Number of partitions for the new topic.
    """
    rf = 2 if "," in servers else 1
    admin = KafkaAdminClient(bootstrap_servers=servers)
    try:
        admin.create_topics([NewTopic(topic, partitions, rf)])
        print(f"Created topic: {topic} ({partitions} partitions, rf={rf})")
    except TopicAlreadyExistsError:
        print(f"Topic exists: {topic}")
    finally:
        # Close on every path; previously the client leaked whenever
        # create_topics raised (including the "already exists" case).
        admin.close()

# Topic creation only makes sense once a broker has answered.
if not wait_for_kafka(KAFKA_SERVERS):
    print("Kafka not available.")
else:
    create_topic(KAFKA_SERVERS, TOPIC)

## Event Schema

In [None]:
@dataclass
class RatingEvent:
    """A single rating record in the shape that is published to Kafka."""

    user_id: int
    movie_id: int
    rating: float
    # Value of the source row's 'timestamp' column.
    original_timestamp: int
    # Wall-clock time, in milliseconds, at which the event object was built.
    event_timestamp: int
    # "evt-" followed by the row index zero-padded to 10 digits.
    event_id: str

    def to_json(self):
        """Serialize all fields to a UTF-8 encoded JSON object."""
        payload = asdict(self)
        text = json.dumps(payload)
        return text.encode('utf-8')

    @classmethod
    def from_row(cls, row, idx):
        """Build an event from a mapping-like ``row`` and its index ``idx``.

        ``row`` must provide 'userId', 'movieId', 'rating' and 'timestamp';
        each value is coerced to the corresponding field type.
        """
        uid = int(row['userId'])
        mid = int(row['movieId'])
        score = float(row['rating'])
        rated_at = int(row['timestamp'])
        now_ms = int(time.time() * 1000)
        return cls(uid, mid, score, rated_at, now_ms, f"evt-{idx:010d}")

## Rate-Limited Producer

In [None]:
class RateLimitedProducer:
    """Kafka producer wrapper that throttles sends with a token bucket.

    The bucket starts with ``rate`` tokens, refills at ``rate`` tokens per
    second and holds at most ``2 * rate`` tokens. Each ``send`` consumes one.

    Attributes:
        sent: Number of events handed to the underlying producer.
        errors: Number of failed sends, counting both exceptions raised while
            dispatching and delivery failures reported later by the producer.
        start_time: ``time.time()`` value set by the caller when the run
            starts; ``stats`` measures elapsed time from it.
    """

    def __init__(self, servers, topic, rate):
        """Connect to ``servers`` and prepare a bucket of ``rate`` events/sec.

        Raises:
            ValueError: If ``rate`` is not positive (the bucket would never
                refill and ``send`` would block forever).
        """
        if rate <= 0:
            raise ValueError(f"rate must be positive, got {rate!r}")
        self.producer = KafkaProducer(
            bootstrap_servers=servers,
            key_serializer=self._serialize_key,
            acks='all',
            batch_size=16384,
            linger_ms=10
        )
        self.topic = topic
        self.rate = rate
        self.tokens = rate
        # Monotonic clock: wall-clock adjustments must not distort the refill.
        self.last_refill = time.monotonic()
        self.lock = threading.Lock()
        self.sent = 0
        self.errors = 0
        self.start_time = None

    @staticmethod
    def _serialize_key(k):
        """Encode a message key as UTF-8 text; only ``None`` stays ``None``.

        Testing ``is None`` rather than truthiness keeps falsy keys such as
        ``0`` from being sent as a null key.
        """
        return None if k is None else str(k).encode('utf-8')

    def _record_error(self, exc=None):
        """Count one failed send; also used as the producer's errback."""
        with self.lock:
            self.errors += 1

    def _wait_for_token(self):
        """Block until one token is available, then consume it."""
        while True:
            with self.lock:
                now = time.monotonic()
                self.tokens += (now - self.last_refill) * self.rate
                self.tokens = min(self.tokens, self.rate * 2)
                self.last_refill = now
                if self.tokens >= 1:
                    self.tokens -= 1
                    return
                # Time until the missing fraction of a token has refilled.
                deficit = (1 - self.tokens) / self.rate
            # Sleep outside the lock; the 1 ms floor avoids spinning at high
            # rates, and at low rates one sleep covers the whole wait.
            time.sleep(max(deficit, 0.001))

    def send(self, event):
        """Publish ``event`` keyed by its ``user_id``, honoring the rate limit.

        Failures never propagate: they are counted in ``errors`` instead.
        """
        self._wait_for_token()
        try:
            future = self.producer.send(
                self.topic, key=event.user_id, value=event.to_json()
            )
        except Exception:
            self._record_error()
            return
        # producer.send is asynchronous; delivery failures only surface
        # through the returned future, so count them there as well.
        future.add_errback(self._record_error)
        with self.lock:
            self.sent += 1

    def flush(self):
        """Block until all buffered messages have been sent."""
        self.producer.flush()

    def close(self):
        """Flush pending messages and close the underlying producer."""
        try:
            self.flush()
        finally:
            # Release the connection even if the flush failed.
            self.producer.close()

    def stats(self):
        """Return a dict with 'sent', 'elapsed', 'rate' and 'errors'.

        'elapsed' is seconds since ``start_time`` (0 if it was never set) and
        'rate' is ``sent / elapsed`` (0 when no time has elapsed).
        """
        elapsed = time.time() - self.start_time if self.start_time else 0
        return {
            'sent': self.sent,
            'elapsed': elapsed,
            'rate': self.sent / elapsed if elapsed > 0 else 0,
            'errors': self.errors
        }

## Run Replay

In [None]:
def replay_events(data_path, servers, topic, rate, max_events=None):
    """Replay ratings from a CSV file to a Kafka topic at a limited rate.

    Args:
        data_path: CSV file with 'userId', 'movieId', 'rating' and
            'timestamp' columns.
        servers: Kafka bootstrap server string.
        topic: Topic to publish to.
        rate: Target number of events per second.
        max_events: Optional non-negative cap on the number of rows to
            replay; ``None`` replays the whole file.

    Returns:
        The producer's final stats dict ('sent', 'elapsed', 'rate', 'errors').
    """
    # load data
    print(f"Loading {data_path}...")
    # nrows=None reads everything. Passing the cap to the reader avoids
    # loading the whole file, and treats max_events=0 as "zero rows" rather
    # than "no limit".
    df = pd.read_csv(data_path, nrows=max_events)
    total = len(df)
    print(f"Loaded {total:,} ratings")

    # estimate time
    est_mins = total / rate / 60
    print(f"Target rate: {rate:,}/sec, estimated time: {est_mins:.1f} min")

    # replay
    producer = RateLimitedProducer(servers, topic, rate)
    producer.start_time = time.time()

    try:
        with tqdm(total=total, desc="Replaying", unit="events") as pbar:
            for attempted, (idx, row) in enumerate(df.iterrows(), start=1):
                event = RatingEvent.from_row(row, idx)
                producer.send(event)
                pbar.update(1)

                # Key the refresh on attempts, not producer.sent: a stalled
                # sent counter (failing sends) would otherwise refresh the
                # postfix on every iteration.
                if attempted % 50000 == 0:
                    s = producer.stats()
                    pbar.set_postfix({'rate': f"{s['rate']:.0f}/s"})
    finally:
        try:
            # Flush first so the reported elapsed time includes delivery.
            producer.flush()
            stats = producer.stats()
        finally:
            # Always release the producer, even if the flush failed.
            producer.close()

    print("REPLAY DONE")
    print("-"*50)
    print(f"Events: {stats['sent']:,}")
    print(f"Time: {stats['elapsed']:.1f}s")
    print(f"Rate: {stats['rate']:.1f}/sec")
    print(f"Errors: {stats['errors']}")
    return stats

In [None]:
# Full replay: max_events is left at its default, so every row is published.
stats = replay_events(DATA_PATH, KAFKA_SERVERS, TOPIC, EVENTS_PER_SEC)