In [1]:
import pandas as pd
import json
from kafka import KafkaProducer
import time

In [2]:
# Kafka topic that the transaction messages are published to.
KAFKA_TOPIC = "transactions"
# Bootstrap server address handed to KafkaProducer when it is created.
KAFKA_BOOTSTRAP_SERVERS = "kafka:9092"

In [3]:
def stream_csv_to_kafka(csv_path: str, delay=0.5):
    """Replay the rows of a CSV file as JSON messages on the Kafka topic.

    The CSV is loaded fully into memory, then each row is converted into a
    message dict and sent to ``KAFKA_TOPIC``, keyed by the sending bank id.
    The function pauses ``delay`` seconds after every record to simulate a
    live stream.

    The producer is always closed before the function returns or raises, so
    an interrupted run (e.g. stopping the cell with KeyboardInterrupt) does
    not leak the producer.

    Args:
        csv_path: Path of the CSV file to read. It must contain the columns
            "From Bank", "From Account", "To Bank", "To Account", "txn_id",
            "Amount Received", "Receiving Currency", "Amount Paid",
            "Payment Currency", "Payment Format", "ts" and "date_str".
        delay: Seconds to sleep after each sent record.

    Raises:
        KeyError: If one of the required columns is missing from a row.
        Any exception raised while reading the CSV, building a message or
        sending it is propagated after the producer has been closed.
    """
    df = pd.read_csv(csv_path)
    print(df.columns)

    # Configure the Kafka producer: values are sent as UTF-8 encoded JSON,
    # keys as the UTF-8 encoded string form of the key.
    producer = KafkaProducer(
        bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS,
        value_serializer=lambda v: json.dumps(v).encode("utf-8"),
        key_serializer=lambda k: str(k).encode("utf-8")
    )

    try:
        print(f"📤 Bắt đầu gửi {len(df)} dòng vào Kafka topic '{KAFKA_TOPIC}'...")

        for _, row in df.iterrows():
            # Explicit int()/float() casts keep the numeric fields as plain
            # Python numbers so json.dumps can serialize them.
            message = {
                "from_bank": int(row["From Bank"]),
                "from_acc": row["From Account"],
                "to_bank": int(row["To Bank"]),
                "to_acc": row["To Account"],
                "txn_id": int(row["txn_id"]),
                "amount_received": float(row["Amount Received"]),
                "receiving_currency": row["Receiving Currency"],
                "amount_paid": float(row["Amount Paid"]),
                "payment_currency": row["Payment Currency"],
                "payment_format": row["Payment Format"],
                "ts": row["ts"],
                "run_date": row["date_str"]
            }

            producer.send(
                KAFKA_TOPIC,
                key=message["from_bank"],
                value=message
            )
            print(f"✅ Đã gửi txn_id={message['txn_id']}")

            # Simulate streaming by pausing `delay` seconds after each record.
            time.sleep(delay)

        producer.flush()
        print("🎉 Gửi xong tất cả message!")
    finally:
        # Release the producer even when the loop is interrupted or fails;
        # previously an exception skipped the flush and leaked the producer.
        producer.close()


In [4]:
# Stream one day's transaction file to Kafka, pausing 10 seconds between records.
# NOTE(review): the captured output reports 66473 rows; at delay=10 the sleeps
# alone add up to roughly 7.7 days — confirm this pacing is intended.
csv_path = "./2022-09-05.csv"
stream_csv_to_kafka(csv_path, delay=10)

Index(['Timestamp', 'From Bank', 'From Account', 'To Bank', 'To Account',
       'Amount Received', 'Receiving Currency', 'Amount Paid',
       'Payment Currency', 'Payment Format', 'Is Laundering', 'ts', 'txn_id',
       'Bank', 'Account', 'year', 'month', 'day', 'date_str'],
      dtype='object')
📤 Bắt đầu gửi 66473 dòng vào Kafka topic 'transactions'...
✅ Đã gửi txn_id=1


KeyboardInterrupt: 

In [None]:
    # Split the rows by the "Is Laundering" flag: rows equal to 1 and rows equal to 0.
    # NOTE(review): `df` is not defined at notebook scope in the cells visible here
    # (it only exists as a local inside stream_csv_to_kafka) — confirm where it is
    # supposed to come from before running this cell.
    laundering_df = df[df["Is Laundering"] == 1]
    nonlaundering_df = df[df["Is Laundering"] == 0]