In [4]:
from kafka import KafkaProducer
import pandas as pd
import json
import time

# Configuration: destination Kafka topic and the source Parquet file.
TOPIC = 'trip-data'
PARQUET_PATH = '/home/user/Project_DE/ETL/yellow_tripdata_2023-01.parquet'


def _serialize_value(payload):
    """Encode a message payload as UTF-8 JSON bytes."""
    return json.dumps(payload).encode('utf-8')


# Create the Kafka producer, registering _serialize_value as its
# value_serializer so message values can be passed as plain dicts.
producer = KafkaProducer(
    value_serializer=_serialize_value,
    bootstrap_servers='localhost:9092',
)

# Read the whole Parquet file into memory with pandas.
df = pd.read_parquet(PARQUET_PATH)

# Optionally keep only the columns you need, for example:
# df = df[["VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "total_amount"]]

# Pause between consecutive messages, in seconds; lower it to send faster.
SEND_DELAY_SECONDS = 0.5


def _to_json_safe(record):
    """Return a copy of *record* whose values json.dumps can encode.

    pandas Timestamps become ISO-8601 strings and missing values
    (NaN / NaT / None) become None; every other value is kept as-is.
    A new dict is built so the input is never modified while iterating.
    """
    safe = {}
    for key, value in record.items():
        if isinstance(value, pd.Timestamp):
            safe[key] = value.isoformat()
        elif pd.isna(value):
            safe[key] = None
        else:
            safe[key] = value
    return safe


def _report_send_error(exc):
    """Print a delivery failure reported by the producer."""
    print(f"Send failed: {exc!r}")


# Walk the DataFrame row by row and send each row to Kafka.
try:
    for _, row in df.iterrows():
        message = _to_json_safe(row.to_dict())

        # send() returns a future; attach an errback so that delivery
        # failures are reported instead of being silently dropped.
        producer.send(TOPIC, message).add_errback(_report_send_error)
        print(f"Sent: {message}")
        time.sleep(SEND_DELAY_SECONDS)
finally:
    # Flush even when the loop is interrupted (e.g. KeyboardInterrupt),
    # so messages that were already handed to the producer still go out.
    producer.flush()




Sent: {'VendorID': 2, 'tpep_pickup_datetime': '2023-01-01T00:32:10', 'tpep_dropoff_datetime': '2023-01-01T00:40:36', 'passenger_count': 1.0, 'trip_distance': 0.97, 'RatecodeID': 1.0, 'store_and_fwd_flag': 'N', 'PULocationID': 161, 'DOLocationID': 141, 'payment_type': 2, 'fare_amount': 9.3, 'extra': 1.0, 'mta_tax': 0.5, 'tip_amount': 0.0, 'tolls_amount': 0.0, 'improvement_surcharge': 1.0, 'total_amount': 14.3, 'congestion_surcharge': 2.5, 'airport_fee': 0.0}
Sent: {'VendorID': 2, 'tpep_pickup_datetime': '2023-01-01T00:55:08', 'tpep_dropoff_datetime': '2023-01-01T01:01:27', 'passenger_count': 1.0, 'trip_distance': 1.1, 'RatecodeID': 1.0, 'store_and_fwd_flag': 'N', 'PULocationID': 43, 'DOLocationID': 237, 'payment_type': 1, 'fare_amount': 7.9, 'extra': 1.0, 'mta_tax': 0.5, 'tip_amount': 4.0, 'tolls_amount': 0.0, 'improvement_surcharge': 1.0, 'total_amount': 16.9, 'congestion_surcharge': 2.5, 'airport_fee': 0.0}
Sent: {'VendorID': 2, 'tpep_pickup_datetime': '2023-01-01T00:25:04', 'tpep_dro

KeyboardInterrupt: 