In [1]:
import os

import pandas as pd
from confluent_kafka import SerializingProducer
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka.schema_registry.avro import AvroSerializer
from confluent_kafka.serialization import StringSerializer

In [2]:
# Read the orders CSV into a pandas DataFrame.
df = pd.read_csv('olist_orders_dataset.csv')

# Show the first rows and the per-column dtypes to check its structure.
print(df.head(), df.dtypes, sep='\n')

                           order_id                       customer_id  \
0  e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
1  53cdb2fc8bc7dce0b6741e2150273451  b0830fb4747a6c6d20dea0b8c802d7ef   
2  47770eb9100c2d0c44946d9cf07ec65d  41ce2a54c0b03bf3443c3d931a367089   
3  949d5b44dbf5de918fe9c16f97b45f8a  f88197465ea7920adcdbec7375364d82   
4  ad21c59c0840e6cb83a9ceb5573f8159  8ab97904e6daea8866dbdbc4fb7aad2c   

  order_status order_purchase_timestamp order_approved_at  \
0    delivered         02-10-2017 10:56  02-10-2017 11:07   
1    delivered         24-07-2018 20:41  26-07-2018 03:24   
2    delivered         08-08-2018 08:38  08-08-2018 08:55   
3    delivered         18-11-2017 19:28  18-11-2017 19:45   
4    delivered         13-02-2018 21:18  13-02-2018 22:20   

  order_delivered_carrier_date order_delivered_customer_date  \
0             04-10-2017 19:55              10-10-2017 21:25   
1             26-07-2018 14:31              07-08-2018 15:27   
2 

In [3]:
def delivery_report(err, msg):
    """
    Print the outcome of a message delivery attempt.

    Args:
        err (KafkaError): The delivery error, or None when delivery succeeded.

        msg (Message): The message that was produced or failed.

    Note:
        Inside this callback Message.key() and Message.value() are the binary
        payloads produced by the configured serializers, not the objects that
        were originally passed to produce(). To access the original objects
        here, bind them into the callback (for example with a lambda).

    """
    record_key = msg.key()
    if err is None:
        print('Record {} successfully produced to {} [{}] at offset {}'.format(
            record_key, msg.topic(), msg.partition(), msg.offset()))
    else:
        print("Delivery failed for record {}: {}".format(record_key, err))

In [4]:
# Define Kafka configuration.
#
# Credentials are read from the environment so that no secret is stored in the
# notebook. Set KAFKA_API_KEY and KAFKA_API_SECRET before running this cell; a
# missing variable raises KeyError naming it, so the notebook stops here rather
# than at the first produce() call.
# NOTE(review): the key/secret that used to be hardcoded in this cell were
# committed in plain text and should be rotated.
kafka_config = {
    # The broker address is not a secret; it can be overridden per environment.
    'bootstrap.servers': os.environ.get(
        'KAFKA_BOOTSTRAP_SERVERS',
        'pkc-41p56.asia-south1.gcp.confluent.cloud:9092',
    ),
    'sasl.mechanisms': 'PLAIN',
    'security.protocol': 'SASL_SSL',
    'sasl.username': os.environ['KAFKA_API_KEY'],
    'sasl.password': os.environ['KAFKA_API_SECRET'],
}

In [5]:
# Create a Schema Registry client.
#
# The API key and secret are read from the environment so that no secret is
# stored in the notebook. Set SCHEMA_REGISTRY_API_KEY and
# SCHEMA_REGISTRY_API_SECRET before running this cell; a missing variable
# raises KeyError naming it.
# NOTE(review): the key/secret that used to be hardcoded in this cell were
# committed in plain text and should be rotated.
schema_registry_client = SchemaRegistryClient({
    # The registry URL is not a secret; it can be overridden per environment.
    'url': os.environ.get(
        'SCHEMA_REGISTRY_URL',
        'https://psrc-gk071.us-east-2.aws.confluent.cloud',
    ),
    # Basic-auth credentials in "<key>:<secret>" form.
    'basic.auth.user.info': '{}:{}'.format(
        os.environ['SCHEMA_REGISTRY_API_KEY'],
        os.environ['SCHEMA_REGISTRY_API_SECRET'],
    ),
})


In [6]:
# Look up the most recent registered version of the value schema for this
# subject and keep its Avro definition as a string.
subject_name = 'E-commerce_orders-value'
latest_registered = schema_registry_client.get_latest_version(subject_name)
schema_str = latest_registered.schema.schema_str
print(schema_str)

{"type":"record","name":"Ecommerce","namespace":"com.kaggle.onlineretail","fields":[{"name":"order_id","type":"string"},{"name":"customer_id","type":"string"},{"name":"order_status","type":"string"},{"name":"order_purchase_timestamp","type":["long","int","string"]},{"name":"order_approved_at","type":["null","long","string"],"default":null},{"name":"order_delivered_carrier_date","type":["null","long","string"],"default":null},{"name":"order_delivered_customer_date","type":["null","long","string"],"default":null},{"name":"order_estimated_delivery_date","type":["null","long","string"],"default":null}]}


In [7]:
# Build the serializers: message keys are encoded as UTF-8 strings, message
# values as Avro using the schema fetched from the registry.
key_serializer = StringSerializer(codec='utf_8')
avro_serializer = AvroSerializer(
    schema_registry_client=schema_registry_client,
    schema_str=schema_str,
)

In [8]:
# Define the SerializingProducer.
# Copy the connection settings from kafka_config, then attach the serializers.
connection_keys = (
    'bootstrap.servers',
    'security.protocol',
    'sasl.mechanisms',
    'sasl.username',
    'sasl.password',
)
producer_config = {name: kafka_config[name] for name in connection_keys}
producer_config['key.serializer'] = key_serializer  # Key will be serialized as a string
producer_config['value.serializer'] = avro_serializer  # Value will be serialized as Avro
producer = SerializingProducer(producer_config)

In [9]:
# Publish the first MAX_RECORDS orders from the DataFrame to Kafka.
TOPIC = 'E-commerce_orders'
MAX_RECORDS = 50

# Keys of messages whose delivery failed, collected by the callback below.
failed_keys = []


def _report_and_track(err, msg):
    """Record failed deliveries, then delegate to delivery_report for logging."""
    if err is not None:
        failed_keys.append(msg.key())
    delivery_report(err, msg)


# Number of records handed to the producer.
count = 0

# Slice up front so only the rows that will be sent are iterated, instead of
# walking the whole DataFrame and skipping everything after the limit.
for _, row in df.head(MAX_RECORDS).iterrows():
    key = f"{row['customer_id']}_{row['order_id']}"

    # Missing CSV cells arrive as NaN; send them as None so they are
    # serialized as null rather than as a float NaN.
    value = {column: (None if pd.isna(cell) else cell) for column, cell in row.items()}

    producer.produce(topic=TOPIC, key=key, value=value, on_delivery=_report_and_track)
    # Serve any pending delivery callbacks without blocking; a flush() per
    # message would force a full round trip for every record.
    producer.poll(0)
    count += 1

# Block once until every queued message has been delivered or has failed.
producer.flush()

# Only claim success when no delivery callback reported an error.
if failed_keys:
    print(f"{len(failed_keys)} of {count} records failed delivery")
else:
    print("Data successfully published to Kafka")

Record b'9ef432eb6251297304e76186b10a928d_e481f51cbdc54678b7cc49136f2d6af7' successfully produced to E-commerce_orders [1] at offset 72
Record b'b0830fb4747a6c6d20dea0b8c802d7ef_53cdb2fc8bc7dce0b6741e2150273451' successfully produced to E-commerce_orders [1] at offset 73
Record b'41ce2a54c0b03bf3443c3d931a367089_47770eb9100c2d0c44946d9cf07ec65d' successfully produced to E-commerce_orders [4] at offset 96
Record b'f88197465ea7920adcdbec7375364d82_949d5b44dbf5de918fe9c16f97b45f8a' successfully produced to E-commerce_orders [2] at offset 104
Record b'8ab97904e6daea8866dbdbc4fb7aad2c_ad21c59c0840e6cb83a9ceb5573f8159' successfully produced to E-commerce_orders [4] at offset 97
Record b'503740e9ca751ccdda7ba28e9ab8f608_a4591c265e18cb1dcee52889e2d8acc3' successfully produced to E-commerce_orders [1] at offset 74
Record b'ed0271e0b7da060a393796590e7b737a_136cce7faa42fdb2cefd53fdc79a6098' successfully produced to E-commerce_orders [3] at offset 56
Record b'9bdf08b4b3b52b5526ff42d37d47f222_6514b