# 1. Producing the data  
In this task, we will implement Apache Kafka producers to simulate real-time data streaming. Spark and parallel data processing should not be used in this section, as we are simulating sensors that often lack processing capabilities.  

1.	Every 5 seconds, load 5 days of weather data from the CSV file. We refer to this as weather5s to explain the tasks; feel free to use your own variable name. You should keep a pointer in the file reading process and advance it per read. The data reading should be in chronological order.
2.	Add the current timestamp (weather_ts) to weather5s and spread the batch evenly across the 5 seconds, so that each day's records receive a timestamp one second later than the previous day's. Since the weather data consists of hourly readings, each day has 24 records (120 records in total for 5 days).
For example, assume you send the records at 2025-01-26 00:00:00 (ISO format: YYYY-MM-DD HH:MM:SS) -> (ts = 1737810000):  
Day 1 (records 1-24): ts = 1737810000  
Day 2 (records 25-48): ts = 1737810001  
Day 3 (records 49-72): ts = 1737810002  
…
3.	Send your batch of weather data to a Kafka topic with an appropriate name.




In [9]:
# import statements
from time import sleep
from json import dumps
from kafka3 import KafkaProducer
import random
from datetime import datetime
import csv
import io

# Configuration shared by the producer code below.
hostip = "10.192.89.180"  # Kafka broker host; change to your machine's IP address

# Name of the Kafka topic the weather batches are published to.
topic = 'A2B'


def publish_message(producer_instance, topic_name, key, value):
    """Send one UTF-8 encoded key/value message and flush the producer.

    Args:
        producer_instance: Object exposing ``send(topic, key=..., value=...)``
            and ``flush()`` (a Kafka producer).
        topic_name: Name of the topic to publish to.
        key: Message key as a ``str``.
        value: Message payload as a ``str``.

    Any exception raised while encoding, sending or flushing is caught and
    printed instead of being propagated, so one failed message does not stop
    the caller's loop.
    """
    try:
        key_bytes = bytes(key, encoding='utf-8')
        value_bytes = bytes(value, encoding='utf-8')
        producer_instance.send(topic_name, key=key_bytes, value=value_bytes)
        producer_instance.flush()
    except Exception as ex:
        print('Exception in publishing message.')
        print(str(ex))
    else:
        # Report the payload that was actually sent.  The success message used
        # to reference an undefined name, which raised NameError and made every
        # successful send look like a failure.
        print('Message published successfully. Data: ' + str(value))
        
def connect_kafka_producer():
    """Create a Kafka producer connected to ``hostip`` on port 9092.

    Returns:
        The ``KafkaProducer`` instance, or ``None`` when the connection attempt
        raised an exception (the error is printed).
    """
    # Explicit returns instead of ``return`` inside ``finally``: a return in a
    # ``finally`` block silently discards any in-flight exception, including
    # KeyboardInterrupt and SystemExit.
    try:
        return KafkaProducer(bootstrap_servers=[f'{hostip}:9092'],
                             api_version=(0, 10))
    except Exception as ex:
        print('Exception while connecting Kafka.')
        print(str(ex))
        return None
    

def read_csv_in_chunks(filepath, chunk_size):
    """Yield a CSV file's rows in consecutive, timestamped chunks.

    The file is read lazily and in file order; the reader position advances
    with every chunk, so each row is returned exactly once.

    Args:
        filepath: Path of the UTF-8 encoded CSV file; its first row is treated
            as the header.
        chunk_size: Maximum number of data rows per chunk.

    Yields:
        ``(headers, chunk)`` tuples. ``headers`` is the header row as read
        from the file (it does not name the appended timestamp column).
        ``chunk`` is a list of up to ``chunk_size`` rows, each extended with
        one extra field: the ISO-format local time at which reading of that
        chunk started. All rows of a chunk share the same timestamp. The last
        chunk may be shorter; nothing is yielded for a file with no data rows.

    NOTE(review): the task description asks for per-day timestamps one second
    apart within a batch; this function stamps the whole chunk identically.
    Confirm with the consumer before changing the timestamp format.
    """
    with open(filepath, newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        # Use a default here: a bare next() on an empty file would raise
        # StopIteration inside the generator, which Python turns into
        # RuntimeError (PEP 479).
        headers = next(reader, None)
        if headers is None:
            return
        while True:
            chunk = []
            now = datetime.now().isoformat()  # same timestamp for all rows in the chunk
            for _ in range(chunk_size):
                row = next(reader, None)
                if row is None:  # end of file
                    break
                row.append(now)
                chunk.append(row)
            if not chunk:
                break
            yield headers, chunk
            
def rows_to_csv_string(rows):
    """Serialise an iterable of rows to CSV text.

    The rows are written with the default ``csv.writer`` dialect into an
    in-memory buffer; leading and trailing whitespace (including the final
    line terminator) is stripped from the result before it is returned.
    """
    with io.StringIO() as sink:
        csv.writer(sink).writerows(rows)
        text = sink.getvalue()
    return text.strip()

if __name__ == '__main__':
    # Stream the weather file to Kafka: one 120-row batch (5 days of hourly
    # readings) every 5 seconds.
    print('Publishing records..')
    producer = connect_kafka_producer()
    if producer is None:
        # connect_kafka_producer() has already printed the connection error;
        # looping over the file without a producer would only fail per batch.
        print('No Kafka producer available; nothing was published.')
    else:
        try:
            for headers, rows in read_csv_in_chunks("data/weather.csv", chunk_size=120):
                # Serialise once and reuse the same text for logging and sending.
                payload = rows_to_csv_string(rows)
                print("5 days:")
                print(payload)
                publish_message(producer, topic, 'parsed', payload)
                sleep(5)
        finally:
            # Release the broker connection even when the loop is interrupted.
            producer.close(timeout=5)


Publishing records..
5 days:
0,2022-01-01 22:00:00.000,26.7,,18.3,1016.9,230.0,3.1,2025-10-07T08:19:42.551147
0,2022-01-01 23:00:00.000,25.6,,18.3,1017.5,230.0,3.1,2025-10-07T08:19:42.551147
0,2022-01-02 00:00:00.000,24.4,6.0,18.9,1018.1,270.0,2.6,2025-10-07T08:19:42.551147
0,2022-01-02 01:00:00.000,23.9,4.0,18.3,1018.5,300.0,2.1,2025-10-07T08:19:42.551147
0,2022-01-02 02:00:00.000,22.2,,19.4,,360.0,5.7,2025-10-07T08:19:42.551147
0,2022-01-02 03:00:00.000,21.1,,18.9,1019.5,20.0,5.1,2025-10-07T08:19:42.551147
0,2022-01-02 04:00:00.000,20.6,,17.8,1019.4,30.0,4.6,2025-10-07T08:19:42.551147
0,2022-01-02 05:00:00.000,19.4,4.0,17.2,1019.3,20.0,2.6,2025-10-07T08:19:42.551147
0,2022-01-02 06:00:00.000,18.9,6.0,17.2,1019.0,10.0,2.1,2025-10-07T08:19:42.551147
0,2022-01-02 07:00:00.000,18.9,,17.2,1018.4,10.0,2.6,2025-10-07T08:19:42.551147
0,2022-01-02 08:00:00.000,18.9,,16.7,1018.5,360.0,3.6,2025-10-07T08:19:42.551147
0,2022-01-02 09:00:00.000,18.3,,16.7,1018.1,10.0,3.1,2025-10-07T08:1

KeyboardInterrupt: 

In [None]:
# GPT
import csv

def read_csv_in_chunks(filepath, chunk_size):
    """Yield a CSV file's rows in consecutive chunks.

    The file is read lazily and in file order, so each row is returned
    exactly once.

    Args:
        filepath: Path of the UTF-8 encoded CSV file; its first row is treated
            as the header.
        chunk_size: Maximum number of data rows per chunk.

    Yields:
        ``(headers, chunk)`` tuples, where ``headers`` is the header row and
        ``chunk`` is a list of up to ``chunk_size`` rows. The last chunk may be
        shorter; nothing is yielded for a file with no data rows.
    """
    with open(filepath, newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        # Use a default here: a bare next() on an empty file would raise
        # StopIteration inside the generator, which Python turns into
        # RuntimeError (PEP 479).
        headers = next(reader, None)
        if headers is None:
            return
        while True:
            chunk = []
            for _ in range(chunk_size):
                row = next(reader, None)
                if row is None:  # end of file
                    break
                chunk.append(row)
            if not chunk:
                break
            yield headers, chunk
            
# Demo: walk the file in chunks of up to 1000 rows and report each chunk's size.
for headers, rows in read_csv_in_chunks("large_file.csv", chunk_size=1000):
    print(f"Read {len(rows)} rows")  # the final chunk may hold fewer than 1000 rows
    # process rows here



In [None]:
def read_csv_incrementally(filepath):
    """Lazily yield the rows of a UTF-8 CSV file, one dictionary per row.

    The first line of the file supplies the dictionary keys. The file stays
    open only while the generator is being consumed.
    """
    with open(filepath, newline='', encoding='utf-8') as source:
        # Delegate straight to the DictReader so rows are produced on demand.
        yield from csv.DictReader(source)

In [None]:
# Print every row of chocolate.csv as a list of strings.
# newline='' leaves newline handling to the csv module, as its documentation
# requires; without it, newlines embedded in quoted fields can be misread.
with open('chocolate.csv', newline='') as f:
    reader = csv.reader(f, delimiter=',')
    for row in reader:
        print(row)

# Read the same file again, this time addressing columns by header name.
with open('chocolate.csv', newline='') as f:
    dict_reader = csv.DictReader(f, delimiter=',')
    for row in dict_reader:
        print("The {} company is located in {}.".format(row['Company'], row['Company Location']))

In [None]:
def process_csv_incrementally(filepath):
    """
    Stream a CSV file row by row, printing the header and then each data row.

    The first row is consumed as the header and printed only when it is
    non-empty; every following row is printed as it is read, so the whole
    file is never held in memory.

    Args:
        filepath (str): The path to the CSV file.
    """
    with open(filepath, newline='') as source:
        rows = csv.reader(source)

        # Pull the first row off the reader; None signals an empty file.
        header = next(rows, None)
        if header:
            print(f"Header: {header}")

        # Everything left in the reader is a data row.
        for row in rows:
            print(f"Processing row: {row}")
            # Per-row work goes here, e.g. positional access:
            # name = row[0]
            # age = int(row[1])
            # ...