# Real-time stream processing with Kafka - Part 1.2
## Thanasak Harisombut

Date: 19/10/2020

Version: 1.0

Environment: Python 3.7.4 and Jupyter notebook


## 1.2 Memory Event Producer

In [1]:
# import statements
from time import sleep
from json import dumps
from kafka import KafkaProducer
import random
import datetime as dt
# import pandas as pd
import csv


def read_csv(fileName, machine_id):
    '''Read a CSV file and group its rows by the value of the 'machine' column.

    Args:
        fileName: Path of the CSV file to read. The file must have a header
            row that contains a 'machine' column.
        machine_id: Iterable of machine identifiers, written exactly as they
            appear in the 'machine' column (the csv module yields strings).

    Returns:
        A dict with one key per entry of ``machine_id``; each value is the
        list of rows (column name -> string value) for that machine, in file
        order. A machine without any row maps to an empty list.

    Raises:
        KeyError: If a row's 'machine' value is not listed in ``machine_id``
            or the file has no 'machine' column.
        OSError: If the file cannot be opened.
    '''
    data_by_machine = {mc: [] for mc in machine_id}

    # The context manager closes the file even when a row raises;
    # newline='' lets the csv module do its own line-ending handling.
    with open(fileName, newline='') as csv_file:
        for row in csv.DictReader(csv_file):
            data_by_machine[row['machine']].append(row)

    return data_by_machine


def publish_message(producer_instance, topic_name, data):
    '''Hand ``data`` to the producer for ``topic_name`` and echo it to stdout.

    Publishing is best-effort: any Exception raised while sending or echoing
    is reported on stdout instead of being propagated to the caller.

    Args:
        producer_instance: Object exposing ``send(topic, value)``.
        topic_name: Name of the topic the value is sent to.
        data: The value to send.
    '''
    try:
        send = producer_instance.send
        send(topic_name, data)
        # Echo the payload only once send() has accepted it
        print(data)
    except Exception as err:
        print('Exception in publishing message.')
        print(err)
        
def connect_kafka_producer():
    '''Create a KafkaProducer for the broker at localhost:9092.

    Each value is serialized with ``json.dumps`` and encoded as ASCII before
    it is sent.

    Returns:
        The KafkaProducer instance, or None when creating it raised an
        Exception (the error is printed to stdout).
    '''
    # No `return` inside a `finally` clause here: that would silently discard
    # KeyboardInterrupt/SystemExit raised while the producer is being created.
    try:
        return KafkaProducer(bootstrap_servers=['localhost:9092'],
                             value_serializer=lambda x: dumps(x).encode('ascii'),
                             api_version=(0, 10))
    except Exception as ex:
        print('Exception while connecting Kafka.')
        print(str(ex))
        return None

In [3]:
def _take_records(rows, start, count, ts):
    '''Return ``count`` timestamped copies of consecutive rows, wrapping around.

    Args:
        rows: List of row mappings for one machine.
        start: Index of the first row to take (0 <= start < len(rows)).
        count: Number of records to produce.
        ts: Value stored under the 'ts' key of every returned record.

    Returns:
        A ``(records, next_index)`` tuple, where ``next_index`` is the index
        the following call should start from. With no rows the result is
        ``([], start)``.
    '''
    if not rows:
        # Nothing to replay for this machine
        return [], start

    total = len(rows)
    # Copy each row: stamping 'ts' on the shared row would also rewrite
    # records already queued in another batch once the index wraps around.
    records = [dict(rows[(start + offset) % total], ts=ts)
               for offset in range(count)]
    return records, (start + count) % total


if __name__ == '__main__':

    # initialise variables and set the starting index for each machine
    topic = 'Streaming_Linux_memory'
    machine_id = ['4', '5', '6', '7', '8']
    cur_inx = {mc: 0 for mc in machine_id}

    # load the CSV file, grouped by machine
    data_by_machine = read_csv("Streaming_Linux_memory.csv", machine_id)

    print('Publishing records..')
    producer = connect_kafka_producer()
    if producer is None:
        # Without a producer every publish would fail, forever
        raise SystemExit('Kafka producer is not available.')

    # Y records are built at the end of one cycle and published, with the
    # timestamp of that cycle, together with the X records of the next one.
    y_data = []

    while True:
        # Unix timestamp in seconds. An aware datetime is required here:
        # utcnow() is naive, and timestamp() would read it as local time.
        ts = str(int(dt.datetime.now(dt.timezone.utc).timestamp()))

        # === building x_data: 20 to 80 records per machine ===
        x_data = []
        for mc in machine_id:
            x_interval = random.randint(20, 80)
            records, cur_inx[mc] = _take_records(
                data_by_machine[mc], cur_inx[mc], x_interval, ts)
            x_data.extend(records)

        # merge x_data with the y_data kept from the previous cycle
        selected_data = x_data + y_data

        # === publish data ===
        publish_message(producer, topic, selected_data)

        # === building y_data: 0 to 5 records per machine ===
        y_data = []
        for mc in machine_id:
            y_interval = random.randint(0, 5)
            records, cur_inx[mc] = _take_records(
                data_by_machine[mc], cur_inx[mc], y_interval, ts)
            y_data.extend(records)

        # wait for 10 seconds
        sleep(10)
        
   

Publishing records..
[{'sequence': '1', 'machine': '4', 'PID': '4392', 'MINFLT': '221', 'MAJFLT': '0', 'VSTEXT': '596', 'VSIZE': '184.3', 'RSIZE': '5668', 'VGROW': '184.3', 'RGROW': '5668', 'MEM': '0.0', 'CMD': 'apache2', 'ts': '1604188643'}, {'sequence': '2', 'machine': '4', 'PID': '4397', 'MINFLT': '221', 'MAJFLT': '0', 'VSTEXT': '596', 'VSIZE': '184.3', 'RSIZE': '5668', 'VGROW': '184.3', 'RGROW': '5668', 'MEM': '0.0', 'CMD': 'apache2', 'ts': '1604188643'}, {'sequence': '3', 'machine': '4', 'PID': '4398', 'MINFLT': '221', 'MAJFLT': '0', 'VSTEXT': '596', 'VSIZE': '184.3', 'RSIZE': '5668', 'VGROW': '184.3', 'RGROW': '5668', 'MEM': '0.0', 'CMD': 'apache2', 'ts': '1604188643'}, {'sequence': '4', 'machine': '4', 'PID': '4400', 'MINFLT': '221', 'MAJFLT': '0', 'VSTEXT': '596', 'VSIZE': '184.3', 'RSIZE': '5668', 'VGROW': '184.3', 'RGROW': '5668', 'MEM': '0.0', 'CMD': 'apache2', 'ts': '1604188643'}, {'sequence': '5', 'machine': '4', 'PID': '4401', 'MINFLT': '221', 'MAJFLT': '0', 'VSTEXT': '59

KeyboardInterrupt: 