In [8]:
import json
import uuid

from kafka import KafkaProducer, KafkaAdminClient
from kafka.admin.new_topic import NewTopic
from kafka.errors import TopicAlreadyExistsError

### Configuration Parameters 

> **TODO:** Change the configuration parameters to the appropriate values for your setup.

In [9]:
# Connection settings plus the name parts used to derive per-user identifiers
config = {
    'bootstrap_servers': ['kafka.kafka.svc.cluster.local:9092'],
    'first_name': 'Scott',
    'last_name': 'Breitbach',
}

# client_id and topic_prefix are both "<last_name><first_name>"
config.update(
    client_id=config['last_name'] + config['first_name'],
    topic_prefix=config['last_name'] + config['first_name'],
)

config

{'bootstrap_servers': ['kafka.kafka.svc.cluster.local:9092'],
 'first_name': 'Scott',
 'last_name': 'Breitbach',
 'client_id': 'BreitbachScott',
 'topic_prefix': 'BreitbachScott'}

### Create Topic Utility Function

The `create_kafka_topic` function creates a Kafka topic based on your configuration settings.  For instance, if your first name is *John* and your last name is *Doe*, `create_kafka_topic('locations')` will create a topic with the name `DoeJohn-locations`.  The function will not create the topic if it already exists. 

In [1]:
# Load libraries
import pyarrow.parquet as pq
from collections import namedtuple
# Root directory of the processed BDD parquet datasets
src_data_path = '/home/jovyan/dsc650/data/processed/bdd/'

# Read the accelerations dataset and materialize it as a pandas dataframe
accel_df = (
    pq.ParquetDataset(src_data_path + 'accelerations/')
    .read_pandas()
    .to_pandas()
)

# Column order to use, with the sort key first
accel_cols = [
    'offset', 'ride_id', 'uuid', 'timestamp',
    'x', 'y', 'z',
    'timelapse', 'filename', 't',
]

# Select the columns in that order, then sort the rows by offset
accel_df = accel_df[accel_cols].sort_values(by=['offset'])

In [2]:
accel_df.head(1)

Unnamed: 0,offset,ride_id,uuid,timestamp,x,y,z,timelapse,filename,t
0,0.822061,c9a2b46c9aa515b632eddc45c4868482,19b9aa10588646b3bf22c9b4865a7995,1970-01-01 00:25:03.882586,-0.994,0.045,-0.036,False,e2f795a7-6a7d-4500-b5d7-4569de996811.mov,0.0


In [18]:
# accel_df

In [6]:
from heapq import heappush, heappop

In [7]:
def heapsort(iterable):
    """Return the items of ``iterable`` as a new list in ascending order.

    Every item is pushed onto a min-heap and the heap is then drained,
    so the smallest remaining item is appended at each step.
    """
    heap = []
    for item in iterable:
        heappush(heap, item)

    ordered = []
    while heap:
        ordered.append(heappop(heap))
    return ordered

heapsort([1, 3, 5, 7, 9, 2, 4, 6, 8, 0])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [8]:
# Record type with one field per dataframe column, in accel_cols order
Accelerations = namedtuple('Accelerations', accel_cols)

# Build one Accelerations tuple for every dataframe row (index excluded)
records = [Accelerations._make(row) for row in accel_df.to_records(index=False)]

In [10]:
records[0]

Accelerations(offset=0.8220608865228429, ride_id='c9a2b46c9aa515b632eddc45c4868482', uuid='19b9aa10588646b3bf22c9b4865a7995', timestamp=numpy.datetime64('1970-01-01T00:25:03.882586000'), x=-0.994, y=0.045, z=-0.036000000000000004, timelapse=False, filename='e2f795a7-6a7d-4500-b5d7-4569de996811.mov', t='000.0')

In [11]:
# Push every record onto a min-heap. Accelerations is a namedtuple whose first
# field is 'offset', and tuples compare field by field, so popping the heap
# yields records in ascending offset order (ties fall through to later fields).
h = []
for i in records:
    heappush(h, i)

In [12]:
heappop(h)

Accelerations(offset=0.8220608865228429, ride_id='c9a2b46c9aa515b632eddc45c4868482', uuid='19b9aa10588646b3bf22c9b4865a7995', timestamp=numpy.datetime64('1970-01-01T00:25:03.882586000'), x=-0.994, y=0.045, z=-0.036000000000000004, timelapse=False, filename='e2f795a7-6a7d-4500-b5d7-4569de996811.mov', t='000.0')

In [13]:
heappop(h)

Accelerations(offset=0.8420608865228429, ride_id='c9a2b46c9aa515b632eddc45c4868482', uuid='19b9aa10588646b3bf22c9b4865a7995', timestamp=numpy.datetime64('1970-01-01T00:25:03.882586000'), x=-0.998, y=0.046, z=-0.04, timelapse=False, filename='e2f795a7-6a7d-4500-b5d7-4569de996811.mov', t='000.0')

In [14]:
heappop(h)

Accelerations(offset=0.862060886522843, ride_id='c9a2b46c9aa515b632eddc45c4868482', uuid='19b9aa10588646b3bf22c9b4865a7995', timestamp=numpy.datetime64('1970-01-01T00:25:03.882586000'), x=-0.999, y=0.047, z=-0.036000000000000004, timelapse=False, filename='e2f795a7-6a7d-4500-b5d7-4569de996811.mov', t='000.0')

In [16]:
heappop(h).offset

0.882060886522843

In [22]:
import time

In [19]:
seconds = time.time()
print('seconds since epoch =', seconds)

seconds since epoch = 1651878285.4538746


In [24]:
seconds = time.time()
print('seconds since epoch =', seconds)

seconds since epoch = 1651878385.0535262


In [25]:
# seconds passed since epoch
# seconds = 1545925769.9618232
local_time = time.ctime(seconds)
print("Local time:", local_time)

Local time: Fri May  6 23:06:25 2022


In [26]:
print("This is printed immediately.")
time.sleep(2.4)
print("This is printed after 2.4 seconds.")

This is printed immediately.
This is printed after 2.4 seconds.


In [28]:
t = [1, 2, 2, 3]

In [30]:
for i in t:
    time.sleep(i)
    print(i)

1
2
2
3


In [33]:
time_s = accel_df['offset'].unique()
time_s

array([  0.82206089,   0.84206089,   0.86206089, ..., 122.45512141,
       122.46532815, 122.46989596])

In [35]:
test = time_s[0]
test

0.8220608865228429

In [39]:
len(accel_df)

23512

In [38]:
len(time_s)

23512

In [37]:
accel_df[accel_df['offset'] == test]

Unnamed: 0,offset,ride_id,uuid,timestamp,x,y,z,timelapse,filename,t
0,0.822061,c9a2b46c9aa515b632eddc45c4868482,19b9aa10588646b3bf22c9b4865a7995,1970-01-01 00:25:03.882586,-0.994,0.045,-0.036,False,e2f795a7-6a7d-4500-b5d7-4569de996811.mov,0.0


In [40]:
##NOPE
# time_start = time.time()
# for i in t:
#     if (time.time()-time_start) == i:
#         print(i)

In [None]:
##YEP
# import time

# test = [1, 1, 4, 5, 8]
# time_start = time.time()
# for i in test:
#     while (time.time() - time_start < i):
#         pass
#     print(i)

In [41]:
import threading as th

In [50]:
def sctn():  
   """Print "test"; used as the callback of the Timer created below."""
   print("test")  
# threading.Timer calls sctn on a separate thread about 1.0 s after start(),
# so "done" is printed first and "test" follows, as the recorded output shows.
S = th.Timer(1.0, sctn)  
S.start()  
print("done")

done
test


In [58]:
##Timers  
##Execute code at timed intervals  
##Imports and Displays  
import time  
from threading import Timer  
def display(msg):
    """Print ``msg`` followed by a space and the current local time as HH:MM:SS."""
    stamp = time.strftime('%H:%M:%S')
    print(msg + ' ' + stamp)
  
##Basic timer  
def run_once():
    """Print a 'run_once:' line now and schedule a 'Timeout:' line one second later."""
    display('run_once:')
    # start() launches the timer thread, which calls display('Timeout:') after 1 s
    Timer(1, display, ['Timeout:']).start()
run_once()  
##Runs immediately and once  
print('Waiting.....')  
  
##Lets make our timer run in intervals  
##Put it into a class  
##Making it run until we stop it  
##Just getting crazy.Notice We have multiple timers at once!  
class RepeatTimer(Timer):
    """Timer that keeps calling its function every ``interval`` seconds until cancelled."""

    def run(self):
        """Call the function after each interval until the ``finished`` event is set."""
        while True:
            # wait() returns True once cancel() has set the event; stop repeating then
            if self.finished.wait(self.interval):
                break
            self.function(*self.args, **self.kwargs)
            print(' ')
## Create a repeating timer that calls display('Repeating') every 0.02 s
timer = RepeatTimer(.02,display,['Repeating'])  
timer.start()  # start the timer thread, which executes RepeatTimer.run
print('Threading started')  
time.sleep(1)  # block the main thread for 1 s while the timer keeps firing
print('Threading finishing')  
timer.cancel()  # sets the finished event, which ends the loop in run()

run_once: 00:05:36
Waiting.....
Threading started
Repeating 00:05:36
 
Repeating 00:05:36
 
Repeating 00:05:36
 
Repeating 00:05:36
 
Repeating 00:05:36
 
Repeating 00:05:36
 
Repeating 00:05:36
 
Repeating 00:05:36
 
Repeating 00:05:36
 
Repeating 00:05:36
 
Repeating 00:05:36
 
Repeating 00:05:36
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repeating 00:05:37
 
Repea

In [54]:
accel_df.offset[1] - accel_df.offset[0]

0.020000000000000018

In [55]:
accel_df.offset[2] - accel_df.offset[1]

0.020000000000000018

In [57]:
accel_df.offset[23502] - accel_df.offset[23501]

0.01999999999999602

In [59]:
len(accel_df)

23512

In [62]:
(accel_df.offset[23511] - accel_df.offset[0]) / len(accel_df)

0.005173182456561101

In [64]:
accel_df.shape

(23512, 10)

In [65]:
accel_df.loc[0]

offset                                       0.822061
ride_id              c9a2b46c9aa515b632eddc45c4868482
uuid                 19b9aa10588646b3bf22c9b4865a7995
timestamp                  1970-01-01 00:25:03.882586
x                                              -0.994
y                                               0.045
z                                              -0.036
timelapse                                       False
filename     e2f795a7-6a7d-4500-b5d7-4569de996811.mov
t                                               000.0
Name: 0, dtype: object

In [68]:
test = accel_df.loc[0].to_dict()
test

{'offset': 0.8220608865228429,
 'ride_id': 'c9a2b46c9aa515b632eddc45c4868482',
 'uuid': '19b9aa10588646b3bf22c9b4865a7995',
 'timestamp': Timestamp('1970-01-01 00:25:03.882586'),
 'x': -0.994,
 'y': 0.045,
 'z': -0.036000000000000004,
 'timelapse': False,
 'filename': 'e2f795a7-6a7d-4500-b5d7-4569de996811.mov',
 't': '000.0'}

In [69]:
test['offset']

0.8220608865228429

In [70]:
t

[1, 2, 2, 3]

In [72]:
# Print each value of t once that many seconds have elapsed since time_start.
time_start = time.time()
for i in t:
    # Sleep until the deadline instead of spinning on an empty loop, which
    # would keep a CPU core fully busy for the whole wait. The while guard
    # re-checks the clock in case the sleep returned before the deadline.
    while (time.time() - time_start) < i:
        time.sleep(max(0.0, i - (time.time() - time_start)))
    print(i)

1
2
2
3


In [73]:
# Read the locations dataset and materialize it as a pandas dataframe
locat_df = (
    pq.ParquetDataset(src_data_path + 'locations/')
    .read_pandas()
    .to_pandas()
)

In [74]:
# Column order to use, with the sort key first
locat_cols = [
    'offset', 'id', 'ride_id', 'uuid', 'timestamp',
    'course', 'latitude', 'longitude', 'geohash',
    'speed', 'accuracy',
    'timelapse', 'filename', 't',
]

# Select the columns in that order, then sort the rows by offset.
# The sort keeps the original index labels, so use .iloc for positional access.
locat_df = locat_df[locat_cols].sort_values(by=['offset'])

In [83]:
locat_df.offset[4]

8.525060886522843

In [85]:
locat_df.offset.iloc[4]

8.077912529556645

In [77]:
test = locat_df.head()
test

Unnamed: 0,offset,id,ride_id,uuid,timestamp,course,latitude,longitude,geohash,speed,accuracy,timelapse,filename,t
1,1.077913,85c61911b7fe2ced1000c33c9e932706,6760ffa3f41908695d1405b776c3e8d5,dad7eae44e784b549c8c5a3aa051a8c7,1970-01-01 00:25:07.320453,158.203125,40.677641,-73.81793,dr5x2jpkmtcy,2.12,10.0,False,d745b92f-aefd-467d-9121-7a71308e8d6d.mov,0.0
0,1.525061,58682c5d48cad9d9e103431d773615bf,c9a2b46c9aa515b632eddc45c4868482,19b9aa10588646b3bf22c9b4865a7995,1970-01-01 00:25:03.882586,299.619141,40.76287,-73.961949,dr5ruuwscttz,0.0,10.0,False,e2f795a7-6a7d-4500-b5d7-4569de996811.mov,0.0
2,4.525061,58682c5d48cad9d9e103431d773615bf,c9a2b46c9aa515b632eddc45c4868482,19b9aa10588646b3bf22c9b4865a7995,1970-01-01 00:25:03.882583,299.619141,40.76287,-73.961949,dr5ruuwsctv3,0.0,10.0,False,e2f795a7-6a7d-4500-b5d7-4569de996811.mov,4.5
3,5.077913,85c61911b7fe2ced1000c33c9e932706,6760ffa3f41908695d1405b776c3e8d5,dad7eae44e784b549c8c5a3aa051a8c7,1970-01-01 00:25:07.320449,159.960938,40.677883,-73.818047,dr5x2jpmfffw,11.75,10.0,False,d745b92f-aefd-467d-9121-7a71308e8d6d.mov,4.5
5,8.077913,85c61911b7fe2ced1000c33c9e932706,6760ffa3f41908695d1405b776c3e8d5,dad7eae44e784b549c8c5a3aa051a8c7,1970-01-01 00:25:07.320446,159.609375,40.678191,-73.818193,dr5x2jppxkqj,13.15,10.0,False,d745b92f-aefd-467d-9121-7a71308e8d6d.mov,7.8


In [79]:
for i in range(len(test)):
    print(i)

0
1
2
3
4


In [86]:
test['offset'].iloc[4]

8.077912529556645

In [88]:
# Replay the offsets in test: print each one once that many seconds have
# elapsed since time_start.
time_start = time.time()
for i in range(len(test)):
    j = test['offset'].iloc[i]
    # Sleep until the deadline instead of spinning on an empty loop, which
    # would keep a CPU core fully busy for the whole wait. The while guard
    # re-checks the clock in case the sleep returned before the deadline.
    while (time.time() - time_start) < j:
        time.sleep(max(0.0, j - (time.time() - time_start)))
    print(j)

1.0779125295566454
1.525060886522843
4.5250608865228426
5.077912529556645
8.077912529556645


In [90]:
test = accel_df.loc[0].to_dict()
test

{'offset': 0.8220608865228429,
 'ride_id': 'c9a2b46c9aa515b632eddc45c4868482',
 'uuid': '19b9aa10588646b3bf22c9b4865a7995',
 'timestamp': Timestamp('1970-01-01 00:25:03.882586'),
 'x': -0.994,
 'y': 0.045,
 'z': -0.036000000000000004,
 'timelapse': False,
 'filename': 'e2f795a7-6a7d-4500-b5d7-4569de996811.mov',
 't': '000.0'}

In [3]:
def create_kafka_topic(topic_name, config=config, num_partitions=1, replication_factor=1):
    """Create the Kafka topic ``<topic_prefix>-<topic_name>`` unless it already exists.

    Args:
        topic_name: Suffix of the topic; the configured prefix is prepended.
        config: Mapping providing 'bootstrap_servers', 'client_id' and
            'topic_prefix'.
        num_partitions: Number of partitions for the new topic.
        replication_factor: Replication factor for the new topic.

    Prints whether the topic was created or already existed. Any other
    error from the admin client propagates to the caller.
    """
    bootstrap_servers = config['bootstrap_servers']
    client_id = config['client_id']
    topic_prefix = config['topic_prefix']
    name = '{}-{}'.format(topic_prefix, topic_name)

    admin_client = KafkaAdminClient(
        bootstrap_servers=bootstrap_servers,
        client_id=client_id
    )

    try:
        topic = NewTopic(
            name=name,
            num_partitions=num_partitions,
            replication_factor=replication_factor
        )
        admin_client.create_topics(new_topics=[topic])
        print('Created topic "{}"'.format(name))
    except TopicAlreadyExistsError:
        print('Topic "{}" already exists'.format(name))
    finally:
        # Always release the broker connection; without this every call
        # leaves an open admin client behind.
        admin_client.close()
    
create_kafka_topic('locations')

Topic "DoeJohn-locations" already exists


### Kafka Producer

The following code creates a `KafkaProducer` object which you can use to send Python objects that are serialized as JSON.

**Note:** This producer serializes Python objects as JSON. This means that every object you send must be JSON serializable.  As an example, Python `datetime` values are not JSON serializable and must be converted to a string (e.g. ISO 8601) or a numeric value (e.g. a Unix timestamp) before being sent.

In [4]:
producer = KafkaProducer(
  bootstrap_servers=config['bootstrap_servers'],
  value_serializer=lambda x: json.dumps(x).encode('utf-8')
)

### Send Data Function

The `send_data` function sends a Python object to a Kafka topic. This function adds the `topic_prefix` to the topic so `send_data('locations', data)` sends a JSON serialized message to `DoeJohn-locations`. The function also registers callbacks to let you know if the message has been sent or if an error has occurred. 

In [5]:
def on_send_success(record_metadata):
    """Print the topic, partition and offset reported for a delivered message."""
    report = 'Message sent:\n    Topic: "{}"\n    Partition: {}\n    Offset: {}'.format(
        record_metadata.topic,
        record_metadata.partition,
        record_metadata.offset,
    )
    print(report)
    
def on_send_error(excp):
    """Errback for failed sends: print the exception that caused the failure.

    ``print`` has no ``exc_info`` keyword (that is a ``logging`` argument),
    so passing it raised TypeError inside the errback and hid the real send
    error. The exception is formatted into the message instead.
    """
    print('I am an errback: {!r}'.format(excp))
    # handle exception

def send_data(topic, data, config=config, producer=producer, msg_key=None):
    """Send ``data`` to the topic ``<topic_prefix>-<topic>``.

    Args:
        topic: Topic suffix; config['topic_prefix'] is prepended.
        data: Value handed to the producer as the message value.
        config: Mapping providing 'topic_prefix'.
        producer: Object whose ``send`` method is used to publish the message.
        msg_key: Optional string key; a random UUID hex string is used
            when it is None.

    The success and error callbacks are attached to the object returned
    by ``producer.send``.
    """
    topic_name = '{}-{}'.format(config['topic_prefix'], topic)

    # Fall back to a random hex key when the caller does not supply one
    key = uuid.uuid4().hex if msg_key is None else msg_key

    future = producer.send(topic_name, value=data, key=key.encode('utf-8'))
    future.add_callback(on_send_success).add_errback(on_send_error)

In [7]:
example_data = dict(
    key1='value1',
    key2='value2'
)

send_data('locations', example_data)

Message sent:
    Topic: "DoeJohn-locations"
    Partition: 0
    Offset: 1467
