In [14]:
%matplotlib inline

import datetime
import json
import math
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import operator
import os
import pandas as pd
import seaborn as sns

from IPython.display import display, HTML
from kafka import KafkaConsumer, KafkaProducer
from kafka.structs import TopicPartition
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint, TensorBoard
from keras.layers import Dense, Dropout, Input
from keras.layers.recurrent import LSTM
from keras.models import Model, Sequential
from keras.optimizers import Adam
from keras.preprocessing.sequence import TimeseriesGenerator
from numpy import hstack
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, PowerTransformer, QuantileTransformer
from keras.preprocessing.sequence import TimeseriesGenerator

# Switch every figure in this notebook to the 'ggplot' style sheet
# (prettier, and bigger, graphs).
plt.style.use('ggplot')

# Notebook-wide configuration, assembled section by section.
cfg = {'kafka': {}, 'model': {}}

# Kafka endpoints and topics; 'in' is what this notebook consumes from,
# no 'out' topic is configured.
cfg['kafka']['bootstrap_servers'] = {
    'in': ['127.0.0.1:9092'],
    'out': ['127.0.0.1:9092'],
}
cfg['kafka']['topics'] = {
    'in': 'mods-agg-10m',
    'out': None,
}

# Model-level switches.
cfg['model']['prediction_steps'] = 1
cfg['model']['transformation'] = 'none'
cfg['model']['differential'] = False

# Input frame layout ('index' / 'columns') and the list of output columns.
cfg['model']['data'] = {
    'in': {
        'index': ['ts'],
        'columns': ['conn_count_uid_in', 'ssh_count_uid_in'],
    },
    'out': ['conn_count_uid_in', 'ssh_count_uid_in'],
}

# Training setup; the 'tsg' keys presumably map onto TimeseriesGenerator
# arguments -- TODO confirm against the cell that builds the generator.
cfg['model']['train'] = {
    'split': 0.8,
    'tsg': {
        'length': 12,
        'sampling_rate': 1,
        'stride': 1,
        'batch_size': 1,
    },
}

#
# computes time window for time t; i.e., <begin, end)
#
def epoch(t, period):
    """Return the ``(beg, end)`` bounds of the time window that contains ``t``.

    ``beg`` is ``t`` rounded down component by component and ``end`` is
    ``beg + period``, so the window is half-open: ``beg <= t < end``.

    Rounding rules, applied to days, hours, minutes and seconds in turn:

    * a component whose counterpart in ``period`` is non-zero is reduced
      modulo that counterpart (the day component uses the day of month);
    * a component whose counterpart is zero is cleared entirely when any
      coarser component of ``period`` is non-zero, e.g. a one-hour period
      clears minutes and seconds, so 11:37:42 maps to 11:00:00;
    * microseconds of ``t`` are always cleared.

    Parameters:
        t: ``datetime.datetime`` (or compatible) instant to locate.
        period: ``datetime.timedelta`` window length; expected to be
            positive. Its microseconds part is not used for alignment.

    Returns:
        Tuple ``(beg, end)`` of the same type as ``t``.
    """
    days = period.days
    hours, remainder = divmod(period.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)

    def offset(value, step, coarser_set):
        # Amount to subtract from one component of ``t``.
        if step > 0:
            return value % step
        # No step for this unit: drop the component completely when the
        # period is expressed in a coarser unit, otherwise leave it alone.
        return value if coarser_set else 0

    beg = t - datetime.timedelta(
        days=t.day % days if days > 0 else 0,
        hours=offset(t.hour, hours, days > 0),
        minutes=offset(t.minute, minutes, days > 0 or hours > 0),
        # Seconds are always cleared when the period has no seconds part.
        seconds=offset(t.second, seconds, True),
        microseconds=t.microsecond,
    )
    return beg, beg + period

In [15]:
# Consumer subscribed to the configured input topic. Message values are
# UTF-8 encoded JSON and are decoded on receipt; consumer_timeout_ms=1000
# bounds how long iteration waits for new messages (see kafka-python docs).
consumer = KafkaConsumer(
    cfg['kafka']['topics']['in'],
    value_deserializer=lambda raw: json.loads(raw.decode('utf-8')),
    consumer_timeout_ms=1000,
    bootstrap_servers=cfg['kafka']['bootstrap_servers']['in'],
)

In [None]:
# The topic was subscribed through the KafkaConsumer constructor, so
# partitions are only assigned while polling. On a fresh kernel the
# assignment is still empty at this point and the rewind below would
# silently do nothing. Poll once to trigger the assignment; any records
# returned here are read again after the rewind.
if not consumer.assignment():
    consumer.poll(timeout_ms=1000)

# Rewind every assigned partition so the whole topic is replayed.
for tp in consumer.assignment() or ():
    consumer.seek_to_beginning(tp)

# Input columns and the columns listed as model output.
features = cfg['model']['data']['in']['columns']
features_predicted = cfg['model']['data']['out']

# Generator length, extended by the prediction horizon when the
# 'differential' switch is on.
context_length = cfg['model']['train']['tsg']['length'] + (
    cfg['model']['prediction_steps'] if cfg['model']['differential'] else 0
)

In [None]:
# Store incoming messages: merge every message into one time-indexed frame.
buffer = pd.DataFrame([])
stripped_beg = False
for message in consumer:
    df = pd.read_json(message.value, orient='index')
    # Skip empty payloads before touching the index column(s); set_index
    # would otherwise fail on a frame that has no columns at all.
    if df.empty:
        continue
    df = df.set_index(cfg['model']['data']['in']['index'])
    # Index values are interpreted as milliseconds since the Unix epoch.
    df.index = pd.to_datetime(df.index, unit='ms')
    # Values already buffered take precedence; df only fills gaps and adds
    # new rows/columns.
    buffer = buffer.combine_first(df)
    # Drop the very first buffered row once if it still contains missing
    # values (presumably an incomplete leading window -- TODO confirm).
    if not stripped_beg and len(buffer.index) > 1 and buffer.iloc[[0]].isnull().values.any():
        buffer = buffer[1:]
        stripped_beg = True

In [21]:
# Persist the collected frame as a tab-separated file, then render it.
buffer.to_csv(path_or_buf='buffer-all-1M.tsv', sep='\t')
display(buffer)

Unnamed: 0_level_0,conn_count_uid_in,conn_count_uid_internal,conn_count_uid_out,conn_mean_duration_in,conn_mean_duration_internal,conn_mean_duration_out,conn_nunique_uid_in,conn_nunique_uid_internal,conn_nunique_uid_out,conn_sum_orig_bytes_in,...,ssh_count_uid_out,ssh_nunique_uid_in,ssh_nunique_uid_internal,ssh_nunique_uid_out,ssl_count_uid_in,ssl_count_uid_internal,ssl_count_uid_out,ssl_nunique_uid_in,ssl_nunique_uid_internal,ssl_nunique_uid_out
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-03-25 11:00:00,14263.0,2785.0,2481.0,7.256077,0.295692,1.412546,14263.0,2785.0,2481.0,4.148155e+06,...,,789.0,,,520.0,,144.0,520.0,,144.0
2021-03-25 11:10:00,13306.0,2783.0,3052.0,435.538292,0.390303,0.978375,13306.0,2783.0,3052.0,2.566967e+09,...,,664.0,,,537.0,,271.0,537.0,,271.0
2021-03-25 11:20:00,13001.0,2655.0,2299.0,4.563632,0.294706,0.792574,13001.0,2655.0,2299.0,7.787662e+09,...,,573.0,,,528.0,29.0,108.0,528.0,29.0,108.0
2021-03-25 11:30:00,13192.0,2583.0,2124.0,11.170176,0.320075,2.065534,13192.0,2583.0,2124.0,6.353765e+09,...,,661.0,,,437.0,,101.0,437.0,,101.0
2021-03-25 11:40:00,11424.0,2769.0,2414.0,6.562182,0.335777,1.266839,11424.0,2769.0,2414.0,3.519956e+09,...,,569.0,,,286.0,,98.0,286.0,,98.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-04-06 11:00:00,15486.0,2600.0,2153.0,10.709788,0.367481,3.689014,15486.0,2600.0,2153.0,4.165358e+06,...,,1790.0,,,274.0,,178.0,274.0,,178.0
2021-04-06 11:10:00,15892.0,2958.0,2571.0,35.777973,2.617615,2.036871,15892.0,2958.0,2571.0,4.091582e+06,...,,1091.0,,,273.0,,106.0,273.0,,106.0
2021-04-06 11:20:00,14198.0,2969.0,2369.0,3.156415,0.726195,2.039207,14198.0,2969.0,2369.0,1.317360e+09,...,,1506.0,,,261.0,,99.0,261.0,,99.0
2021-04-06 11:30:00,19576.0,3170.0,2465.0,3.681822,0.262235,1.775246,19576.0,3170.0,2465.0,3.297751e+09,...,,2547.0,,,349.0,198.0,111.0,349.0,198.0,111.0


In [None]:
# Compare the incoming connection count averaged over three resampling
# windows; each curve is labelled so the lines can be told apart.
for window in ('12H', '6H', '3H'):
    plt.plot(
        buffer['conn_count_uid_in'].resample(window).mean(),
        label=window + ' mean',
    )
plt.title('conn_count_uid_in (resampled mean)')
plt.xlabel('ts')
plt.ylabel('conn_count_uid_in')
plt.legend();