In [None]:
#imports and display settings
import numpy as np
import pandas as pd
from pathlib import Path
# Display settings: show up to 1000 rows, never truncate columns, and print
# floats with 10 decimal places.
pd.set_option('display.max_rows', 1000)
# Use the public pd.set_option entry point; `pd.pandas` is an undocumented
# attribute that only exists as a side effect of pandas' own imports.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.10f' % x)

import time, calendar
import pytz, datetime
from datetime import timedelta, datetime
import sys, math

In [None]:
# Seed the random number generators so repeated runs give the same results.
import random
from numpy.random import seed
seed(1)
# createRandomBaseline() draws from Python's built-in `random` module, which
# the numpy seed above does not affect, so seed it as well.
random.seed(1)
import tensorflow as tf

In [None]:
# Major component of the installed TensorFlow version (the text before the first dot)
tf_version_major = int(tf.__version__.split(".", 1)[0])

In [None]:
# Seed TensorFlow through whichever API the installed major version exposes.
if tf_version_major < 2:
    print ('Using tensorflow 1.x')
    # TF 1.x exposes the seeding function at the package top level
    from tensorflow import set_random_seed
    set_random_seed(2)
else:
    print ('Using tensorflow 2.x or greater')
    tf.random.set_seed(2)

In [None]:
# for plotting
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def normalizeDataset(dataframe, columns):
    """Return a copy of `dataframe` with the natural log applied to `columns`.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; it is copied first and never modified.
    columns : iterable of column labels
        Columns to replace with their element-wise natural logarithm.

    Returns
    -------
    pandas.DataFrame
        The transformed copy. Columns not listed are left as they were.
    """
    transformed = dataframe.copy()
    for column_name in columns:
        transformed[column_name] = np.log(transformed[column_name])
    return transformed

In [None]:
# Note: the feature analysis itself was done in a separate Jupyter notebook;
# only its resulting settings are recorded here.
outliers = ['ThreadsClient']                # columns handed to normalizeDataset for the log transform
target_var = 'TotalMessages'                # target column
dropColumns = ['LoopId', 'LoopStartTime']   # columns excluded from the scaled feature set

In [None]:
# Read the aggregated client-side datasets: one CSV per (L1-L3, zone1-zone2) pair,
# loaded in the same order as the names on the left-hand side.
(summaryL1Z1, summaryL1Z2,
 summaryL2Z1, summaryL2Z2,
 summaryL3Z1, summaryL3Z2) = map(pd.read_csv, [
    'summary-L1-zone1.csv',
    'summary-L1-zone2.csv',
    'summary-L2-zone1.csv',
    'summary-L2-zone2.csv',
    'summary-L3-zone1.csv',
    'summary-L3-zone2.csv',
])

In [None]:
# Stack the six summary frames into one dataset with a single concat.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and chaining
# it re-copied the accumulated rows on every call. As with append's default, the
# original row labels are kept (no ignore_index), so index values may repeat.
allData = pd.concat([
    summaryL1Z1,
    summaryL1Z2,
    summaryL2Z1,
    summaryL2Z2,
    summaryL3Z1,
    summaryL3Z2,
])

In [None]:
# Keep an untransformed snapshot, then log-transform the outlier columns.
# normalizeDataset works on its own copy, so allDataCopy stays untransformed.
allDataCopy = allData.copy()
allData = normalizeDataset(allDataCopy, outliers)

In [None]:
from sklearn.model_selection import KFold, LeaveOneOut

In [None]:
import random

In [None]:
def createRandomBaseline():
    """Return 30 random row positions into `allData`.

    Each position is drawn independently with `random.randint` from the
    inclusive range [0, len(allData) - 1], so the same position can appear
    more than once.
    """
    return [random.randint(0, len(allData) - 1) for _ in range(30)]

In [None]:
# Extractor of the records that were run with the Kafka default values
def baselineValues(df):
    """Return the rows of `df` whose four thread settings all match the defaults.

    A row is kept when BackgroundThreads == 10, NumNetworkThreads == 3,
    NumIoThreads == 8 and NumReplicaFetchers == 1.
    """
    default_background = df['BackgroundThreads'] == 10
    default_network = df['NumNetworkThreads'] == 3
    default_io = df['NumIoThreads'] == 8
    default_fetchers = df['NumReplicaFetchers'] == 1
    is_baseline = default_background & default_network & default_io & default_fetchers
    return df[is_baseline]

In [None]:
# Extract the baseline records from each summary dataset and concatenate them.
# Note that baseline values are found only in Zone1 records.
# pd.concat replaces the chained DataFrame.append calls (removed in pandas 2.0);
# the original row labels are kept, as they were with append's defaults.
baselinesDF_initial = pd.concat([
    baselineValues(summaryL1Z1),
    baselineValues(summaryL2Z1),
    baselineValues(summaryL3Z1),
])

In [None]:
# Columns that are left out of scaling: the dropped columns plus the target.
unscalable_vars = [*dropColumns, target_var]
# Every other column of the combined dataset gets scaled, in its original order.
to_scale_vars = [column for column in allData.columns if column not in unscalable_vars]
#to_scale_vars

In [None]:
def prepareTestDataset(scaler, targetColumnName, dropList, dataframe, outliersList,
                       scaleColumns=None):
    """Split `dataframe` into a target frame and a scaled feature frame.

    Parameters
    ----------
    scaler : object with a ``transform`` method
        Applied to the feature columns; its output becomes the feature values.
    targetColumnName : str
        Name of the target column. Earlier versions ignored this argument and
        read the module-level ``target_var`` instead.
    dropList : list
        Columns removed before the feature columns are selected.
    dataframe : pandas.DataFrame
        Input records; never modified.
    outliersList : list
        Not used by this function; kept so existing calls keep working.
    scaleColumns : list, optional
        Feature columns to scale, in output order. Defaults to the notebook-level
        ``to_scale_vars`` so existing calls behave as before.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(targetDF, featuresDF)``: the single target column and the scaled
        features, both indexed 0..n-1.
    """
    if scaleColumns is None:
        # Fall back to the notebook-level feature list.
        scaleColumns = to_scale_vars

    targetDF = dataframe[[targetColumnName]].reset_index(drop=True)

    # drop() returns a new frame, so the caller's dataframe is left untouched.
    features = dataframe.drop(dropList, axis=1)
    featuresDF = pd.DataFrame(scaler.transform(features[scaleColumns]),
                              columns=scaleColumns)
    return (targetDF, featuresDF)

In [None]:
# Pick the activation: a hand-written tanh-based GELU formula, or the built-in 'relu'.
# NOTE(review): activationToUse is not assigned in the cells visible here -
# confirm it is defined before this cell runs.
if activationToUse != 'GELU':
    custom_activation = 'relu'
else:
    @tf.function
    def custom_activation(x):
        """Compute 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))."""
        inner = x + 0.044715 * tf.pow(x, 3)
        return 0.5 * x * (1 + tf.tanh(tf.sqrt(2 / math.pi) * inner))

# Store the chosen activation in Keras' custom objects under 'custom_activation'.
from tensorflow.keras.utils import get_custom_objects
get_custom_objects()['custom_activation'] = custom_activation

In [None]:
# Column labels of the combined dataset
columnList = allData.columns