In [1]:
import pandas as pd
import numpy as np
import copy

from scipy.stats import lognorm
import sys
import os

sys.path.append('../../../analyzing_serverless_in_the_wild')

import source.utilities as utilities 
import source.policy as policy

In [2]:
## TRAIN
def train(data, config):
    """Fit the hybrid policy on one chunk of training data.

    Args:
        data: DataFrame containing a 'HashApp' column; every other column is
            converted to a plain array and handed to the policy.
        config: Settings forwarded unchanged to ``policy.hybrid_policy``.

    Returns:
        DataFrame built from the policy output with a 'HashApp' column added.
        Assumes the policy returns one entry per input row, in input order
        -- TODO confirm against source.policy.
    """
    apps = data['HashApp']
    features = np.array(data.drop(columns=['HashApp']))

    result = pd.DataFrame(policy.hybrid_policy(features, config))

    # The policy only ever saw a bare array, so its rows correspond to the
    # rows of `data` by position. Assigning the Series itself would align on
    # index labels and silently produce NaN / wrong ids whenever `data` does
    # not carry a default 0..n-1 index (e.g. after filtering).
    result['HashApp'] = apps.to_numpy()

    return result


## TEST/EVAL
def test(data, policy_result):

    hist_policy = policy_result[policy_result['Policy'] == 'Histogram']
    fixed_policy = policy_result[(policy_result['Policy'] == 'Fixed') | (policy_result['Policy'] == 'Arima')]
    # arima_policy = policy_result[policy_result['Policy'] == 'Arima']

    pre_warm = hist_policy[['HashApp','PreWarm']]
    keep_alive = hist_policy[['HashApp','KeepAlive']]
        
    apps = data['HashApp']
    data['HashApp'] = apps

    hist_policy_chunk = data.merge(hist_policy['HashApp'], on='HashApp', how='inner')

    pre_warm_chunk = pre_warm.merge(apps, on='HashApp', how='inner')
    pre_warm_chunk = pre_warm_chunk['PreWarm']

    keep_alive_chunk = keep_alive.merge(apps, on='HashApp', how='inner')
    keep_alive_chunk = keep_alive_chunk['KeepAlive']

    hist_policy_chunk.columns = ['HashApp'] + list(range(1, 1440*2 + 1))
    
    fixed_policy_chunk = data.merge(fixed_policy['HashApp'], on='HashApp', how='inner')
    fixed_policy_chunk.columns = ['HashApp'] + list(range(1, 1440*2 + 1))

    sim_result_hist = utilities.compute_simulation(hist_policy_chunk,
                                                prewarm_window=pre_warm_chunk,
                                                keep_alive_window=keep_alive_chunk)
    sim_result_fixed = utilities.compute_simulation(fixed_policy_chunk,
                                                prewarm_window=0,
                                                keep_alive_window=10)
    sim_result_arima = None
    

    return sim_result_hist, sim_result_fixed, sim_result_arima

In [3]:
# Directory holding the per-chunk CSV inputs and directory receiving the results.
dir_path = "chunk_invocation_data"
path_results = "simulation_results"

# Make sure the output directory exists so the later to_csv calls do not fail
# on a fresh checkout; a no-op when it is already there.
os.makedirs(path_results, exist_ok=True)

# Count only chunk_<n>.csv files. Counting every directory entry would let
# stray items (.ipynb_checkpoints, .DS_Store, ...) inflate the range and make
# the loops below try to open chunk files that do not exist.
num_chunks = sum(
    1 for entry in os.listdir(dir_path)
    if entry.startswith('chunk_') and entry.endswith('.csv')
)
chunk_range = range(1, num_chunks + 1)

def train_test_hybrid_policy(config):
    """Train and evaluate the hybrid policy on every chunk file.

    For each ``chunk_<id>.csv`` in ``dir_path`` the trailing ``1440*2``
    columns are held out for evaluation; everything before them is passed to
    ``train``. The resulting policy is then simulated on the held-out columns
    via ``test``.

    Args:
        config: Settings forwarded to ``train``.

    Returns:
        Tuple ``(policy_result, hist_results, fixed_results)``, each the
        concatenation of the per-chunk results.
    """
    n_eval_columns = 1440 * 2

    policy_result_list = []
    hist_results = []
    fixed_results = []

    for chunk_id in chunk_range:
        file_path = os.path.join(dir_path, f'chunk_{chunk_id}.csv')
        data = pd.read_csv(file_path)

        policy_result = train(data.iloc[:, :-n_eval_columns], config)
        policy_result_list.append(policy_result)

        # Evaluation frame: app id first, then the held-out columns.
        eval_data = pd.concat([data['HashApp'], data.iloc[:, -n_eval_columns:]], axis=1)

        # The third item returned by test() is the ARIMA result, which is not collected.
        sim_hist, sim_fixed, _ = test(eval_data, policy_result)
        hist_results.append(sim_hist)
        fixed_results.append(sim_fixed)

    policy_result = pd.concat(policy_result_list)
    hist_results = pd.concat(hist_results)
    fixed_results = pd.concat(fixed_results)
    return policy_result, hist_results, fixed_results


def test_fixed_policy(keep_alive=10):
    """Simulate a fixed keep-alive policy on the evaluation part of every chunk.

    Args:
        keep_alive: Value passed as ``keep_alive_window`` to the simulation;
            the pre-warm window is always 0.

    Returns:
        Concatenation of the per-chunk simulation results.
    """
    n_eval_columns = 1440 * 2
    per_chunk_results = []

    for chunk_id in chunk_range:
        chunk = pd.read_csv(os.path.join(dir_path, f'chunk_{chunk_id}.csv'))

        # Trailing columns relabelled 1..2880, with the app id appended last.
        eval_frame = chunk.iloc[:, -n_eval_columns:].copy()
        eval_frame.columns = list(range(1, n_eval_columns + 1))
        eval_frame['HashApp'] = chunk['HashApp']

        per_chunk_results.append(
            utilities.compute_simulation(eval_frame,
                                         prewarm_window=0,
                                         keep_alive_window=keep_alive)
        )

    return pd.concat(per_chunk_results)

In [4]:
# Baseline settings handed to the hybrid policy; the sweep cells work on
# copies of this dict and override individual entries.
config = dict(
    histogram_threshold=4,
    oob_threshold=0.2,
    cv_threshold=2,
    pctl_lower=5,
    pctl_upper=99,
)

In [5]:
## HYBRID PERCENTILES
# Sweep the (lower, upper) percentile pair of the hybrid policy and store one
# CSV of simulation results per pair.
result_list = []
for percentiles in [(0,100), (0,99), (5,100), (1,99), (5,99), (1,95), (5,95)]:
    filename = f'Hybrid_{percentiles}.csv'
    file_path = os.path.join(path_results, filename)

    # Override the percentiles on the copy, not on the shared baseline:
    # writing to `config` here would leave the last pair of the sweep in the
    # baseline for every cell that runs afterwards.
    config_perc = copy.deepcopy(config)
    config_perc['pctl_lower'] = percentiles[0]
    config_perc['pctl_upper'] = percentiles[1]
    policy_result, hist_results, fixed_results = train_test_hybrid_policy(config_perc)
    sim_result = pd.concat([hist_results, fixed_results])
    sim_result.to_csv(file_path)

In [6]:
# Evaluate the fixed policy for a range of keep-alive values and write one
# result CSV per value.
for keep_alive in (5, 10, 20, 30, 45, 60, 90, 120, 20_000):
    filename = f'Fixed_{keep_alive}.csv'
    file_path = os.path.join(path_results, filename)

    sim_result = test_fixed_policy(keep_alive=keep_alive)
    sim_result.to_csv(file_path)

In [None]:
# Sweep the CV threshold of the hybrid policy with percentiles pinned to
# (5, 99); each run is saved to CSV and its two summary columns are collected
# in result_list together with plot metadata.
result_list = []
for cv in (0, 2, 5, 10):
    filename = f'Hybrid_CV_{cv}.csv'
    file_path = os.path.join(path_results, filename)

    config_cv = copy.deepcopy(config)
    config_cv.update({'pctl_lower': 5, 'pctl_upper': 99, 'cv_threshold': cv})

    policy_result, hist_results, fixed_results = train_test_hybrid_policy(config_cv)
    sim_result = pd.concat([hist_results, fixed_results])
    # The full result goes to disk; only the two metrics are kept in memory.
    sim_result.to_csv(file_path)
    sim_result = sim_result[['ColdStartPercentage','WastedMemoryRatio']]

    plot_dict = {
        'DF': sim_result,
        'Label': f"CV={cv}",
        'Linestyle': '-',
    }
    result_list.append(plot_dict)

In [None]:
def calculate_weighted_average(histograms):
    """Return the count-weighted mean bin number of every histogram row.

    Bins are numbered from 1. Rows whose counts sum to zero yield 0 instead
    of NaN.

    Args:
        histograms: 2-D array of bin counts, one histogram per row.

    Returns:
        1-D array with one weighted average per row.
    """
    bin_index = np.arange(1, histograms.shape[1] + 1)

    row_totals = histograms.sum(axis=1)
    weighted_totals = (histograms * bin_index).sum(axis=1)

    # Empty rows divide 0 by 0; silence the warning and patch them afterwards.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = weighted_totals / row_totals

    return np.where(row_totals != 0, ratio, 0)

# Simulate a per-app keep-alive equal to the rounded weighted-average bin of
# the app's inter-invocation-time histogram, chunk by chunk.
w_avg_results = []

for chunk_id in chunk_range:
    chunk_filename = f'chunk_{chunk_id}.csv'
    file_path = os.path.join(dir_path, chunk_filename)
    data = pd.read_csv(file_path)
    subset = data.iloc[:, -1440*2:].copy()
    subset.columns = list(range(1, 1440*2 + 1))
    subset['HashApp'] = data['HashApp']

    # Only the invocation columns go into the inter-invocation computation;
    # the previous code converted `subset` itself and thereby dragged the
    # 'HashApp' id column into the array.
    subset_iit = np.array(subset.drop(columns=['HashApp']))

    iit = utilities.compute_inter_invocation_times(subset_iit)
    hist = utilities.compute_histogram(iit)
    # Keep the first 240 histogram bins only.
    hist = hist[:,:240]
    w_avg = calculate_weighted_average(hist)

    keep_alive = np.round(w_avg)

    sim_result_avg = utilities.compute_simulation(subset,
                                            prewarm_window=0,
                                            keep_alive_window=keep_alive)
    w_avg_results.append(sim_result_avg)

w_avg_results = pd.concat(w_avg_results)
# Write next to the other result files instead of repeating the directory name.
w_avg_results.to_csv(os.path.join(path_results, 'weighted_average.csv'))

In [9]:
# Sweep the histogram threshold of the hybrid policy with percentiles pinned
# to (5, 99) and write one result CSV per setting.
result_list = []
for window in (1, 2, 3, 4):
    filename = f'Hybrid_Window_{window}.csv'
    file_path = os.path.join(path_results, filename)

    config_window = copy.deepcopy(config)
    config_window.update({
        'histogram_threshold': window,
        'pctl_lower': 5,
        'pctl_upper': 99,
    })

    policy_result, hist_results, fixed_results = train_test_hybrid_policy(config_window)
    sim_result = pd.concat([hist_results, fixed_results])
    # The full result goes to disk; afterwards only the two metrics are kept.
    sim_result.to_csv(file_path)
    sim_result = sim_result[['ColdStartPercentage','WastedMemoryRatio']]

In [10]:
## CODE FOR RUNNING ARIMA; NOT USED BECAUSE TRAINING THE MODELS TAKES A LONG TIME

# chunk_filename = 'chunk_1.csv'
# file_path = os.path.join(dir_path, chunk_filename)
# data = pd.read_csv(file_path)

# policy_result = train(data.iloc[:,:-1440*2])

# unique_apps.update(policy_result['HashApp'])

# arima_policy = policy_result[policy_result['Policy'] == 'Arima']
# arima_chunk = data.merge(arima_policy['HashApp'], on='HashApp', how='inner')
# arima_chunk_train = arima_chunk.iloc[:,:-1440*2]
# arima_chunk_test = arima_chunk.iloc[:,-1440*2:]
# arima_chunk_test['HashApp'] = arima_chunk_train['HashApp']

# apps = arima_chunk_train['HashApp']
# app = apps[0]

# print(app)
# # Get training data for the app
# train_app = arima_chunk_train[arima_chunk_train['HashApp'] == app]
# train_app = arima_chunk_train.drop(columns=['HashApp'])
# test_app = train_app.to_numpy().flatten()
# test_app

# # Run the function
# cold_starts, function_durations, keep_alive_times = utilities.train_test_arima(arima_chunk_train, arima_chunk_test)