# Performance Models and the Impact of Compressing Job Data with K-Means Clustering

In [213]:
import sys
sys.path.insert(0, '..')
import collections

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from IPython.display import display, HTML
from ipywidgets import IntProgress

from RuntimePrediction.Predict import Predictor as C3OPredictor
from RuntimePrediction.DefaultModels import (GradientBoosting as GB,
                                             ErnestModel as EM)
from RuntimePrediction.CustomModels import (BasicOptimisticModel as BOM,
                                            OptimisticGradientBoosting as OGB)

## Extract Runtime Data

In [214]:
# Load the raw runtime measurements, one frame per benchmark job.
# NOTE(review): the files carry a .tsv extension but no `sep` is passed, so
# pandas splits on its default comma here — confirm this is intended before
# changing it, because later cells read the re-written files with sep='\t'.
kmeans_df, sort_df, grep_df, sgd_df, pagerank_df = map(pd.read_csv, (
    '../data/kmeans.tsv',
    '../data/sort.tsv',
    '../data/grep.tsv',
    '../data/sgd.tsv',
    '../data/pagerank.tsv',
))

## 1. Compression: Filter and remove duplicate datapoints

In [215]:
# Row counts before deduplication; they are the denominators of the
# compression ratios computed further down.
kmeans_compression_0 = len(kmeans_df)
sort_compression_0 = len(sort_df)
grep_compression_0 = len(grep_df)
sgd_compression_0 = len(sgd_df)
pagerank_compression_0 = len(pagerank_df)

# Each frame paired with the file its deduplicated rows are written to.
_deduplicated_outputs = (
    (kmeans_df, '../data/reduced-kmeans_1.tsv'),
    (sort_df, '../data/reduced-sort_1.tsv'),
    (grep_df, '../data/reduced-grep_1.tsv'),
    (sgd_df, '../data/reduced-sgd_1.tsv'),
    (pagerank_df, '../data/reduced-pagerank_1.tsv'),
)

# Drop exact duplicate rows in place on every frame first ...
for frame, out_path in _deduplicated_outputs:
    frame.drop_duplicates(inplace=True)

# ... then persist the deduplicated frames (no index column is written).
for frame, out_path in _deduplicated_outputs:
    frame.to_csv(out_path, index=False)

# Compression = fraction of rows removed, rounded to four decimals.
kmeans_compression_1 = round(1 - len(kmeans_df) / kmeans_compression_0, 4)
sort_compression_1 = round(1 - len(sort_df) / sort_compression_0, 4)
grep_compression_1 = round(1 - len(grep_df) / grep_compression_0, 4)
sgd_compression_1 = round(1 - len(sgd_df) / sgd_compression_0, 4)
pagerank_compression_1 = round(1 - len(pagerank_df) / pagerank_compression_0, 4)

print(f"KMeans: First Compression: {kmeans_compression_1} Datapoints: {len(kmeans_df)}")
print(f"Sort: First Compression: {sort_compression_1} Datapoints: {len(sort_df)}")
print(f"Grep: First Compression: {grep_compression_1} Datapoints: {len(grep_df)}")
print(f"Sgd: First Compression: {sgd_compression_1} Datapoints: {len(sgd_df)}")
print(f"Pagerank: First Compression: {pagerank_compression_1} Datapoints: {len(pagerank_df)}")

KMeans: First Compression: 0.2733 Datapoints: 654
Sort: First Compression: 0.2873 Datapoints: 449
Grep: First Compression: 0.358 Datapoints: 520
Sgd: First Compression: 0.4578 Datapoints: 488
Pagerank: First Compression: 0.1631 Datapoints: 1180


## K-Means Clustering

In [217]:
# Ignore feature machine_type
kmeans_machine_type_column= pd.read_csv('../data/reduced-kmeans_1.tsv', sep='\t', usecols=range(1,2))
sort_machine_type_column= pd.read_csv('../data/reduced-sort_1.tsv', sep='\t', usecols=range(1,2))
grep_machine_type_column= pd.read_csv('../data/reduced-grep_1.tsv', sep='\t', usecols=range(1,2))
sgd_machine_type_column= pd.read_csv('../data/reduced-sgd_1.tsv', sep='\t', usecols=range(1,2))
pagerank_machine_type_column= pd.read_csv('../data/reduced-pagerank_1.tsv', sep='\t', usecols=range(1,2))

kmeans_headers = [*pd.read_csv('../data/reduced-kmeans_1.tsv', sep='\t', nrows=1)]
reducedKmeans = pd.read_csv('../data/reduced-kmeans_1.tsv', sep='\t', usecols=[c for c in kmeans_headers if c != 'machine_type'])

sort_headers = [*pd.read_csv('../data/reduced-sort_1.tsv', sep='\t', nrows=1)]
reducedSort = pd.read_csv('../data/reduced-sort_1.tsv', sep='\t', usecols=[c for c in sort_headers if c != 'machine_type'])

grep_headers = [*pd.read_csv('../data/reduced-grep_1.tsv', sep='\t', nrows=1)]
reducedGrep = pd.read_csv('../data/reduced-grep_1.tsv', sep='\t', usecols=[c for c in grep_headers if c != 'machine_type'])

sgd_headers = [*pd.read_csv('../data/reduced-sgd_1.tsv', sep='\t', nrows=1)]
reducedSgd = pd.read_csv('../data/reduced-sgd_1.tsv', sep='\t', usecols=[c for c in sgd_headers if c != 'machine_type'])

pagerank_headers = [*pd.read_csv('../data/reduced-pagerank_1.tsv', sep='\t', nrows=1)]
reducedPagerank = pd.read_csv('../data/reduced-pagerank_1.tsv', sep='\t', usecols=[c for c in pagerank_headers if c != 'machine_type'])
#print(reducedKmeans)

In [201]:
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.metrics import silhouette_score

# Cluster the K-Means job data (all columns except machine_type).
km = KMeans(n_clusters=30, random_state=0).fit(reducedKmeans)

# Cluster label of every datapoint next to its runtime, kept for inspection.
cluster_map = pd.DataFrame()
cluster_map['gross_runtime'] = reducedKmeans['gross_runtime']
cluster_map['cluster'] = km.labels_

# Silhouette score of this clustering; used to judge the choice of n_clusters.
score = silhouette_score(reducedKmeans, km.labels_, metric='euclidean')
print('Silhouetter Score: %.3f' % score)

# For every cluster centre, the positional index of the closest real datapoint.
# The distances are not needed; they are bound to a throwaway name instead of
# `_` so the notebook's last-output variable is not overwritten.
central, _nearest_distances = pairwise_distances_argmin_min(km.cluster_centers_, reducedKmeans)

# Keep only those representative rows. The file is read in full, so the
# machine_type column is already part of `dataset` and nothing has to be
# joined back (the former join referenced an undefined name and discarded its
# result). The separator is given explicitly on read and write so the output
# is tab-separated like the file the rows come from.
dataset = pd.read_csv("../data/reduced-kmeans_1.tsv", sep='\t')
# `axis` is passed by keyword: recent pandas versions reject it positionally.
filter_centroids = dataset.iloc[central].sort_index(axis=0, ascending=True)
filter_centroids.to_csv('../data/reduced-kmeans_2.tsv', sep='\t', index=False)

Silhouetter Score: 0.934


## Define Models

In [220]:
# A model entry bundles a display name, the predictor class to instantiate and
# the keyword arguments handed to that class's constructor.
Model = collections.namedtuple('Model', ['name', 'predictor', 'kwargs'])

# Every predictor is used with its default constructor arguments.
models = [
    Model(name=label, predictor=predictor_cls, kwargs={})
    for label, predictor_cls in (
        ('Ernest', EM),
        ('GBM', GB),
        ('BOM', BOM),
        ('OGB', OGB),
        ('C3O', C3OPredictor),
    )
]

## Load the Deduplicated Runtime Data

In [221]:
# Replace the in-memory frames with the deduplicated data written earlier,
# this time parsed as tab-separated columns.
kmeans_df, sort_df, grep_df, sgd_df, pagerank_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        '../data/reduced-kmeans_1.tsv',
        '../data/reduced-sort_1.tsv',
        '../data/reduced-grep_1.tsv',
        '../data/reduced-sgd_1.tsv',
        '../data/reduced-pagerank_1.tsv',
    )
)

In [222]:
Job = collections.namedtuple('Job', ['name', 'X', 'y'])

def get_training_data(df, features, filters):
    """Aggregate repeated runs and return the feature matrix and the runtimes.

    Rows are grouped by ``instance_count``, ``machine_type`` and ``features``;
    every remaining column is replaced by its per-group median. The aggregated
    rows are then narrowed down by ``filters``, applied one after another.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``instance_count``, ``machine_type``, ``gross_runtime``
        and every column named in ``features`` or ``filters``.
    features : list of str
        Job-specific feature columns (a list, since it is concatenated with
        the fixed grouping columns).
    filters : iterable of (column, operator, value)
        ``operator`` is one of ``'=='``, ``'!='``, ``'>'``, ``'>='``, ``'<'``,
        ``'<='``. Only rows whose ``column`` compares true against ``value``
        are kept.

    Returns
    -------
    X : pandas.DataFrame
        Columns ``['instance_count'] + features`` of the remaining rows.
    y : pandas.Series
        The ``gross_runtime`` column of the remaining rows. It is always a
        Series, also when exactly one row is left.

    Raises
    ------
    ValueError
        If a filter uses an operator that is not supported.
    """
    comparisons = {
        '==': lambda column, value: column == value,
        '!=': lambda column, value: column != value,
        '>':  lambda column, value: column > value,
        '>=': lambda column, value: column >= value,
        '<':  lambda column, value: column < value,
        '<=': lambda column, value: column <= value,
    }

    # Get medians
    g = df.groupby(by=['instance_count', 'machine_type'] + features)
    df = g.median().reset_index()

    # Apply filters
    # e.g. only for one machine type each, the full c3o-experiments were conducted
    # No full cartesian product!
    for k, s, v in filters:
        if s not in comparisons:
            # An unknown operator used to be ignored silently, which left the
            # data unfiltered without any hint.
            raise ValueError(f"unsupported filter operator {s!r} for column {k!r}")
        df = df[comparisons[s](df[k], v)]

    X = df[['instance_count'] + features]
    # Plain column selection instead of squeezing a one-column frame: squeeze
    # collapses a single remaining row into a scalar.
    y = df['gross_runtime']
    return X, y

td = get_training_data

# One entry per job: display name, source frame, job-specific feature columns
# and the filters applied after the per-configuration medians are taken.
_job_specs = (
    ('Sort', sort_df,
     ['data_size_MB'],
     [('machine_type', '==', 'c4.2xlarge'),
      ('line_length', '==', 100)]),
    ('Grep', grep_df,
     ['data_size_MB', 'p_occurrence'],
     [('machine_type', '==', 'm4.2xlarge')]),
    ('SGDLR', sgd_df,
     ['observations', 'features', 'iterations'],
     [('machine_type', '==', 'r4.2xlarge'),
      ('instance_count', '>', 2)]),
    ('K-Means', kmeans_df,
     ['observations', 'features', 'k'],
     [('machine_type', '==', 'r4.2xlarge'),
      ('instance_count', '>', 2)]),
    ('Page Rank', pagerank_df,
     ['links', 'pages', 'convergence_criterion'],
     [('machine_type', '==', 'r4.2xlarge')]),
)

# get_training_data returns (X, y), which fills the remaining Job fields.
jobs = [
    Job(job_label, *td(job_frame, job_features, job_filters))
    for job_label, job_frame, job_features, job_filters in _job_specs
]

## Train-Test Split Creation

In [223]:
def create_partial_training_data(job, splits, random_state=None):
    """Yield ``splits`` random train/test partitions of a job's data.

    Parameters
    ----------
    job : object with ``X`` and ``y`` attributes
        The features and targets that are split.
    splits : int
        Number of partitions to generate; ``0`` yields nothing.
    random_state : int, optional
        Base seed for reproducible partitions. Split ``i`` is drawn with seed
        ``random_state + i`` so the partitions differ from each other but are
        the same on every run. With the default ``None`` the splits are
        unseeded, exactly as before.

    Yields
    ------
    The return value of ``train_test_split(job.X, job.y)`` for each split.
    """
    for split_index in range(splits):
        seed = None if random_state is None else random_state + split_index
        yield train_test_split(job.X, job.y, random_state=seed)

### Calculating the Model Prediction Errors

In [224]:
def evaluate(model, X_train, X_test, y_train, y_test, error_metric):
    """Fit a fresh predictor on the training data and score it on the test data.

    ``model.predictor`` is instantiated with ``model.kwargs``, fitted on
    ``(X_train, y_train)`` and asked to predict ``X_test``. The result of
    ``error_metric(y_test, predictions)`` is returned.
    """
    fitted = model.predictor(**model.kwargs)
    # The return value of fit() is deliberately not used.
    fitted.fit(X_train, y_train)
    predictions = fitted.predict(X_test)
    return error_metric(y_test, predictions)

In [225]:
def create_new_compression_evaluations(amount):
    """Append fresh prediction errors to ``compression_results.csv``.

    For every job, ``amount`` train/test splits are drawn and every model is
    evaluated on each split with MAPE. One line
    ``job,model,compression,error`` is appended per evaluation, where
    ``compression`` is the first-stage compression ratio of that job. Jobs
    with an unrecognised name are evaluated but not written.
    """
    with open('compression_results.csv', 'at') as results_file:
        for current_job in jobs:
            for split in create_partial_training_data(current_job, splits=amount):
                for candidate in models:
                    error = evaluate(candidate, *split, MAPE)

                    # Pick the compression ratio that belongs to this job.
                    if current_job.name == 'K-Means':
                        compression = kmeans_compression_1
                    elif current_job.name == 'Sort':
                        compression = sort_compression_1
                    elif current_job.name == 'Grep':
                        compression = grep_compression_1
                    elif current_job.name == 'SGDLR':
                        compression = sgd_compression_1
                    elif current_job.name == 'Page Rank':
                        compression = pagerank_compression_1
                    else:
                        continue

                    results_file.write(
                        f"{current_job.name},{candidate.name},{compression},{error}\n")

In [226]:
# With amount=0 no train/test splits are generated, so the results file is
# opened in append mode but no new rows are written by this call.
create_new_compression_evaluations(amount=0)

### Display the Results

In [227]:
def read_results(job_name):
    """Return the mean MAPE per model and compression level for one job.

    Reads ``compression_results.csv`` (no header; columns are job, model,
    compression, MAPE), averages the MAPE over all rows that share job, model
    and compression, and reshapes the rows of ``job_name`` into a table with
    one row per model and one column per compression value.

    Returns ``None`` when the results file is missing, unreadable, empty or
    cannot be parsed. Any other error is propagated to the caller.
    """
    try:
        results = pd.read_csv('compression_results.csv', header=None)
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
        # Only file-level problems are treated as "no results yet"; a bare
        # except would also swallow KeyboardInterrupt and programming errors.
        return None

    results.columns = ['Job', 'Model', 'Compression', 'MAPE']
    results['MAPE'] = results['MAPE'].astype(float)

    groups = results.groupby(by=['Job', 'Model', 'Compression'], as_index=False)
    df = groups.mean()
    # Filter info for just the job we are interested in
    jobdf = df[df['Job'] == job_name].set_index(['Model', 'Compression'])
    # Remove redundant info
    return jobdf.unstack()['MAPE']

def display_job_evaluation(job_name):
    """Render the MAPE table of one job as styled HTML.

    The table comes from ``read_results(job_name)``. If no table is available
    (``read_results`` returns ``None`` or raises), the job heading followed by
    ``None`` is shown instead.
    """
    try:
        displaydf = read_results(job_name)
    except Exception:
        # Best effort: a job without usable results must not stop the loop
        # over the remaining jobs. KeyboardInterrupt is no longer swallowed.
        displaydf = None

    if displaydf is None:
        # read_results signals a missing results file with None; without this
        # check the styling below would fail on `None.style`.
        display(HTML(f"<h3>{job_name}</h3> None"))
        return

    # Style the output to highlight the important information
    def highlight_row_min(row):
        # row.min() skips NaN cells; the builtin min() gives an
        # order-dependent answer as soon as a NaN is present.
        row_min = row.min()
        return ['color: black' if cell == row_min else 'color: dimgray' for cell in row]

    def highlight_min(data, color='aquamarine', bold=False):
        # highlight the minimum in a Series or DataFrame
        attr1 = f"background-color: {color}"
        attr2 = f"font-weight: {'bold' if bold else 'normal'}"
        if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
            is_min = data == data.min()
            return [attr1 if v else '' for v in is_min]
        else:  # from .apply(axis=None)
            is_min = data == data.min().min()
            return pd.DataFrame(np.where(is_min, attr2, ''),
                                index=data.index, columns=data.columns)

    table_html = displaydf.style.apply(highlight_row_min, axis=1) \
                                .apply(highlight_min, axis=0, color='aquamarine') \
                                .apply(highlight_min, axis=None, bold=True) \
                                .format("{:.2%}") \
                                ._repr_html_()

    display(HTML(f"<h3>{job_name}</h3>" + table_html))

### Performance of the C3O Predictor and its Constituent Models

In [228]:
# Render one MAPE table per job, in the order the jobs appear in `jobs`.
for job in jobs:
    display_job_evaluation(job.name)

Compression,0.0,0.2873
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
BOM,6.31%,6.22%
C3O,4.65%,3.93%
Ernest,5.48%,6.09%
GBM,6.39%,7.23%
OGB,3.31%,2.79%


Compression,0.0,0.358
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
BOM,15.30%,18.57%
C3O,3.80%,3.31%
Ernest,39.16%,38.08%
GBM,3.76%,3.31%
OGB,10.61%,14.77%


Compression,0.0,0.4578
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
BOM,13.16%,11.62%
C3O,2.81%,2.99%
Ernest,22.30%,21.71%
GBM,2.81%,2.99%
OGB,7.88%,6.42%


Compression,0.0,0.2733
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
BOM,5.57%,5.50%
C3O,2.69%,2.71%
Ernest,15.96%,14.29%
GBM,2.70%,2.70%
OGB,5.18%,5.10%


Compression,0.0,0.1631
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
BOM,13.52%,15.30%
C3O,2.86%,2.95%
Ernest,33.18%,34.84%
GBM,2.92%,3.00%
OGB,3.11%,3.75%


In [None]:
# Font used for all figure text (the earlier 'DejaVu Sans' assignment was
# overwritten immediately and never took effect).
ff = 'DejaVu Serif'
matplotlib.rc('font', family=ff)

model_names = [m.name for m in models]

# One colour per entry of `models`, in the same order.
colors = ['tab:blue', 'tab:green', 'red', 'tab:orange', 'black']

fig = plt.figure(figsize=(8,8))
for i, job in enumerate(jobs):
    ax = fig.add_subplot(3, 2, i + 1)
    ax.set_title(job.name, fontsize=17)

    results = read_results(job.name)
    # read_results returns None when there is no results file; the panel is
    # then left empty instead of failing on `None.loc`.
    if results is not None:
        for j, model_name in enumerate(model_names):
            # One line per model: x = compression levels, y = mean MAPE.
            per_model = results.loc[model_name]
            ax.plot(per_model.keys(), per_model[:],
                    '--' if model_name=='C3O' else '-',
                    label=model_name, color=colors[j])

    # Axis decoration is the same for every model, so it is set once per panel.
    ax.set_xlabel('Kompression', fontsize=15)
    ax.set_ylabel('MAPE', fontsize=15)
    ax.set_ylim(0, .8)

# A single legend for the whole figure, placed in the unused sixth grid cell.
fig.legend(model_names, fontsize=15, loc='lower center', ncol=1, bbox_to_anchor=[0.78, +.07],
           bbox_transform=fig.transFigure)
fig.tight_layout()
fig.savefig('availability.pdf', bbox_inches='tight')
plt.show()