In [None]:
import pandas as pd
from plotnine import *
import os

In [None]:
# Directory names used throughout the notebook (all relative paths).
entry_directory, prepared_directory, organised_directory = (
    "Raw",
    "Prepared",
    "Organised",
)

## Standardise the joined files for each UDF type (z-score of CPU and RAM within each snapshot)

In [None]:
%matplotlib inline
udf_types = ['aggregation', 'filtration', 'filtration-aggregation', 'filtration-aggregation-join', 'filtration-join']
for udf_type in udf_types:
  
  full_df = pd.read_csv(f"{organised_directory}/{udf_type}/joined_{udf_type}.csv")
  means_df = full_df[["CPU", "RAM"]].groupby(full_df["snapshot"]).mean().rename(columns={"CPU" : "Mean_CPU", "RAM" : "Mean_RAM"}).reset_index()
  std_df = full_df[["CPU", "RAM"]].groupby(full_df["snapshot"]).std().rename(columns={"CPU" : "Std_CPU", "RAM" : "Std_RAM"}).reset_index()
  full_df = full_df.join(means_df, on="snapshot", rsuffix="_means",  how ="left").join(std_df, on="snapshot", rsuffix="_stds", how ="left")
  full_df["CPU"] = (full_df["CPU"]-full_df["Mean_CPU"])/full_df["Std_CPU"]
  full_df["RAM"] = (full_df["RAM"]-full_df["Mean_RAM"])/full_df["Std_RAM"]

  if not os.path.exists(f"{organised_directory}/{udf_type}"):
      os.makedirs(f"{organised_directory}/{udf_type}")
  full_df[["snapshot", "label", "udf", "epoch", "CPU", "RAM"]].to_csv(f"./{organised_directory}/{udf_type}/standardised_{udf_type}.csv", index = False)


## Normalize the joined files for each UDF type (min-max scaling of CPU and RAM across all UDF types)

In [None]:
%matplotlib inline
udf_types = ['aggregation', 'filtration', 'filtration-aggregation', 'filtration-aggregation-join', 'filtration-join']
full_df = pd.DataFrame()
for udf_type in udf_types:
    full_df = pd.concat([full_df, pd.read_csv(f"{organised_directory}/{udf_type}/joined_{udf_type}.csv")])

full_df.CPU = (full_df.CPU-full_df.CPU.min())/(full_df.CPU.max() - full_df.CPU.min())
full_df.RAM = (full_df.RAM-full_df.RAM.min())/(full_df.RAM.max() - full_df.RAM.min())
for udf_type in udf_types:
    if not os.path.exists(f"{organised_directory}/{udf_type}"):
        os.makedirs(f"{organised_directory}/{udf_type}")
    full_df[full_df.label == udf_type].to_csv(f"./{organised_directory}/{udf_type}/normalized_{udf_type}.csv", index = False)