In [48]:
import pandas as pd
import os

directory = '../results/simulation/evaluation_jobs'

work_phase_index_to_name = {
    '0': 'balanced',
    '1': 'long-high',
    '2': 'short-high',
}

columns = [
    'filename',
    'scheduling_policy',
    'work_type',
    'work_phases',
    'startup_length',
    'startup_power',
    'waiting_time',
    'id',
    'arrival_time',
    'length',
    'carbon_cost',
    'start_time',
    'waiting_time',
    'exit_time'
]

all_results = pd.DataFrame([])

print(len(os.listdir(directory)))

def read_trace(filename: str): 
        # this is based on the mapping in generate_evaluation_jobs.sh
        parameters = filename.split('_')

        parameter_dict = {
            'scheduling_policy': parameters[0],
            'work_type': parameters[1],
            'work_phases': work_phase_index_to_name[parameters[2]],
            'startup_length': parameters[3],
            'startup_power': parameters[4],
            'waiting_time': parameters[5],
        }

        df = pd.read_csv(f"{directory}/{filename}")
        df = df.drop(df.index[-1])

        if (len(df) < 9):
            print(filename)

        for key, value in parameter_dict.items():
            df[key] = value

        df['filename'] = filename

        return df


# List all files in the directory
for filename in os.listdir(directory):
    if os.path.isfile(os.path.join(directory, filename)):
        parameters = filename.split('_')

        if parameters[-1] != 'details':
            continue

        df = read_trace(filename)

        all_results = pd.concat([
            all_results, 
            df
        ], ignore_index=True)

print(all_results.head())
print(f"Read {len(all_results)} entries")

960
carbon_periodic-phases_1_300_100_96_details
suspend-resume_periodic-phases_2_0_100_96_details
carbon_periodic-phases_2_0_200_12_details
suspend-resume_constant-from-periodic-phases_0_0_100_96_details
carbon_constant-from-periodic-phases_1_300_200_48_details
carbon_constant-from-periodic-phases_0_600_200_6_details
carbon_constant-from-periodic-phases_2_600_200_96_details
suspend-resume_periodic-phases_1_600_100_6_details
carbon_periodic-phases_1_600_100_48_details
carbon_constant-from-periodic-phases_2_0_200_96_details
carbon_constant-from-periodic-phases_1_0_200_48_details
suspend-resume_constant-from-periodic-phases_2_0_200_12_details
suspend-resume_periodic-phases_2_1800_200_6_details
suspend-resume_periodic-phases_2_600_100_96_details
suspend-resume_constant-from-periodic-phases_0_1800_100_48_details
carbon_constant-from-periodic-phases_2_300_200_48_details
carbon_constant-from-periodic-phases_1_300_200_6_details
carbon_constant-from-periodic-phases_2_600_100_96_details
carbon_c

In [49]:
import plotly.express as px
import plotly.io as pio

"""
Let's first compare the same job across different scheduling approaches,
deducing how much carbon is emitted under each scheduler
"""

new_id_mapping = {
    '0': 1,
    '1': 4,
    '2': 7,
    '3': 2,
    '4': 5,
    '5': 8, 
    '6': 3,
    '7': 6,
    '8': 9
}

all_results["ID_new"] = all_results["ID"].apply(lambda id: new_id_mapping[str(id)])

same_job_different_schedulers = all_results.groupby(["ID_new", "startup_length", "startup_power", "waiting_time", "work_type", "work_phases", "arrival_time"])


same_job_different_schedulers_plot_df = pd.DataFrame([])
savings_df = pd.DataFrame([])


had_savings = 0
index = 0

for category, group_df in same_job_different_schedulers:

    group_df["job_index"] = index

    carbon_cost_non_interrupted = group_df[group_df["scheduling_policy"] == "carbon"]["carbon_cost"].sum()
    carbon_cost_suspend_resume = group_df[group_df["scheduling_policy"] == "suspend-resume"]["carbon_cost"].sum()

    dummy_dict = group_df[group_df["scheduling_policy"] == "carbon"].iloc[0].to_dict()

    min_delta = 0.01

    if (abs(carbon_cost_non_interrupted - carbon_cost_suspend_resume) < min_delta):


        same_job_different_schedulers_plot_df = pd.concat([
            same_job_different_schedulers_plot_df,
            pd.DataFrame([{
                **dummy_dict,
                "job_index": index, 
                "carbon_cost":carbon_cost_non_interrupted, 
                "scheduling_policy": f"Δ < {min_delta}"}])
        ])

    else:
        savings_df = pd.concat([
            savings_df,
            pd.DataFrame([{
                **dummy_dict,
                "carbon_cost_suspend_resume": carbon_cost_suspend_resume,
                "carbon_cost_non_interrupted": carbon_cost_non_interrupted,
                "delta_schedulers": carbon_cost_suspend_resume - carbon_cost_non_interrupted
            }])
        ])

        had_savings += 1
        same_job_different_schedulers_plot_df = pd.concat([
            same_job_different_schedulers_plot_df,
            group_df
        ])

    index += 1

print(f"Out of {index} groups, {had_savings} had savings between the schedulers")

IndexError: single positional indexer is out-of-bounds

In [31]:
"""
Also try to compare the impact of having phase information
"""

same_job_different_phases = all_results.groupby(["ID_new", "startup_length", "startup_power", "waiting_time", "scheduling_policy" , "work_phases", "arrival_time"])

same_job_different_phases_df = pd.DataFrame([])
phases_savings_df = pd.DataFrame([])


had_savings = 0
index = 0

for category, group_df in same_job_different_phases:

    group_df["job_index"] = index
    carbon_cost_non_constant = group_df[group_df["work_type"] == "periodic-phases"]["carbon_cost"].sum()
    carbon_cost_constant = group_df[group_df["work_type"] == "constant-from-periodic-phases"]["carbon_cost"].sum()

    dummy_dict = group_df[group_df["work_type"] == "periodic-phases"].iloc[0].to_dict()

    min_delta = 0.01

    if (abs(carbon_cost_non_constant - carbon_cost_constant) < min_delta):
        same_job_different_phases_df = pd.concat([
            same_job_different_phases_df,
            pd.DataFrame([{
                **dummy_dict,
                "job_index": index, 
                "carbon_cost": carbon_cost_constant, 
                "work_type": f"Δ < {min_delta}"}])
        ])

    else:
        # same_job_different_phases_df = pd.concat([
        #     same_job_different_phases_df,
        #     pd.DataFrame([{
        #         **dummy_dict,
        #         "carbon_cost_constant": carbon_cost_constant,
        #         "carbon_cost_non_constant": carbon_cost_non_constant,
        #         "delta": carbon_cost_constant - carbon_cost_non_constant
        #     }])
        # ])

        had_savings += 1
        same_job_different_phases_df = pd.concat([
            same_job_different_phases_df,
            group_df
        ])

    index += 1

print(f"Out of {index} groups, {had_savings} had savings between having phase information")
print(savings_df)

Out of 1728 groups, 1106 had savings between having phase information
    ID  arrival_time  length  cpus length_class  resource_class  carbon_cost  \
0    3          1800    3600     1          0-2             1.0     8.680000   
0    3          1800    3600     1          0-2             1.0     8.680000   
0    3          1800    3600     1          0-2             1.0     8.680000   
0    3          1800    3600     1          0-2             1.0     8.680000   
0    3          1800    3600     1          0-2             1.0     8.680000   
..  ..           ...     ...   ...          ...             ...          ...   
0    8          3600   10800     1          2-6             1.0    44.928167   
0    8          3600   10800     1          2-6             1.0    47.213167   
0    8          3600   10800     1          2-6             1.0    38.355333   
0    8          3600   10800     1          2-6             1.0    29.433333   
0    8          3600   10800     1          2-6   

In [32]:
"""
Scrolling through by eye, there are some cases where the suspend-resume strategy performed 
better but there are also some cases where it performed worse.

Lets do a graph just plotting each experiment, with the carbon emissions on the y axis
"""

def make_deadlines_plot(df: pd.DataFrame, target_col: str, labels: dict):
    fig = px.scatter(
        df, x="job_index", y="carbon_cost", color=target_col,
        hover_data=df.columns, labels=labels
    )


    fig.update_layout({
        "xaxis_title":"Job ID",
        "yaxis_title":"Carbon emissions in kg",
        "xaxis_range": [0, df["job_index"].max()]
    })


    # add lines for the length of the jobs
    fig.add_vline(x = 576)
    fig.add_annotation(x= 576 / 2, y=35, text="1 hour jobs", showarrow=False)

    fig.add_vline(x = 1163)
    fig.add_annotation(x= 700, y=45, text="2 hour jobs", showarrow=False)

    fig.add_annotation(x= 1300, y=15, text="3 hour jobs", showarrow=False)

    fig.update_layout(legend ={
        "yanchor":"top",
        "y":0.96, # equal spacing ;)
        "xanchor":"left",
        "x":0.01
    })

    fig.show()
    return fig

same_job_different_schedulers_plot_df["scheduling_policy"] = same_job_different_schedulers_plot_df["scheduling_policy"].apply(lambda label: "without resuming" if label == "carbon" else label)

same_job_different_schedulers_fig = make_deadlines_plot(same_job_different_schedulers_plot_df, "scheduling_policy", {"scheduling_policy":"Scheduled Jobs"})

pio.write_image(same_job_different_schedulers_fig, file="eval_same_job_different_schedulers.pdf")

In [33]:
"""
Plot Carbon-savings form having carbon information or not
"""

label_mapping = {
    "constant-from-periodic-phases": "averaged constant",
    "periodic-phases": "phases"
}

same_job_different_phases_df["work_type"] = same_job_different_phases_df["work_type"].apply(lambda label: label_mapping[label] if label in label_mapping else label) # xd

same_job_different_phases_fig = make_deadlines_plot(same_job_different_phases_df, "work_type", {"work_type":"Scheduled Jobs"})
pio.write_image(same_job_different_phases_fig, file="eval_same_job_different_phases.pdf")

In [42]:
"""
Effect of increased waiting times?
"""

import plotly.express as px
import plotly.io as pio

"""
Let's first compare the same job across different scheduling approaches,
deducing how much carbon is emitted under each scheduler
"""

same_job_different_deadlines_df = pd.DataFrame([])

index = 0

waiting_times = all_results["waiting_time"].unique()
print(waiting_times)

for category, group_df in all_results.groupby(["ID_new", "startup_length", "startup_power", "scheduling_policy", "work_type", "work_phases", "arrival_time"]):

    group_df["job_index"] = index

    for waiting_time_to_sum in waiting_times:
        total_emissions = group_df[group_df["waiting_time"] == waiting_time_to_sum]["carbon_cost"].sum()

        dummy_dict = group_df[group_df["waiting_time"] == waiting_time_to_sum].iloc[0]

        same_job_different_deadlines_df = pd.concat([
            same_job_different_deadlines_df,
            pd.DataFrame([{**dummy_dict, "carbon_cost": total_emissions}])
        ])

    index += 1

print(same_job_different_deadlines_df.head())

['12' '48' '4' '24']
   ID  arrival_time  length  cpus length_class  resource_class  carbon_cost  \
0   0             0    3600     1          0-2             1.0       11.218   
0   0             0    3600     1          0-2             1.0       11.218   
0   0             0    3600     1          0-2             1.0       19.228   
0   0             0    3600     1          0-2             1.0       11.218   
0   0             0    3600     1          0-2             1.0       11.218   

   dollar_cost  start_time waiting_time  exit_time     reason  \
0       0.0624       28800           12      32400  completed   
0       0.0624       28800           48      32400  completed   
0       0.0624           0            4       3600  completed   
0       0.0624       28800           24      32400  completed   
0       0.0624       28800           12      32400  completed   

  scheduling_policy                      work_type work_phases startup_length  \
0            carbon  constant-fr

In [47]:
def make_deadlines_plot(df: pd.DataFrame):
    fig = px.scatter(
        df, x="job_index", y="carbon_cost", color="waiting_time",
        hover_data=df.columns,
    )

    fig.update_layout({
        "xaxis_title":"Job ID",
        "yaxis_title":"Carbon emissions in kg",
        "xaxis_range": [0, df["job_index"].max()]
    })


    fig.add_vline(x = 287)
    fig.add_annotation(x= 200 / 2, y=35, text="1 hour jobs", showarrow=False)

    fig.add_vline(x = 576)
    fig.add_annotation(x= 500, y=45, text="2 hour jobs", showarrow=False)

    fig.add_annotation(x= 700, y=15, text="3 hour jobs", showarrow=False)

    fig.update_layout(legend ={
        "yanchor":"top",
        "y":0.96, # equal spacing ;)
        "xanchor":"left",
        "x":0.01
    })

    fig.show()
    return fig

same_jobs_different_deadlines_fig = make_deadlines_plot(same_job_different_deadlines_df)
pio.write_image(same_job_different_phases_fig, file="eval_same_job_different_deadlines.pdf")


from plotly.subplots import make_subplots
from datetime import datetime
import pytz
import sys

sys.path.append('../src/')

import carbon

"""
For debugging purposes, plot some schedules, to see whats up
"""

traces_to_plot = ["suspend-resume_constant-from-periodic-phases_0_600_200_4_details", "carbon_constant-from-periodic-phases_0_600_200_4_details"]

carbon_trace = carbon.get_carbon_model("AU-SA", 7000, extra_columns=True)
start_date_in_carbon_trace_as_timestamp = carbon_trace.df.iloc[0]["timestamp"]

fig_carbon = px.scatter(carbon_trace.df, x='datetime', y="carbon_intensity_avg", color="carbon_intensity_avg", color_continuous_scale=px.colors.sequential.speed)

def time_to_dates(seconds_since_simulation_start) -> str:
    adjusted_timestamp = seconds_since_simulation_start + start_date_in_carbon_trace_as_timestamp
    date = datetime.fromtimestamp(adjusted_timestamp, pytz.timezone('UTC'))
    return date 

def plot_trace(filename: str):
    df = read_trace(filename)

    df["start_time_date"] = df["start_time"].apply(time_to_dates)
    df["submission_date"] = df["arrival_time"].apply(time_to_dates)
    # df["deadline"] = (df["arrival_time"] + (int(waiting_time) * 3600)).apply(time_to_dates)
    df["exit_time_date"] = df["exit_time"].apply(time_to_dates)

    min_date_in_trace = time_to_dates(df["start_time"].min())
    max_date_in_trace = time_to_dates(df["exit_time"].max())

    fig_gantt = px.timeline(df, x_start="start_time_date", x_end="exit_time_date", y="ID", hover_data=["start_time", "arrival_time"])

    submission_markers = []

    for row in df.itertuples(index=False):
        submission_markers.append({'type': 'line', 'x0': row.submission_date, 'x1': row.start_time_date, 'y0': row.ID, 'y1': row.ID, 'xref': 'x1', 'yref':'y1', 'line': dict(color="MediumPurple", width=2, dash="dot")})

    fig = make_subplots(rows=2, cols=1, shared_xaxes=True)

    fig.add_trace(fig_gantt.data[0], row=1, col=1)
    fig.add_trace(fig_carbon.data[0], row=2, col=1)

    fig.update_layout(
        title_text = f"schedule of {filename}",
        xaxis=dict(
            type='date',
        ),
        xaxis2=dict(
            type='date'
        ),
        shapes=submission_markers
    )
    fig.update_xaxes(title_text="Date", range=[min_date_in_trace, max_date_in_trace])

    fig.update_yaxes(title_text="Job ID", fixedrange=True, row=1, col=1)
    fig.update_yaxes(title_text="Carbon intensity in gCO₂eq/kWh", fixedrange=True, row=2, col=1)

    yaxis2 = fig.layout.yaxis2

    fig.update_layout({'yaxis': {'range': [-0.5,df['ID'].max() + 1], 'tickmode': 'linear'}})
    fig.update_layout({'yaxis2': {'range': [0,0.5]}})
    fig.show()

for sussy_files in traces_to_plot:
    plot_trace(sussy_files)