In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import kruskal
from itertools import product

In [None]:
# Load the two measurement tables (file names suggest CPU and GPU runs).
df1 = pd.read_csv('carbon_cpu_bl.csv')
df2 = pd.read_csv('carbon_gpu_bl.csv')

# Inner-join the two frames on their row index.
# NOTE(review): an index join places the frames side by side and suffixes any
# shared column names with _x/_y, while the cells below read un-suffixed
# columns (e.g. 'emissions', 'running'). Confirm the two files have disjoint
# columns; if they share a schema, stacking rows with pd.concat is probably
# what was intended.
df_merged = df1.merge(df2, left_index=True, right_index=True)

In [None]:
def get_plots(df, col, group='method'):
    """Show a histogram and a boxplot of one column, split by a grouping column.

    Two separate 10x6 figures are drawn and shown one after the other:
    a step histogram with a KDE overlay of ``col`` coloured by ``group``,
    then a boxplot of ``col`` for each level of ``group``.

    Parameters
    ----------
    df : DataFrame holding both ``col`` and ``group``.
    col : name of the column whose distribution is plotted.
    group : name of the grouping column (defaults to ``'method'``).

    Returns
    -------
    None. The figures are displayed via ``plt.show()``.
    """
    # Histogram of `col`, one step outline (plus KDE curve) per group level.
    _, hist_ax = plt.subplots(figsize=(10, 6))
    sns.histplot(data=df, x=col, hue=group, element="step", kde=True, ax=hist_ax)
    hist_ax.set_title('Histogram')
    hist_ax.set_xlabel(col)
    hist_ax.set_ylabel('Frequency')
    plt.show()

    # Boxplot of `col`, one box per group level.
    _, box_ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=df, x=group, y=col, ax=box_ax)
    box_ax.set_title('Boxplot')
    box_ax.set_xlabel(group)
    box_ax.set_ylabel(col)
    plt.show()

In [None]:
# Metric columns to visualise, one histogram/boxplot pair per entry.
cols = [
    'emissions',
    'emissions_rate',
    'cpu_power',
    'gpu_power',
    'cpu_energy',
    'gpu_energy',
    'ram_energy',
    'energy_consumed',
]

In [None]:
# Histogram + boxplot for every metric column, grouped by the default `method`.
for col in cols:
    get_plots(df_merged, col)

# These two plots do not depend on `col`, so they are drawn once, outside the
# loop, instead of once per metric.
# NOTE(review): 'running_method' and 'running' look like categorical factors
# (see the level lists used by the Kruskal-Wallis cells); a histogram with KDE
# and a boxplot of a non-numeric column may fail or be uninformative — confirm
# these two calls are wanted at all.
get_plots(df_merged, 'running_method')
get_plots(df_merged, 'running')
    

In [None]:
def get_scatterplot(df, x, y, group='method'):
    """Show a scatter plot of ``y`` against ``x``, coloured by ``group``.

    Parameters
    ----------
    df : DataFrame holding the ``x``, ``y`` and ``group`` columns.
    x : name of the column plotted on the horizontal axis.
    y : name of the column plotted on the vertical axis.
    group : name of the column used for point colour and as the legend
        title (defaults to ``'method'``).

    Returns
    -------
    None. The figure is displayed via ``plt.show()``.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(data=df, x=x, y=y, hue=group, ax=ax)
    ax.set_title('Scatter plot')
    ax.set_xlabel(x)
    ax.set_ylabel(y)
    # Rebuild the legend so that it carries the grouping column as its title.
    ax.legend(title=group)
    plt.show()

In [None]:
# Emissions against run duration, coloured by the default `method` grouping.
get_scatterplot(df_merged, x='duration', y='emissions')

In [None]:
# Total energy consumed against run duration, coloured by `method`.
get_scatterplot(df_merged, x='duration', y='energy_consumed')

In [None]:
# GPU energy against CPU energy, coloured by `method`.
get_scatterplot(df_merged, x='cpu_energy', y='gpu_energy')

In [None]:
# RAM energy against CPU energy, coloured by `method`.
get_scatterplot(df_merged, x='cpu_energy', y='ram_energy')

In [None]:
# RAM energy against GPU energy, coloured by `method`.
get_scatterplot(df_merged, x='gpu_energy', y='ram_energy')

In [None]:
## ANOVA test
# Fit an additive OLS model of the emissions rate on the four categorical
# factors (main effects only), then summarise it with a type II ANOVA table.
model = ols(
    'emissions_rate ~ C(method) + C(running_method) + C(running) + C(stage)',
    data=df_merged,
).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

print(anova_table)

In [None]:
## ANOVA with blocking factors
def anova_with_blocks(df, test_var, block_vars, response='emissions_rate'):
    """Type II ANOVA of ``response`` on one factor plus blocking factors.

    Builds the additive formula
    ``response ~ C(test_var) + C(block_1) + ... + C(block_k)``,
    fits it with OLS and returns the type II ANOVA table.

    Parameters
    ----------
    df : DataFrame containing the response and all factor columns.
    test_var : name of the factor of interest; listed first in the formula.
    block_vars : iterable of names of the blocking factors (may be empty).
    response : name of the dependent variable. Defaults to
        ``'emissions_rate'``, which reproduces the previous hard-coded
        behaviour for existing three-argument calls.

    Returns
    -------
    The table produced by ``sm.stats.anova_lm(model, typ=2)``.

    Notes
    -----
    The model contains main effects only and the table uses type II sums of
    squares, so the per-factor results should not depend on which factor is
    passed as ``test_var``; only the order of the rows changes.
    """
    # Every factor is wrapped in C(...) so it is treated as categorical.
    terms = [f'C({var})' for var in (test_var, *block_vars)]
    formula = f'{response} ~ ' + ' + '.join(terms)

    model = ols(formula, data=df).fit()
    return sm.stats.anova_lm(model, typ=2)

In [None]:
# ANOVA with `method` as the factor of interest and the rest as blocks.
block_vars = ['running_method', 'stage', 'running']
anova_results = anova_with_blocks(df_merged, test_var='method', block_vars=block_vars)
print(anova_results)

In [None]:
# ANOVA with `running_method` as the factor of interest and the rest as blocks.
block_vars = ['method', 'stage', 'running']
anova_results = anova_with_blocks(df_merged, test_var='running_method', block_vars=block_vars)
print(anova_results)

In [None]:
# ANOVA with `stage` as the factor of interest and the rest as blocks.
block_vars = ['method', 'running_method', 'running']
anova_results = anova_with_blocks(df_merged, test_var='stage', block_vars=block_vars)
print(anova_results)

In [None]:
# ANOVA with `running` as the factor of interest and the rest as blocks.
block_vars = ['method', 'running_method', 'stage']
anova_results = anova_with_blocks(df_merged, test_var='running', block_vars=block_vars)
print(anova_results)

In [None]:
## Kruskal-Wallis test
# Levels of each categorical factor that the tests below compare.
methods = [
    'bloom_filter',
    'disjoint_Ada_BF',
    'learned_bf',
    'Ada_BF',
]
running_methods = [
    'seq',
    'parallel',
]
stages = [
    'training',
    'test',
]
runnings = [
    'cpu',
    'gpu',
]

def kruskal_wallis_test(df, category, groups, response='emissions_rate'):
    """Print a Kruskal-Wallis H-test of ``response`` across levels of ``category``.

    One sample is taken per entry of ``groups``: the ``response`` values of
    the rows where ``df[category]`` equals that entry. The samples are passed
    to ``scipy.stats.kruskal`` and the statistic and p-value are printed with
    three decimals.

    Parameters
    ----------
    df : DataFrame containing ``category`` and ``response``.
    category : name of the column that defines the groups.
    groups : iterable of the ``category`` levels to compare.
    response : name of the column being compared. Defaults to
        ``'emissions_rate'``, which reproduces the previous hard-coded
        behaviour for existing three-argument calls.

    Returns
    -------
    None. The result is printed.
    """
    # A single .loc selection per level avoids chained indexing.
    samples = [df.loc[df[category] == level, response] for level in groups]
    stat, p = kruskal(*samples)
    print(f"Kruskal-Wallis test for {category}: H-statistic = {stat:.3f}, p-value = {p:.3f}")

# Run the unblocked Kruskal-Wallis test once per factor, using its level list.
kruskal_wallis_test(df_merged, category='method', groups=methods)
kruskal_wallis_test(df_merged, category='running_method', groups=running_methods)
kruskal_wallis_test(df_merged, category='stage', groups=stages)
kruskal_wallis_test(df_merged, category='running', groups=runnings)

In [None]:
def get_kruskal_test(blocking_vars, test_var, test_categories, df=None, response='emissions_rate'):
    """Print Kruskal-Wallis tests of ``test_var`` within each block of rows.

    For every combination of levels of the blocking variables, the rows that
    match the combination are selected and the ``response`` values are split
    by level of ``test_var``. When the block has more than one row and every
    level has more than one observation, ``scipy.stats.kruskal`` is run and
    the statistic and p-value are printed. Blocks that do not meet these
    conditions are skipped without any output.

    Parameters
    ----------
    blocking_vars : dict mapping each blocking column name to the list of
        its levels; the Cartesian product of these lists defines the blocks.
    test_var : name of the column whose levels are compared.
    test_categories : iterable of the ``test_var`` levels to compare.
    df : DataFrame to analyse. When omitted, the notebook-level
        ``df_merged`` is used, so existing three-argument calls keep working.
    response : name of the column being compared (default
        ``'emissions_rate'``).

    Returns
    -------
    None. Results are printed.
    """
    if df is None:
        # Default to the frame that every other cell of the notebook analyses.
        df = df_merged

    block_names = list(blocking_vars)
    for values in product(*blocking_vars.values()):
        # Combine all block conditions into one mask over the full frame, so
        # the frame is indexed once with a mask that shares its index.
        mask = pd.Series(True, index=df.index)
        for var, value in zip(block_names, values):
            mask = mask & (df[var] == value)
        filtered_df = df[mask]
        if len(filtered_df) <= 1:
            continue

        groups = [filtered_df.loc[filtered_df[test_var] == cat, response] for cat in test_categories]
        if all(len(group) > 1 for group in groups):  # Ensure enough data in each group
            stat, p = kruskal(*groups)
            print(f'Kruskal-Wallis Test for {test_var} with blocking variables {values}:')
            print(f'H-statistic = {stat:.3f}, p-value = {p:.3f}\n')

# Factors to test, in order, each paired with its list of levels.
test_vars = ['method', 'running_method', 'stage', 'running']
test_categories_list = [methods, running_methods, stages, runnings]

# For each tested factor, the blocking dictionary holds every *other* factor
# mapped to its levels (same key order as `test_vars`).
blocking_vars_list = [
    {
        var: levels
        for var, levels in zip(test_vars, test_categories_list)
        if var != held_out
    }
    for held_out in test_vars
]

# Run the blocked Kruskal-Wallis tests factor by factor: each factor is paired
# with its own level list and the dictionary of the remaining factors.
for blocking_vars, test_var, test_categories in zip(
    blocking_vars_list,
    test_vars,
    test_categories_list,
):
    get_kruskal_test(blocking_vars, test_var, test_categories)