In [None]:
import os
from glob import glob

import pandas as pd
# Emission factor in gCO2/kWh; the *_energy columns are multiplied by it to
# obtain the *_emissions columns.
# Source: Icha, P., Lauf, T., & Kuhs, G. (2022). Entwicklung der spezifischen
# Treibhausgas-Emissionen des deutschen Strommix in den Jahren 1990—2021
# (development of the specific greenhouse-gas emissions of the German
# electricity mix, 1990—2021). Umweltbundesamt.
# https://www.umweltbundesamt.de/publikationen/entwicklung-der-spezifischen-kohlendioxid-8
emission_rate = 485


In [None]:
# --- Active-learning runs: merge every Results*.csv and derive emissions ---
path = 'PATH/Results*.csv'
df_list = [pd.read_csv(f) for f in glob(path)]
df = (
    pd.concat(df_list)
    .assign(
        # Strategy names use underscores in the raw files; show them with spaces.
        Query=lambda d: d.Query.apply(lambda x: x.replace('_',' ')),
        # Energy columns times the module-level emission_rate.
        Train_emissions=lambda d: d.Train_energy * emission_rate,
        Query_emissions=lambda d: d.Query_energy * emission_rate,
    )
    .assign(**{'Emissions (gr)': lambda d: d.Train_emissions + d.Query_emissions})
    # Keep only rows whose Percent is at most 10.5.
    .loc[lambda d: d.Percent <= 10.5]
)
df.to_csv('PATH/ActiveVisuals.csv', index = False)

# --- Baseline runs: merge every Baseline*.csv unchanged ---
path = 'PATH/Baseline*.csv'
df_list = [pd.read_csv(f) for f in glob(path)]
df = pd.concat(df_list)
df.to_csv('PATH/ActiveVisualsBaseline.csv', index = False)

In [5]:
def get_data_size(path:str = 'PATH/data/'):
    """Count the rows of every CSV file in a directory.

    Args:
        path: Directory prefix (including the trailing separator); the glob
            pattern ``*.csv`` is appended to it.

    Returns:
        A DataFrame with one row per file and the columns ``Data`` (file name
        up to the first dot) and ``Size`` (number of rows read by
        ``pd.read_csv``). When no file matches, the frame is empty but still
        has both columns, so callers can index ``Data``/``Size`` safely.
    """
    path += '*.csv'
    records = [
        dict(
            # basename copes with whatever separator glob returned on this OS.
            Data = os.path.basename(file).split('.')[0],
            Size = len(pd.read_csv(file)),
        )
        for file in glob(path)
    ]
    # Explicit columns keep the schema stable even for an empty record list.
    return pd.DataFrame(records, columns=['Data', 'Size'])

def add_baseline(dataset, metric, path = 'PATH/ActiveLearner/', emission_rate = 485):
    """Look up one baseline value for a dataset.

    All ``Baseline*.csv`` files under ``path`` are concatenated and a
    ``Baseline_Emissions`` column (``Train_energy * emission_rate``) is added
    before the lookup.

    Args:
        dataset: Value to match against the ``Data`` column.
        metric: Name of the column to read (e.g. ``'F1'`` or
            ``'Baseline_Emissions'``).
        path: Directory prefix (including the trailing separator) that holds
            the baseline CSV files.
        emission_rate: Factor applied to ``Train_energy``; defaults to 485.

    Returns:
        The metric value of the first row whose ``Data`` equals ``dataset``,
        or ``None`` when the metric column does not exist or no row matches.

    Raises:
        ValueError: If no baseline file matches the pattern.
    """
    path += 'Baseline*.csv'
    df_list = [pd.read_csv(f) for f in glob(path)]
    if not df_list:
        # pd.concat would fail with an opaque "No objects to concatenate".
        raise ValueError(f"No baseline files match {path!r}")
    df = pd.concat(df_list)
    df['Baseline_Emissions'] = df['Train_energy'] * emission_rate
    if metric not in df.columns:
        return None
    matches = df.loc[df.Data == dataset, metric]
    if matches.empty:
        return None
    # Positional access: the match's index label is its row number inside its
    # source file, so a label lookup for 0 misses any row that is not first.
    return matches.iloc[0]
    
def get_stats(path = 'PATH/'):
    """Return the best-F1 row of every (Data, Query) group with cumulative emissions.

    All ``Results*.csv`` files under ``path`` are concatenated. Underscores in
    ``Query`` are replaced by spaces, the energy columns are multiplied by the
    module-level ``emission_rate``, and rows with ``Percent`` above 10 are
    dropped. Within each (Data, Query) group the three emission columns are
    replaced by their running sums, and the row with the highest ``F1`` is kept.

    Args:
        path: Directory prefix (including the trailing separator) that holds
            the result CSV files.

    Returns:
        A DataFrame with one row per (Data, Query) group, restricted to the
        reporting columns selected at the end of the function.

    Raises:
        ValueError: If no result file matches the pattern.
    """
    path += 'Results*.csv'
    df_list = [pd.read_csv(f) for f in glob(path)]
    if not df_list:
        # pd.concat would fail with an opaque "No objects to concatenate".
        raise ValueError(f"No result files match {path!r}")
    df = pd.concat(df_list)
    df.Query = df.Query.apply(lambda x: x.replace('_',' '))
    df['Train_emissions'] = df.Train_energy * emission_rate
    df['Query_emissions'] = df.Query_energy * emission_rate
    df['Emissions (gr)'] = (df.Train_emissions + df.Query_emissions)
    df = df[df.Percent <= 10]
    cumulative_cols = ['Train_emissions', 'Query_emissions', 'Emissions (gr)']
    row_list = list()
    for _, group in df.groupby(['Data', 'Query']):
        # Work on a private copy so the running sums never write into a slice.
        group = group.copy()
        for col in cumulative_cols:
            group[col] = group[col].cumsum()
        # idxmax returns an index *label*, while iloc expects a *position*.
        # Labels are row numbers inside the source files and need not match the
        # position within the group, so take idxmax on a 0..n-1 index instead.
        best_pos = group['F1'].reset_index(drop=True).idxmax()
        row_list.append(group.iloc[best_pos])
    stats = pd.DataFrame(row_list)[['F1', 'Query', 'Data','Round',
        'Trainset_absolute', 'Query_absolute', 'Test_absolute', 'Percent',
        'Train_duration',
        'Query_duration',  'Train_emissions', 'Query_emissions',
        'Emissions (gr)']]
    return stats

def get_complete_stats(path = 'PATH/'):
    """Extend the per-run statistics with the matching baseline values.

    The frame returned by ``get_stats(path)`` gets its ``Data`` names
    normalised through ``helper`` and two extra columns looked up per row via
    ``add_baseline`` (which is called with its own default path):
    ``Baseline_Emissions`` and ``Baseline_F1``.
    """
    stats = get_stats(path = path)
    stats.Data = stats.Data.apply(helper)
    # Target column -> metric requested from add_baseline.
    baseline_columns = {
        'Baseline_Emissions': 'Baseline_Emissions',
        'Baseline_F1': 'F1',
    }
    for target, metric in baseline_columns.items():
        stats[target] = stats.Data.apply(
            lambda dataset, metric=metric: add_baseline(dataset, metric)
        )
    return stats

def join_random(stats):
    """Pair each dataset's best non-random strategy with its random-sampling row.

    Rows whose ``Query`` is 'Random Sampling' are reduced to ``Data`` plus five
    metric columns, which are renamed with a ``Random_`` prefix. Among the
    remaining rows, the one with the highest ``F1`` is kept per ``Data`` value
    and the random-sampling columns are joined onto it by ``Data``.
    """
    prefixed = {
        col: f"Random_{col}"
        for col in ['F1', 'Percent','Train_emissions', 'Query_emissions', 'Emissions (gr)']
    }
    random_part = (
        stats[stats.Query=='Random Sampling']
        .rename(columns=prefixed)
        .loc[:, ['Data', *prefixed.values()]]
    )

    non_random = stats[stats.Query!='Random Sampling']
    # idxmax is taken on a 0..n-1 index so that it can be used positionally.
    winners = [
        rows.iloc[rows.F1.reset_index(drop=True).idxmax()]
        for _, rows in non_random.groupby('Data')
    ]
    best_active = pd.DataFrame(winners).set_index('Data')
    return best_active.join(random_part.set_index('Data')).reset_index()

def helper(x):
    """Map a raw dataset name to its display name; unknown names pass through."""
    renames = (
        (('10k_newsarticles', '10k_Newsarticles'), 'NewsTopic'),
        (('GermevalFactClaiming',), 'ClaimDetection'),
        (('COLA',), 'Cola'),
    )
    for raw_names, display_name in renames:
        if x in raw_names:
            return display_name
    return x

def reformat(stats):
    """Derive the comparison columns and return the rounded reporting table.

    Note: the five derived columns are added to the passed-in ``stats`` frame
    itself; the returned frame is a new, column-restricted copy rounded to two
    decimals, with a ``Size`` column joined in from ``get_data_size()``.
    """
    # F1 gaps to the baseline and to random sampling, scaled by 100.
    for new_col, reference in (('Max Difference', 'Baseline_F1'),
                               ('Random Difference', 'Random_F1')):
        stats[new_col] = (stats['F1'] - stats[reference])*100
    stats['Percent Difference'] = (stats['Percent'] - stats['Random_Percent'])
    # Emissions relative to the baseline emissions.
    for new_col, numerator in (('Max Training Factor', 'Train_emissions'),
                               ('Max Query Factor', 'Query_emissions')):
        stats[new_col] = stats[numerator]/stats['Baseline_Emissions']

    sizes = get_data_size()
    sizes.Data = sizes.Data.apply(helper)
    merged = stats.set_index('Data').join(sizes.set_index('Data')).reset_index()

    report_columns = [
        'Data', 'Query', 'Percent', 'Percent Difference', 'Max Difference',
        'Random Difference', 'Max Training Factor', 'Max Query Factor', 'Size',
    ]
    return merged[report_columns].round(2)

# Output directory for the exported tables.
# NOTE(review): get_complete_stats() below runs with its default path, not
# with this `path` — confirm that is intended.
path = '/home/sami/READER_REPO/Stats/Data/ActiveLearner/Paper/'

# Full statistics: best strategy per dataset next to random sampling.
stats = join_random(get_complete_stats())
stats.to_csv(f"{path}full_stats.csv", index=False)

# Condensed, rounded table used for plotting.
stats = reformat(stats)
stats.to_csv(f"{path}for_plotting.csv", index=False)
stats


Unnamed: 0,Data,F1,Query,Round,Trainset_absolute,Query_absolute,Test_absolute,Percent,Train_duration,Query_duration,Train_emissions,Query_emissions,Emissions (gr),Baseline_Emissions,Baseline_F1,Random_F1,Random_Percent,Random_Train_emissions,Random_Query_emissions,Random_Emissions (gr)
0,Claimbuster,0.8,Breaking Ties,16,915,10616,7060,7.94,78.382991,97.877057,15.091857,34.672649,49.764506,72.915807,0.84,0.77,9.24,20.310407,0.000173,20.31058
1,NewsTopic,0.63,Breaking Ties,9,500,4534,3081,9.93,43.881368,43.579526,4.859323,9.218203,14.077525,32.710314,0.89,0.7,9.93,4.863947,7.8e-05,4.864024


In [None]:
def parameter_tweaking(params:list):
    """Collect the joined statistics of several parameter settings.

    For every entry of ``params`` the results under ``PATH/<param>/`` are run
    through ``get_complete_stats`` and ``join_random``; the setting name is
    stored in a ``Parameter`` column and all frames are concatenated.
    """
    per_setting = [
        join_random(get_complete_stats(f"PATH/{setting}/")).assign(Parameter=setting)
        for setting in params
    ]
    return pd.concat(per_setting)
# Parameter settings to compare; each one names a sub-directory of results.
params = ['Step50', 'Step100', 'Step250', 'Pool50', 'Pool70']

df = parameter_tweaking(params)
# Restrict the export to three datasets.
df = df.loc[df.Data.isin(['AG_News', 'Claimbuster', 'NewsTopic'])]
df.to_csv('PATH/ParamTweaking.csv', index = False)

In [None]:
# Dataset overview: row count, number of distinct labels and label shares.
path = 'PATH/*.csv'
dict_list = list()
for file in glob(path):
    # basename copes with whatever separator glob returned on this OS.
    name = os.path.basename(file).split('.')[0]
    df = pd.read_csv(file)
    size = len(df)
    labels = df.label.unique().shape[0]
    # Share of each label value relative to the total number of rows.
    balance = list(df.label.value_counts()/len(df))
    dict_list.append(dict(
        name = name,
        size = size,
        label = labels,
        balance = balance
    ))
# Explicit columns keep sort_values working when no file matched the pattern.
df = pd.DataFrame(dict_list, columns=['name', 'size', 'label', 'balance'])
df.sort_values(['size', 'label'])
    