# Computing the Carbon Footprint of a Kaggle Competition

In [1]:
import pandas as pd

In [2]:
PATH = "results/"

# Kaggle Competition: Team 2

## Pipeline1: Pretraining

In [3]:
# This is for 1 fold, out of 5
# This will be used to find OUR carbon footprint
pretrain_1_file = PATH+"siim_pretrain_results_1epoch.csv"
ptrain_1 = pd.read_csv(pretrain_1_file)

ptrain_1

Unnamed: 0,training_file,time,energy(kWh),CO2eq(g),distance(km)
0,cait_pretraining,8:00:33,2.80001,299.601022,2.786986
1,f1_pretraining,11:55:04,4.303103,218.651446,2.033967
2,f3_pretraining,22:14:23,8.025735,229.189462,2.131995
3,l1_pretraining,11:25:01,2.749729,167.122448,1.554627
4,l1b_pretraining,11:06:12,2.648516,69.744261,0.648784
5,l2_pretraining,15:49:29,3.752667,113.056541,1.051689
6,n_cf2_pretraining,10:59:26,2.629704,71.061763,0.66104


In [5]:
# This is for all 5 folds, the estimated cost of running the full code
pretrain_full_file = PATH+"siim_pretrain_results_full.csv"
ptrain_full = pd.read_csv(pretrain_full_file)

ptrain_full

Unnamed: 0,training_file,time,energy(kWh),CO2eq(g),distance(km)
0,cait_pretraining,40:02:47,14.000048,1126.210028,10.476372
1,f1_pretraining,59:35:21,21.515517,1024.948272,9.534403
2,f3_pretraining,111:11:54,40.128676,1194.29867,11.109755
3,l1_pretraining,57:05:03,13.748647,631.333879,5.872873
4,l1b_pretraining,55:31:01,13.242581,334.029285,3.107249
5,l2_pretraining,79:07:25,18.763335,516.33736,4.803138
6,n_cf2_pretraining,54:57:12,13.148518,431.759478,4.016367


## Pipeline1: Training

In [7]:
# This is for 1 fold, out of 5
# This will be used to find OUR carbon footprint
train_1_file = PATH+"siim_train_results_1epoch.csv"
train_1 = pd.read_csv(train_1_file)

train_1

Unnamed: 0,training_file,time,energy(kWh),CO2eq(g),distance(km)
0,n_cf11,1:38:42,0.194869,9.520723,0.088565
1,n_cf11_1,7:33:10,0.578794,25.746986,0.239507
2,n_cf11_9,1:24:16,0.189512,10.865354,0.101073
3,n_cf11_10,1:50:25,0.231844,8.607216,0.080067
4,n_cf11_6,1:59:30,0.53086,19.907256,0.185184
5,n_cf11_7,1:46:22,0.285051,11.972152,0.111369
6,n_cf11_rot1,0:52:31,0.170745,6.787123,0.063136


In [8]:
# This is for all 5 folds, the estimated cost of running the full code
train_full_file = PATH+"siim_train_results_full.csv"
train_full = pd.read_csv(train_full_file)

train_full

Unnamed: 0,training_file,time,energy(kWh),CO2eq(g),distance(km)
0,n_cf11,8:13:32,0.974343,42.493534,0.395289
1,n_cf11_1,37:45:51,2.893969,168.435306,1.56684
2,n_cf11_9,7:01:21,0.94756,50.139455,0.466414
3,n_cf11_10,9:12:03,1.159221,34.338704,0.31943
4,n_cf11_6,9:57:29,2.654301,107.277989,0.997935
5,n_cf11_7,8:51:50,1.425256,58.111582,0.540573
6,n_cf11_rot1,4:22:35,0.853726,33.062484,0.307558


## Calculating Our Carbon Footprint

In [10]:
def calc_footprint(ptrain_df, train_df, ptrain_runs=1, train_runs=1):
    """
    Function to make a dataframe containing the carbon footprint
    of each state in the training.
    ------------------------------
    Parameters
    ptrain_df: the pretraining dataframe
    train_df: the training dataframe
    ptrain_runs: the number of times we ran pretraining
    train_runs: the number of times we ran training
    """
    # Make new df
    siim_fp = pd.DataFrame(columns=["stage","CO2eq(g)","distance(km)"])
    
    # Compute Sums
    ptrain_co2sum = ptrain_df["CO2eq(g)"].sum() * ptrain_runs       # MULTIPLY BY NUMBER OF TIMES RAN
    ptrain_dist_sum = ptrain_df["distance(km)"].sum() * ptrain_runs # MULTIPLY BY NUMBER OF TIMES RAN

    train_co2sum = train_df["CO2eq(g)"].sum() * train_runs        # MULTIPLY BY NUMBER OF TIMES RAN
    train_dist_sum = train_df["distance(km)"].sum() * train_runs  # MULTIPLY BY NUMBER OF TIMES RAN

    # Make new rows
    row1 = pd.DataFrame({"stage":"ptrain", "CO2eq(g)":ptrain_co2sum, "distance(km)":ptrain_dist_sum}, index=[0])
    row2 = pd.DataFrame({"stage":"train", "CO2eq(g)":train_co2sum, "distance(km)":train_dist_sum}, index=[1])

    # Add to df
    siim_fp = pd.concat([row1, siim_fp.loc[:]]).reset_index(drop=True)
    siim_fp = pd.concat([row2, siim_fp.loc[:]])

    return siim_fp

In [11]:
# Carbon footprint of 1 epoch 
# NEED TO ADD IN THE CORRECT NUMBER OF TIMES WE RAN EACH STAGE
calc_footprint(ptrain_1, train_1, 1, 1)

Unnamed: 0,stage,CO2eq(g),distance(km)
1,train,93.40681,0.868901
0,ptrain,1168.426943,10.869088


# Calculating Team 2's Carbon Footprint

In [23]:
# Carbon footprint of the full training, 
team2_totals = calc_footprint(ptrain_full, train_full, 1, 1)
team2_totals

Unnamed: 0,stage,CO2eq(g),distance(km)
1,train,493.859054,4.594039
0,ptrain,5258.916972,48.920157


In [24]:
print(f"Team 2's CO2 total usage (not including pipeline2): {round(team2_totals['CO2eq(g)'].sum(),2)}g")
print(f"Team 2's distance total (not including pipeline2): {round(team2_totals['distance(km)'].sum(),2)}km")

Team 2's CO2 total usage (not including pipeline2): 5752.78g
Team 2's distance total (not including pipeline2): 53.51km


# RevisitingTransfer

## Base: ImageNet Freeze=False
These results are for the number of epochs in the column "epochs"<br>
The column "class_epochs" is the number of epochs during training the classification layer<br>
We calculate the cost per single epoch then compute the estimated cost for all of the training (including training of the classification layer)

In [62]:
# Load results file
rt_frz_false_file = PATH+"rt_imagenet_frzfalse_results.csv"
rt_frz_false = pd.read_csv(rt_frz_false_file)

# Add string to dataset name
rt_frz_false["dataset"] = rt_frz_false["dataset"].astype(str) + "_frz_false"

# Calculate total epochs ran
rt_frz_false["total_epochs"] = rt_frz_false["epochs"] + rt_frz_false["class_epochs"]

# Get CO2eq and distance per single epoch
rt_frz_false["CO2eq(g)/e"] = rt_frz_false["CO2eq(g)"] / rt_frz_false["epochs"]
rt_frz_false["distance(km)/e"] = rt_frz_false["distance(km)"] / rt_frz_false["epochs"]

# Get Total CO2eq and distance per model
rt_frz_false["total_CO2eq(g)"] = rt_frz_false["CO2eq(g)/e"] * rt_frz_false["total_epochs"]
rt_frz_false["total_distance(km)"] = rt_frz_false["distance(km)/e"] * rt_frz_false["total_epochs"]

# Round values
rt_frz_false[["total_CO2eq(g)","total_distance(km)"]] = rt_frz_false[["total_CO2eq(g)","total_distance(km)"]].round(3)

## Base: ImageNet Freeze=True
These results are for the number of epochs in the column "epochs"<br>
The column "class_epochs" is the number of epochs during training the classification layer<br>
We calculate the cost per single epoch then compute the estimated cost for all of the training (including training of the classification layer)

In [55]:
# Load results file
rt_frz_true_file = PATH+"rt_imagenet_frztrue_results.csv"
rt_frz_true = pd.read_csv(rt_frz_true_file)

# calculate total epochs
rt_frz_true["total_epochs"] = rt_frz_true["epochs"] + rt_frz_true["class_epochs"]

# Add string to dataset name
rt_frz_true["dataset"] = rt_frz_true["dataset"].astype(str) + "_frz_true"

# Calculate total epochs ran
rt_frz_true["total_epochs"] = rt_frz_true["epochs"] + rt_frz_true["class_epochs"]

# Get CO2eq and distance per single epoch
rt_frz_true["CO2eq(g)/e"] = rt_frz_true["CO2eq(g)"] / rt_frz_true["epochs"]
rt_frz_true["distance(km)/e"] = rt_frz_true["distance(km)"] / rt_frz_true["epochs"]

# Get Total CO2eq and distance per model
rt_frz_true["total_CO2eq(g)"] = rt_frz_true["CO2eq(g)/e"] * rt_frz_true["total_epochs"]
rt_frz_true["total_distance(km)"] = rt_frz_true["distance(km)/e"] * rt_frz_true["total_epochs"]

# Round values
rt_frz_true[["total_CO2eq(g)","total_distance(km)"]] = rt_frz_true[["total_CO2eq(g)","total_distance(km)"]].round(3)

In [56]:
# Concat both revisiting transfer datasets (freeze=true and freeze=false)
revisit_transfer = pd.concat([rt_frz_true, rt_frz_false])

# Show just the total columns
revisit_transfer[["dataset","total_CO2eq(g)","total_distance(km)"]]

Unnamed: 0,dataset,total_CO2eq(g),total_distance(km)
0,isic_frz_true,7.601,0.071
1,breast_frz_true,5.054,0.047
2,chest_frz_true,13.117,0.122
3,knee_frz_true,4.784,0.045
4,thyroid_frz_true,1.583,0.015
0,isic_frz_false,120.219,1.118
1,breast_frz_false,12.572,0.117
2,chest_frz_false,36.428,0.339
3,knee_frz_false,18.384,0.171
4,thyroid_frz_false,1.749,0.016


In [58]:
min_train_cost = revisit_transfer['total_CO2eq(g)'].min()
print(f"The minimum carbon footprint of training a single model is {min_train_cost}g CO2 emissions")

The minimum carbon footprint of training a single model is 1.583g CO2 emissions


# Average Cost to Train a Single Model

In [18]:
# CO2 cost

# models trained in RT with freeze=False + models trained in RT with freeze=False + the single Kaggle project
# the kaggle project is actually training 14 models in pipeline1, that we had tracked
total_models_trained = len(rt_frz_false) + len(rt_frz_true) + 1
CO2_cost_per_model = (rt_frz_true["total_CO2eq(g)"].sum() + rt_frz_false["total_CO2eq(g)"].sum() + team2_totals["CO2eq(g)"].sum()) / total_models_trained
print(f"The average cost of training a single model is {round(CO2_cost_per_model,2)}g CO2eq")

The average cost of training a single model is 543.12g CO2eq


In [19]:
# distance cost
dist_cost_per_model = (rt_frz_true["total_distance(km)"].sum() + rt_frz_false["total_distance(km)"].sum() + team2_totals["distance(km)"].sum()) / total_models_trained
print(f"The average distance cost of training a single model is {round(dist_cost_per_model,2)}km")

The average distance cost of training a single model is 5.05km


# Carbon Footprint of the Whole Competition

Total number of entries from Veronika's supplementary [information](https://static-content.springer.com/esm/art%3A10.1038%2Fs41746-022-00592-y/MediaObjects/41746_2022_592_MOESM1_ESM.pdf).

In [60]:
# Total entries
total_entries = 32307

# Total teams with more than 2 submissions
total_entries_more_than_2 = 31751

In [61]:
# Lower bound = lowest model training cost * total entries (excluding teams who submitted 2 or less times)
lower_bound = total_entries_more_than_2 * min_train_cost
lower_bound

50261.833

In [21]:
whole_comp_cost = total_entries * CO2_cost_per_model

print(f"The estimated carbon footprint of the whole competition is {round(whole_comp_cost,2)}g CO2eq")
print(f"Which is equivalent to {round(275787.835915,2)}lbs of CO2")

Which is equivalent to 275787.84lbs of CO2


In [22]:
whole_comp_dist = total_entries * dist_cost_per_model
print(f"The estimated carbon footprint of the whole competition in distance driven {round(whole_comp_dist,2)} km")

The estimated carbon footprint of the whole competition in distance driven 163224.35 km


Amount of teams with submissions over 2: 