In [2]:
%pip install huggingface_hub pandas matplotlib

Note: you may need to restart the kernel to use updated packages.


    qiskit-terra (>=0.22.*)
                  ~~~~~~~^


In [3]:
from huggingface_hub import HfApi
import pandas as pd

# Setup API

In [4]:
api = HfApi()
api_result = api.list_models(cardData=True, full=False, tags="co2_eq_emissions")
models = list(api_result)
print(f"Found {len(models)} models")

Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.


Found 2143 models


## Get models' co2 emissions
Only return the co2 if exists

In [5]:
def get_emissions(model):
    if not model.card_data:
        return None
    if not 'co2_eq_emissions' in model.card_data:
        return None
    return model.card_data['co2_eq_emissions']

In [6]:
# Get model data in a dictionary
# Keys: {'on_cloud', 'source', 'emissions', 'training_type', 'hours_used', 'cpu_model', 'energy_consumed', 'ram_total_size', 'gpu_model', 'hardware_used', 'geographical_location'}
def get_model_data(model):
    emissions = get_emissions(model)
    emissions_is_dict = isinstance(emissions, dict)
    emissions_value = emissions.get('emissions', None) if emissions_is_dict else emissions
    emissions_source = emissions.get('source', None) if emissions_is_dict else None
    hardware = emissions.get('hardware_used', None) if emissions_is_dict else None
    training_type = emissions.get('training_type', 'pretraining') if emissions_is_dict else 'pretraining'
    training_location = emissions.get('geographical_location', None) if emissions_is_dict else None
    on_cloud = emissions.get('on_cloud', None) if emissions_is_dict else None
    hours_used = emissions.get('hours_used', None) if emissions_is_dict else None
    cpu_model = emissions.get('cpu_model', None) if emissions_is_dict else None
    energy_consumed = emissions.get('energy_consumed', None) if emissions_is_dict else None
    ram_total_size = emissions.get('ram_total_size', None) if emissions_is_dict else None
    
    return {
        'modelId': model.modelId,
        'author': model.author,
        'model_type': model.pipeline_tag,
        'created_at': model.created_at,
        'downloads': model.downloads,
        'likes': model.likes,
        'emissions': emissions_value, #kgCO2eq
        'emissions_source': emissions_source,
        'hardware': hardware,
        'training_type': training_type,
        'training_location': training_location,
        'on_cloud': on_cloud,
        'hours_used': hours_used,
        'cpu_model': cpu_model,
        'energy_consumed': energy_consumed, #kWh
        'ram_total_size': ram_total_size #GB
    }

# Get models
Only retrieve models with co2 emissions available. About 80% of the models have this available. 

In [7]:
emission_models = []
for model in models:
    if not model.card_data:
        continue
    if not 'co2_eq_emissions' in model.card_data:
        continue
    # if not isinstance(model.card_data['co2_eq_emissions'], dict):
    #     continue
    emission_models.append(get_model_data(model))

print(f"Van de {len(models)} modellen op HuggingFace hebben er {len(emission_models)} CO2 emissie data ({len(emission_models)/len(models)*100:.2f}%)")

Van de 2143 modellen op HuggingFace hebben er 2143 CO2 emissie data (100.00%)


In [8]:
df = pd.DataFrame(emission_models)
df['training_type'] = df['training_type'].str.replace('pretraining', 'pre-training')

# Inspect data properties

In [9]:
df.sort_values(by='emissions', ascending=False, inplace=True)
df.head()

Unnamed: 0,modelId,author,model_type,created_at,downloads,likes,emissions,emissions_source,hardware,training_type,training_location,on_cloud,hours_used,cpu_model,energy_consumed,ram_total_size
0,bigscience/bloom,,text-generation,2022-05-19 11:53:33+00:00,6158,4733,24700000.0,"Estimating the Carbon Footprint of BLOOM, a 17...",384 A100 80GB GPUs,pre-training,"Orsay, France",,,,,
382,dalle-mini/dalle-mega,,text-to-image,2022-06-28 14:07:04+00:00,75,146,450300.0,MLCo2 Machine Learning Impact calculator,TTPU v3-256,pre-training,East USA,,,,,
2007,eci-io/climategpt-7b-fsg,,text-generation,2023-12-01 17:05:08+00:00,63,4,265800.0,,8x NVIDIA H100 HBM,pre-training,"Washington, USA",,,,,
2006,eci-io/climategpt-7b-fsc,,text-generation,2023-12-01 17:04:50+00:00,52,6,262800.0,,8x NVIDIA H100 HBM,pre-training,"Washington, USA",,,,,
2123,ysn-rfd/PersianMind-v1.0-Q4_K_M-GGUF,,text-generation,2024-08-19 05:30:58+00:00,4,0,232380.0,,,pre-training,,,,,,


In [12]:
df.sort_values(by='emissions', ascending=False, inplace=True)

# models with emissions data
df_emmission = df[df['emissions'].notnull() & df['emissions'] > 0]

print(f"Null emissions: {len(df[df['emissions'].isnull()])}")
print(f"Zero emissions: {len(df[df['emissions'] == 0])}")

# info about model data fields
print(df_emmission.count())


Null emissions: 0
Zero emissions: 14
modelId              1678
author                  0
model_type           1654
created_at           1678
downloads            1678
likes                1678
emissions            1678
emissions_source      166
hardware              157
training_type        1678
training_location      78
on_cloud               86
hours_used             86
cpu_model              86
energy_consumed        42
ram_total_size         86
dtype: int64


# Emission source tools

In [13]:
# remove None values
emission_sources = df_emmission['emissions_source'].dropna()

emission_sources.value_counts()

codecarbon                                                                                                                                                                                                                                                                                                                          88
CodeCarbon                                                                                                                                                                                                                                                                                                                          29
code carbon                                                                                                                                                                                                                                                                                                                         10
https://mlco2.githu

# Get Evaluation Results
[EvalResult](https://huggingface.co/docs/huggingface_hub/v0.25.0.rc0/en/package_reference/cards#huggingface_hub.EvalResult) (HuggingFace Hub API)

In [39]:
def get_eval_details(model):
    if not model.card_data:
        return None
    if not 'eval_results' in model.card_data or model.card_data['eval_results'] is None:
        return None
    
    eval_results = model.card_data['eval_results']
    res = []

    for evaluation in eval_results:
        eval_details = {}
        for key, value in vars(evaluation).items():
            if value is not None:
              eval_details[key] = value
        
        res.append(eval_details)

    return res

In [43]:
models_with_eval = 0
evals = 0
co2_and_evals = 0
for model in models:
  get_eval_details(model)
  if get_eval_details(model):
    models_with_eval += 1
    evals += len(get_eval_details(model))
    if get_emissions(model):
        co2_and_evals += 1

print(f"{models_with_eval}/{len(models)} models have evaluation results and {evals} evaluations in total (avg {evals/models_with_eval:.2f} per model)")
print(f"{co2_and_evals}/{len(models)} models have CO2 emissions and evaluation results.")

107/2124 models have evaluation results and 1659 evaluations in total (avg 15.50 per model)
107/2124 models have CO2 emissions and evaluation results


In [1]:
import requests

In [2]:
tags = requests.get(
  "https://huggingface.co/api/models-tags-by-type",
  params={},
  headers={}
).json()

# get all keys
keys = tags.keys()
keys

dict_keys(['region', 'other', 'library', 'license', 'language', 'dataset', 'pipeline_tag'])