# Search for models on Hugging Face Model Hub

Make sure the dependencies from `./requirements.txt` are installed before running this notebook.

## Initial setup

In [1]:
from huggingface_hub import HfApi
import os

# Hub API client that the later cells reuse for their requests.
api = HfApi()

# Directory for the files written by this notebook; created if it is missing.
data_dir = 'data'
os.makedirs(name=data_dir, exist_ok=True)

## Get models from Hugging Face Model Hub

### Find relevant models

For this task, we should focus on all NLP models with at least one like. Unfortunately, there seems to be no straightforward way to filter for NLP as a whole, so we have to filter for each NLP subcategory separately. I chose to use the same filters that are available on the [Hugging Face models page](https://huggingface.co/models).

![available NLP-subcategories](./assets/nlp_subcategories.png)

Each filter in the UI is associated with a different filter parameter string. I have manually collected all of those by checking how the URL changes when I apply a filter in the UI.

In [2]:
from typing import List
from huggingface_hub.hf_api import ModelInfo
from tqdm.notebook import tqdm

# Filter strings for the NLP subcategories, one entry per category to query.
nlp_category_filters = [
  "text-classification",
  "token-classification",
  "table-question-answering",
  "question-answering",
  "zero-shot-classification",
  "translation",
  "summarization",
  "feature-extraction",
  "text-generation",
  "text2text-generation",
  "fill-mask",
  "sentence-similarity"
]

# For every category, keep the models that have at least one like.
models_found: List[ModelInfo] = []
for cat_filter in tqdm(nlp_category_filters, desc="Category"):
  liked_models: List[ModelInfo] = []
  for candidate in api.list_models(filter=cat_filter, sort="likes"):
    # The scan stops at the first model without likes, which relies on the
    # listing being sorted by likes with the most-liked models first.
    if candidate.likes is None or candidate.likes <= 0:
      break
    liked_models.append(candidate)
  print(f"{len(liked_models)} models found for category {cat_filter}")
  models_found += liked_models

Category:   0%|          | 0/12 [00:00<?, ?it/s]

3457 models found for category text-classification
1530 models found for category token-classification
42 models found for category table-question-answering
711 models found for category question-answering
169 models found for category zero-shot-classification
739 models found for category translation
591 models found for category summarization
2269 models found for category feature-extraction
22820 models found for category text-generation
3885 models found for category text2text-generation
1761 models found for category fill-mask
1085 models found for category sentence-similarity


Note: at this point, we unfortunately don't have all the interesting properties of the models (yet). `list_models()` only returns a small subset of relevant properties. We need to call `model_info()` for each model to get all the data.

In [3]:
# The same model may match more than one category filter, so `models_found` can
# contain an id several times. dict.fromkeys drops the repeats while keeping the
# order of first occurrence.
relevant_model_ids = list(dict.fromkeys(model.id for model in models_found))

# Persist one id per line inside the shared data directory.
model_ids_file_path = os.path.join(data_dir, "relevant_model_ids.txt")
with open(model_ids_file_path, "w", encoding="utf-8") as f:
  f.writelines(f"{model_id}\n" for model_id in relevant_model_ids)

### Download full model metadata, writing each record to disk

In [None]:
import json
import numpy as np
from datetime import datetime, date

def get_current_timestamp_str():
    """Return the current local time as `YYYY-MM-DDTHH:MM:SS.000` (the millisecond part is always written as `000`)."""
    now = datetime.now()
    return now.strftime("%Y-%m-%dT%H:%M:%S.000")


class SafeJSONEncoder(json.JSONEncoder):
    """
    Custom JSON encoder that also supports numpy data types (see https://stackoverflow.com/a/57915246/13727176) and date/datetime objects.

    Conversions applied to values the standard encoder rejects:
      - numpy booleans -> bool
      - numpy integers -> int
      - numpy floats   -> float
      - numpy arrays   -> (nested) list
      - date/datetime  -> ISO 8601 string
    """

    def default(self, o):
        """
        Return a JSON-serializable stand-in for `o`.

        Raises:
            TypeError: raised by the base class when `o` is none of the supported types.
        """
        # np.bool_ is not a subclass of np.integer, so it needs its own branch.
        if isinstance(o, np.bool_):
            return bool(o)
        if isinstance(o, np.integer):
            return int(o)
        if isinstance(o, np.floating):
            return float(o)
        if isinstance(o, np.ndarray):
            return o.tolist()
        # datetime is a subclass of date, so a single check covers both.
        if isinstance(o, date):
            return o.isoformat()
        return super().default(o)

def safe_convert_to_json(data: dict):
    """
    Serialize `data` to a JSON string with SafeJSONEncoder, so that numpy values and
    date/datetime objects inside it are turned into JSON-serializable equivalents.
    """
    encoder = SafeJSONEncoder()
    return encoder.encode(data)


In [None]:
from tqdm.notebook import tqdm
import os
from dataclasses import asdict
from requests.exceptions import HTTPError
from typing import List
from huggingface_hub.hf_api import ExpandModelProperty_T

data_dir = "./data"
# exist_ok=True makes this a no-op when the directory already exists.
os.makedirs(data_dir, exist_ok=True)

models_file_path = os.path.join(data_dir, "models.jsonl")

# Ids already present in models.jsonl, e.g. from an earlier, interrupted run.
already_loaded_ids = set()
if os.path.exists(models_file_path):
    with open(models_file_path, "r") as f:
        # Blank lines are skipped so they cannot break the JSON parsing.
        already_loaded_ids = {json.loads(line)["id"] for line in f if line.strip()}

access_restricted_models_file_path = os.path.join(
    data_dir, "access_restricted_models.txt"
)
# Ids listed in the access-restricted file, one id per line.
access_restricted_model_ids = set()
if os.path.exists(access_restricted_models_file_path):
    with open(access_restricted_models_file_path, "r") as f:
        access_restricted_model_ids = {line.strip() for line in f if line.strip()}

# Only fetch models that are neither stored already nor known to be inaccessible.
ids_to_check = (
    {model.id for model in models_found}
    - already_loaded_ids
    - access_restricted_model_ids
)

# need to specify all additional fields we want to fetch by passing them in the `expand` parameter
# see also: https://huggingface.co/docs/huggingface_hub/v0.24.2/en/package_reference/hf_api#huggingface_hub.hf_api.ModelInfo:~:text=to%20False.-,expand,-(List%5BExpandModelProperty_T
expand_params: List[ExpandModelProperty_T] = [
    "author",
    "cardData",
    "config",
    "createdAt",
    "disabled",
    "downloads",
    "downloadsAllTime",
    "gated",
    "inference",
    "lastModified",
    "library_name",
    "likes",
    "mask_token",
    "model-index",
    "pipeline_tag",
    "private",
    "safetensors",
    "sha",
    "siblings",
    "spaces",
    "tags",
    "transformersInfo",
    "widgetData",
]

# Append mode keeps the records of earlier runs, so an interrupted download can be resumed.
with open(models_file_path, "a") as f:
    for id_to_check in tqdm(ids_to_check):
        # Only the network call sits in the try block, so the handler cannot
        # accidentally catch errors from serializing or writing the record.
        try:
            model = api.model_info(id_to_check, expand=expand_params)
        except HTTPError as e:
            # `e.response` can be None; in that case re-raise the original error
            # instead of failing with an AttributeError.
            status_code = getattr(e.response, "status_code", None)
            if status_code != 401:
                raise
            # HTTP 401: remember the id so that later runs skip this model.
            with open(access_restricted_models_file_path, "a") as err_f:
                err_f.write(id_to_check + "\n")
            continue
        model_dict = asdict(model)
        model_dict["observed_at"] = get_current_timestamp_str()
        f.write(safe_convert_to_json(model_dict) + "\n")
        # Flush per record: resuming relies on what is actually in the file.
        f.flush()

  0%|          | 0/35015 [00:00<?, ?it/s]

Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.


KeyboardInterrupt: 

## Loading `config.json`

For some reason, the `config` property in the downloaded model files doesn't contain the whole contents of the `config.json` file that exists for several models on the Hugging Face Model Hub. Let's look at an example:

In [None]:
# Example model whose `config`, as returned by the Hub API, we want to inspect.
model_id = "j-hartmann/emotion-english-distilroberta-base"
example_model_info = api.model_info(repo_id=model_id, expand=expand_params)
example_model_info.config

{'architectures': ['RobertaForSequenceClassification'],
 'model_type': 'roberta',
 'tokenizer_config': {'unk_token': '<unk>',
  'bos_token': '<s>',
  'eos_token': '</s>',
  'sep_token': '</s>',
  'cls_token': '<s>',
  'pad_token': '<pad>',
  'mask_token': '<mask>'}}

In [None]:
from huggingface_hub import file_exists

# Confirm that the example model repository contains a config.json file.
file_exists(repo_id="j-hartmann/emotion-english-distilroberta-base", filename="config.json")

True

In [None]:
# download the actual config.json file
from  huggingface_hub import hf_hub_download

# JSONL file with one stored config (including its "model_id") per line.
configs_file_path = os.path.join(data_dir, "model_configs.jsonl")
# Text file with one model id per line for models that have no config.json.
model_ids_wo_config_file_path = os.path.join(data_dir, "models_without_config.txt")

# Restore both id sets from disk (when the files exist).
model_ids_with_config = set()
if os.path.exists(configs_file_path):
    with open(configs_file_path, "r") as f:
        model_ids_with_config = {json.loads(line)["model_id"] for line in f}

model_ids_without_config = set()
if os.path.exists(model_ids_wo_config_file_path):
    with open(model_ids_wo_config_file_path, "r") as f:
        model_ids_without_config = {line.strip() for line in f}

def download_config_file(model_id: str):
    """
    Download the `config.json` of `model_id` and append it as one line to the configs JSONL file.

    The stored JSON object is the parsed config plus an extra "model_id" key.
    Models whose config is already stored are skipped. Models without a
    `config.json` are recorded in the "models without config" file instead.

    Raises:
        ValueError: if the downloaded config already contains a "model_id" key.
    """
    # Cheap local check first: no need to contact the Hub for configs we already have.
    if model_id in model_ids_with_config:
        return
    if not file_exists(model_id, "config.json"):
        print(f"Config file for model {model_id} does not exist")
        # Record each id only once so the file does not accumulate duplicate lines.
        if model_id not in model_ids_without_config:
            with open(model_ids_wo_config_file_path, "a") as f:
                f.write(model_id + "\n")
            model_ids_without_config.add(model_id)
        return
    tmp_dir = os.path.join(data_dir, "tmp")
    # hf_hub_download returns the local path of the downloaded file, so it does
    # not have to be rebuilt by hand.
    downloaded_path = hf_hub_download(
        model_id, "config.json", local_dir=tmp_dir, force_download=True
    )
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(downloaded_path, "r", encoding="utf-8") as f:
        config = json.load(f)
    if "model_id" in config:
        raise ValueError("The config file already contains a 'model_id' field")
    config["model_id"] = model_id
    with open(configs_file_path, "a") as f:
        f.write(safe_convert_to_json(config) + "\n")
    model_ids_with_config.add(model_id)

# Try the download with the example model inspected above.
download_config_file(model_id="j-hartmann/emotion-english-distilroberta-base")

If you look into the created file, you will see that it has more properties than the `config` from `api.model_info` printed earlier. Hence, we need to load the `config.json` file for each model to get all the properties.

In [None]:
# Ids of all models whose metadata is stored in the models file so far.
downloaded_model_ids = set()
if os.path.exists(models_file_path):
    with open(models_file_path, "r") as f:
        downloaded_model_ids = {json.loads(line)["id"] for line in f}

# Skip models that were handled before: config stored, or known to have no config.json.
already_handled_ids = model_ids_with_config | model_ids_without_config
model_ids_to_fetch_config_for = downloaded_model_ids - already_handled_ids
for model_id in tqdm(model_ids_to_fetch_config_for):
    download_config_file(model_id)

## Exploring data with DuckDB

### Setup

In [None]:
import duckdb
import pandas as pd
# No need to import duckdb_engine:
# jupysql will auto-detect the driver needed based on the connection string!

# Load the jupysql Jupyter extension, which lets us write SQL cells
%load_ext sql

# Configure jupysql to output query results directly as pandas DataFrames (autopandas)
# and to simplify the output that is printed to the notebook (feedback, displaycon).
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

# Connect to an in-memory DuckDB database
%sql duckdb:///:memory:

In [None]:
# We don't want displayed query results to be truncated, so remove jupysql's display limit.
%config SqlMagic.displaylimit = None

### Look into the data

Running `DESCRIBE` statements requires a workaround (see the [DuckDB known issues page](https://duckdb.org/docs/api/python/known_issues.html#describe-and-summarize-return-empty-tables-in-jupyter)).

In [None]:
%%sql
-- Expose the downloaded models file as a view named `models`.
-- NOTE(review): {{models_file_path}} is presumably substituted by jupysql with the
-- value of the Python variable `models_file_path` -- confirm.
CREATE VIEW IF NOT EXISTS models AS SELECT * FROM '{{models_file_path}}';
-- regular DESCRIBE won't work
-- DESCRIBE models;
-- the following *does* work:
FROM (DESCRIBE models);

Unnamed: 0,column_name,column_type,null,key,default,extra
0,id,VARCHAR,YES,,,
1,author,JSON,YES,,,
2,sha,JSON,YES,,,
3,last_modified,JSON,YES,,,
4,created_at,VARCHAR,YES,,,
5,private,BOOLEAN,YES,,,
6,gated,JSON,YES,,,
7,disabled,JSON,YES,,,
8,downloads,BIGINT,YES,,,
9,downloads_all_time,JSON,YES,,,


In [None]:
cols = %sql FROM (DESCRIBE models);
config_structure_str = cols.loc[cols.column_name == 'config', 'column_type'].values[0]
config_structure_str.split(', ')

['STRUCT(architectures VARCHAR[]',
 'model_type VARCHAR',
 'tokenizer_config STRUCT(unk_token JSON',
 'sep_token JSON',
 'pad_token JSON',
 'cls_token JSON',
 'mask_token JSON',
 'bos_token JSON',
 'eos_token JSON)',
 'adapter_transformers JSON',
 'auto_map JSON',
 'sklearn JSON)']

In [None]:
%%sql
-- List the distinct values that occur in the downloads_all_time column.
SELECT DISTINCT downloads_all_time FROM models;

Unnamed: 0,downloads_all_time
0,
