In [1]:
! pip install huggingface_hub



# All imports necessary

In [2]:
import pandas as pd

In [3]:
import matplotlib.pyplot as plt

In [4]:
from datetime import datetime

In [5]:
from sqlalchemy import create_engine

In [6]:
import io
import json

In [7]:
from huggingface_hub import HfApi

In [8]:
from huggingface_hub import ModelSearchArguments, DatasetSearchArguments

In [9]:
from prefect.client import Secret

In [10]:
import psycopg2

In [11]:
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

# Add some configuration

In [12]:
dir(pd.options.display)

['chop_threshold',
 'colheader_justify',
 'date_dayfirst',
 'date_yearfirst',
 'encoding',
 'expand_frame_repr',
 'float_format',
 'html',
 'large_repr',
 'max_categories',
 'max_columns',
 'max_colwidth',
 'max_dir_items',
 'max_info_columns',
 'max_info_rows',
 'max_rows',
 'max_seq_items',
 'memory_usage',
 'min_rows',
 'multi_sparse',
 'notebook_repr_html',
 'pprint_nest_depth',
 'precision',
 'show_dimensions',
 'unicode',
 'width']

In [13]:
# Show up to 200 rows and never truncate the contents of a cell when a frame is displayed.
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_colwidth", None)

In [14]:
# Number of rows requested by the `.head(n)` preview cells below.
n = 20

# Get data from Hugging Face

In [15]:
api = HfApi()

In [15]:
# Materialise the complete model and dataset listings returned by the Hub API client.
hugging_face_models = [*api.list_models()]
hugging_face_datasets = [*api.list_datasets()]

In [16]:
print(len(hugging_face_models))
print(len(hugging_face_datasets))

216433
38047


In [17]:
# Replace every model object by the plain dict of its attributes so the list can be JSON-serialised.
# Entries that are already dicts are kept untouched, which makes this cell safe to re-run
# (a dict has no `__dict__`, so the unconditional conversion failed on the second execution).
hugging_face_models = [
    model if isinstance(model, dict) else vars(model)
    for model in hugging_face_models
]

In [18]:
# Replace every dataset object by the plain dict of its attributes so the list can be JSON-serialised.
# Entries that are already dicts are kept untouched, which makes this cell safe to re-run
# (a dict has no `__dict__`, so the unconditional conversion failed on the second execution).
hugging_face_datasets = [
    dataset if isinstance(dataset, dict) else vars(dataset)
    for dataset in hugging_face_datasets
]

In [19]:
# Serialise both record lists to JSON text; the data frames in the following sections are parsed from these strings.
hugging_face_models_json = json.dumps(hugging_face_models)
hugging_face_datasets_json = json.dumps(hugging_face_datasets)

# Hugging Face Models

In [20]:
# Parse the serialised model records into a frame.
# Passing a literal JSON string to read_json is deprecated in pandas, so hand it a file-like buffer instead.
models_slice_df = pd.read_json(io.StringIO(hugging_face_models_json))

In [21]:
models_slice_df.sort_values(["likes", "downloads"], ascending=False).head(n)

Unnamed: 0,modelId,sha,lastModified,tags,pipeline_tag,siblings,private,author,config,securityStatus,_id,id,likes,downloads
77446,runwayml/stable-diffusion-v1-5,,,"[diffusers, arxiv:2207.12598, arxiv:2112.10752, arxiv:2103.00020, arxiv:2205.11487, arxiv:1910.09700, stable-diffusion, stable-diffusion-diffusers, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,,,,63508a7bfc22005c1aae0266,runwayml/stable-diffusion-v1-5,7948,2453705
63900,CompVis/stable-diffusion-v1-4,,,"[diffusers, arxiv:2207.12598, arxiv:2112.10752, arxiv:2103.00020, arxiv:2205.11487, arxiv:1910.09700, stable-diffusion, stable-diffusion-diffusers, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,,,,6300e0f58a4db6e9052953a8,CompVis/stable-diffusion-v1-4,5560,980801
43855,bigscience/bloom,,,"[pytorch, tensorboard, safetensors, bloom, text-generation, ak, ar, as, bm, bn, ca, code, en, es, eu, fon, fr, gu, hi, id, ig, ki, kn, lg, ln, ml, mr, ne, nso, ny, or, pa, pt, rn, rw, sn, st, sw, ta, te, tn, ts, tum, tw, ur, vi, wo, xh, yo, zh, zu, arxiv:2211.05100, arxiv:1909.08053, arxiv:2110.02861, arxiv:2108.12409, doi:10.57967/hf/0003, transformers, license:bigscience-bloom-rail-1.0, model-index, co2_eq_emissions, has_space]",text-generation,[],False,,,,62862fbd504d37700308a82e,bigscience/bloom,3592,24011
92772,WarriorMama777/OrangeMixs,,,"[diffusers, dataset:Nerfgun3/bad_prompt, stable-diffusion, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,,,,638cac3a61eb5101751a23c4,WarriorMama777/OrangeMixs,3028,6839
127789,lllyasviel/ControlNet,,,"[license:openrail, has_space]",,[],False,,,,63e3ef298de575a15a63c2b1,lllyasviel/ControlNet,2898,0
93946,stabilityai/stable-diffusion-2-1,,,"[diffusers, arxiv:2112.10752, arxiv:2202.00512, arxiv:1910.09700, stable-diffusion, text-to-image, license:openrail++, has_space]",text-to-image,[],False,,,,638f7ae36c25af4071044105,stabilityai/stable-diffusion-2-1,2598,346410
82911,prompthero/openjourney,,,"[diffusers, safetensors, en, stable-diffusion, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,,,,636a251a2ca1d75cb49109ee,prompthero/openjourney,2576,639189
63885,CompVis/stable-diffusion-v-1-4-original,,,"[arxiv:2207.12598, arxiv:2112.10752, arxiv:2103.00020, arxiv:2205.11487, arxiv:1910.09700, stable-diffusion, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,,,,63009e8b79c5ddbc6cf69877,CompVis/stable-diffusion-v-1-4-original,2363,0
112812,andite/anything-v4.0,,,"[diffusers, en, stable-diffusion, stable-diffusion-diffusers, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,,,,63c1595e94b28327f0e821e8,andite/anything-v4.0,2241,71668
149336,THUDM/chatglm-6b,,,"[pytorch, chatglm, zh, en, arxiv:2103.10360, arxiv:2210.02414, transformers, glm, thudm, has_space]",,[],False,,,,640f4f1409c94e1d9bca3ffc,THUDM/chatglm-6b,2110,736137


In [22]:
# Give models without a pipeline tag an explicit placeholder so they show up as their own category.
models_slice_df["pipeline_tag"] = models_slice_df["pipeline_tag"].fillna("unknown-tag")

In [23]:
models_slice_df.sort_values(["likes", "downloads"], ascending=False).head(n)

Unnamed: 0,modelId,sha,lastModified,tags,pipeline_tag,siblings,private,author,config,securityStatus,_id,id,likes,downloads
77446,runwayml/stable-diffusion-v1-5,,,"[diffusers, arxiv:2207.12598, arxiv:2112.10752, arxiv:2103.00020, arxiv:2205.11487, arxiv:1910.09700, stable-diffusion, stable-diffusion-diffusers, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,,,,63508a7bfc22005c1aae0266,runwayml/stable-diffusion-v1-5,7948,2453705
63900,CompVis/stable-diffusion-v1-4,,,"[diffusers, arxiv:2207.12598, arxiv:2112.10752, arxiv:2103.00020, arxiv:2205.11487, arxiv:1910.09700, stable-diffusion, stable-diffusion-diffusers, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,,,,6300e0f58a4db6e9052953a8,CompVis/stable-diffusion-v1-4,5560,980801
43855,bigscience/bloom,,,"[pytorch, tensorboard, safetensors, bloom, text-generation, ak, ar, as, bm, bn, ca, code, en, es, eu, fon, fr, gu, hi, id, ig, ki, kn, lg, ln, ml, mr, ne, nso, ny, or, pa, pt, rn, rw, sn, st, sw, ta, te, tn, ts, tum, tw, ur, vi, wo, xh, yo, zh, zu, arxiv:2211.05100, arxiv:1909.08053, arxiv:2110.02861, arxiv:2108.12409, doi:10.57967/hf/0003, transformers, license:bigscience-bloom-rail-1.0, model-index, co2_eq_emissions, has_space]",text-generation,[],False,,,,62862fbd504d37700308a82e,bigscience/bloom,3592,24011
92772,WarriorMama777/OrangeMixs,,,"[diffusers, dataset:Nerfgun3/bad_prompt, stable-diffusion, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,,,,638cac3a61eb5101751a23c4,WarriorMama777/OrangeMixs,3028,6839
127789,lllyasviel/ControlNet,,,"[license:openrail, has_space]",unknown-tag,[],False,,,,63e3ef298de575a15a63c2b1,lllyasviel/ControlNet,2898,0
93946,stabilityai/stable-diffusion-2-1,,,"[diffusers, arxiv:2112.10752, arxiv:2202.00512, arxiv:1910.09700, stable-diffusion, text-to-image, license:openrail++, has_space]",text-to-image,[],False,,,,638f7ae36c25af4071044105,stabilityai/stable-diffusion-2-1,2598,346410
82911,prompthero/openjourney,,,"[diffusers, safetensors, en, stable-diffusion, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,,,,636a251a2ca1d75cb49109ee,prompthero/openjourney,2576,639189
63885,CompVis/stable-diffusion-v-1-4-original,,,"[arxiv:2207.12598, arxiv:2112.10752, arxiv:2103.00020, arxiv:2205.11487, arxiv:1910.09700, stable-diffusion, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,,,,63009e8b79c5ddbc6cf69877,CompVis/stable-diffusion-v-1-4-original,2363,0
112812,andite/anything-v4.0,,,"[diffusers, en, stable-diffusion, stable-diffusion-diffusers, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,,,,63c1595e94b28327f0e821e8,andite/anything-v4.0,2241,71668
149336,THUDM/chatglm-6b,,,"[pytorch, chatglm, zh, en, arxiv:2103.10360, arxiv:2210.02414, transformers, glm, thudm, has_space]",unknown-tag,[],False,,,,640f4f1409c94e1d9bca3ffc,THUDM/chatglm-6b,2110,736137


In [24]:
# The author is the part of the model id before the first "/"; ids without a "/" get a placeholder.
models_slice_df["author"] = [
    model_id.split("/")[0] if "/" in model_id else "unknown-author"
    for model_id in models_slice_df.modelId
]

In [25]:
models_slice_df.sort_values(["likes", "downloads"], ascending=False).head(n)

Unnamed: 0,modelId,sha,lastModified,tags,pipeline_tag,siblings,private,author,config,securityStatus,_id,id,likes,downloads
77446,runwayml/stable-diffusion-v1-5,,,"[diffusers, arxiv:2207.12598, arxiv:2112.10752, arxiv:2103.00020, arxiv:2205.11487, arxiv:1910.09700, stable-diffusion, stable-diffusion-diffusers, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,runwayml,,,63508a7bfc22005c1aae0266,runwayml/stable-diffusion-v1-5,7948,2453705
63900,CompVis/stable-diffusion-v1-4,,,"[diffusers, arxiv:2207.12598, arxiv:2112.10752, arxiv:2103.00020, arxiv:2205.11487, arxiv:1910.09700, stable-diffusion, stable-diffusion-diffusers, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,CompVis,,,6300e0f58a4db6e9052953a8,CompVis/stable-diffusion-v1-4,5560,980801
43855,bigscience/bloom,,,"[pytorch, tensorboard, safetensors, bloom, text-generation, ak, ar, as, bm, bn, ca, code, en, es, eu, fon, fr, gu, hi, id, ig, ki, kn, lg, ln, ml, mr, ne, nso, ny, or, pa, pt, rn, rw, sn, st, sw, ta, te, tn, ts, tum, tw, ur, vi, wo, xh, yo, zh, zu, arxiv:2211.05100, arxiv:1909.08053, arxiv:2110.02861, arxiv:2108.12409, doi:10.57967/hf/0003, transformers, license:bigscience-bloom-rail-1.0, model-index, co2_eq_emissions, has_space]",text-generation,[],False,bigscience,,,62862fbd504d37700308a82e,bigscience/bloom,3592,24011
92772,WarriorMama777/OrangeMixs,,,"[diffusers, dataset:Nerfgun3/bad_prompt, stable-diffusion, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,WarriorMama777,,,638cac3a61eb5101751a23c4,WarriorMama777/OrangeMixs,3028,6839
127789,lllyasviel/ControlNet,,,"[license:openrail, has_space]",unknown-tag,[],False,lllyasviel,,,63e3ef298de575a15a63c2b1,lllyasviel/ControlNet,2898,0
93946,stabilityai/stable-diffusion-2-1,,,"[diffusers, arxiv:2112.10752, arxiv:2202.00512, arxiv:1910.09700, stable-diffusion, text-to-image, license:openrail++, has_space]",text-to-image,[],False,stabilityai,,,638f7ae36c25af4071044105,stabilityai/stable-diffusion-2-1,2598,346410
82911,prompthero/openjourney,,,"[diffusers, safetensors, en, stable-diffusion, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,prompthero,,,636a251a2ca1d75cb49109ee,prompthero/openjourney,2576,639189
63885,CompVis/stable-diffusion-v-1-4-original,,,"[arxiv:2207.12598, arxiv:2112.10752, arxiv:2103.00020, arxiv:2205.11487, arxiv:1910.09700, stable-diffusion, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,CompVis,,,63009e8b79c5ddbc6cf69877,CompVis/stable-diffusion-v-1-4-original,2363,0
112812,andite/anything-v4.0,,,"[diffusers, en, stable-diffusion, stable-diffusion-diffusers, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,andite,,,63c1595e94b28327f0e821e8,andite/anything-v4.0,2241,71668
149336,THUDM/chatglm-6b,,,"[pytorch, chatglm, zh, en, arxiv:2103.10360, arxiv:2210.02414, transformers, glm, thudm, has_space]",unknown-tag,[],False,THUDM,,,640f4f1409c94e1d9bca3ffc,THUDM/chatglm-6b,2110,736137


In [26]:
def _dataset_tags(tags_array):
    """Return the tags that contain the "dataset:" marker, in their original order."""
    return [tag for tag in tags_array if "dataset:" in tag]


models_slice_df["datasets"] = models_slice_df.tags.map(_dataset_tags)

In [27]:
models_slice_df.sort_values(["likes", "downloads"], ascending=False).head(n)

Unnamed: 0,modelId,sha,lastModified,tags,pipeline_tag,siblings,private,author,config,securityStatus,_id,id,likes,downloads,datasets
77446,runwayml/stable-diffusion-v1-5,,,"[diffusers, arxiv:2207.12598, arxiv:2112.10752, arxiv:2103.00020, arxiv:2205.11487, arxiv:1910.09700, stable-diffusion, stable-diffusion-diffusers, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,runwayml,,,63508a7bfc22005c1aae0266,runwayml/stable-diffusion-v1-5,7948,2453705,[]
63900,CompVis/stable-diffusion-v1-4,,,"[diffusers, arxiv:2207.12598, arxiv:2112.10752, arxiv:2103.00020, arxiv:2205.11487, arxiv:1910.09700, stable-diffusion, stable-diffusion-diffusers, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,CompVis,,,6300e0f58a4db6e9052953a8,CompVis/stable-diffusion-v1-4,5560,980801,[]
43855,bigscience/bloom,,,"[pytorch, tensorboard, safetensors, bloom, text-generation, ak, ar, as, bm, bn, ca, code, en, es, eu, fon, fr, gu, hi, id, ig, ki, kn, lg, ln, ml, mr, ne, nso, ny, or, pa, pt, rn, rw, sn, st, sw, ta, te, tn, ts, tum, tw, ur, vi, wo, xh, yo, zh, zu, arxiv:2211.05100, arxiv:1909.08053, arxiv:2110.02861, arxiv:2108.12409, doi:10.57967/hf/0003, transformers, license:bigscience-bloom-rail-1.0, model-index, co2_eq_emissions, has_space]",text-generation,[],False,bigscience,,,62862fbd504d37700308a82e,bigscience/bloom,3592,24011,[]
92772,WarriorMama777/OrangeMixs,,,"[diffusers, dataset:Nerfgun3/bad_prompt, stable-diffusion, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,WarriorMama777,,,638cac3a61eb5101751a23c4,WarriorMama777/OrangeMixs,3028,6839,[dataset:Nerfgun3/bad_prompt]
127789,lllyasviel/ControlNet,,,"[license:openrail, has_space]",unknown-tag,[],False,lllyasviel,,,63e3ef298de575a15a63c2b1,lllyasviel/ControlNet,2898,0,[]
93946,stabilityai/stable-diffusion-2-1,,,"[diffusers, arxiv:2112.10752, arxiv:2202.00512, arxiv:1910.09700, stable-diffusion, text-to-image, license:openrail++, has_space]",text-to-image,[],False,stabilityai,,,638f7ae36c25af4071044105,stabilityai/stable-diffusion-2-1,2598,346410,[]
82911,prompthero/openjourney,,,"[diffusers, safetensors, en, stable-diffusion, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,prompthero,,,636a251a2ca1d75cb49109ee,prompthero/openjourney,2576,639189,[]
63885,CompVis/stable-diffusion-v-1-4-original,,,"[arxiv:2207.12598, arxiv:2112.10752, arxiv:2103.00020, arxiv:2205.11487, arxiv:1910.09700, stable-diffusion, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,CompVis,,,63009e8b79c5ddbc6cf69877,CompVis/stable-diffusion-v-1-4-original,2363,0,[]
112812,andite/anything-v4.0,,,"[diffusers, en, stable-diffusion, stable-diffusion-diffusers, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,andite,,,63c1595e94b28327f0e821e8,andite/anything-v4.0,2241,71668,[]
149336,THUDM/chatglm-6b,,,"[pytorch, chatglm, zh, en, arxiv:2103.10360, arxiv:2210.02414, transformers, glm, thudm, has_space]",unknown-tag,[],False,THUDM,,,640f4f1409c94e1d9bca3ffc,THUDM/chatglm-6b,2110,736137,[]


In [28]:
# Flag the models whose tag list contains the literal "has_space" tag.
models_slice_df["has_space"] = [
    "has_space" in model_tags for model_tags in models_slice_df.tags
]

In [29]:
models_slice_df.sort_values(["likes", "downloads"], ascending=False).head(n)

Unnamed: 0,modelId,sha,lastModified,tags,pipeline_tag,siblings,private,author,config,securityStatus,_id,id,likes,downloads,datasets,has_space
77446,runwayml/stable-diffusion-v1-5,,,"[diffusers, arxiv:2207.12598, arxiv:2112.10752, arxiv:2103.00020, arxiv:2205.11487, arxiv:1910.09700, stable-diffusion, stable-diffusion-diffusers, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,runwayml,,,63508a7bfc22005c1aae0266,runwayml/stable-diffusion-v1-5,7948,2453705,[],True
63900,CompVis/stable-diffusion-v1-4,,,"[diffusers, arxiv:2207.12598, arxiv:2112.10752, arxiv:2103.00020, arxiv:2205.11487, arxiv:1910.09700, stable-diffusion, stable-diffusion-diffusers, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,CompVis,,,6300e0f58a4db6e9052953a8,CompVis/stable-diffusion-v1-4,5560,980801,[],True
43855,bigscience/bloom,,,"[pytorch, tensorboard, safetensors, bloom, text-generation, ak, ar, as, bm, bn, ca, code, en, es, eu, fon, fr, gu, hi, id, ig, ki, kn, lg, ln, ml, mr, ne, nso, ny, or, pa, pt, rn, rw, sn, st, sw, ta, te, tn, ts, tum, tw, ur, vi, wo, xh, yo, zh, zu, arxiv:2211.05100, arxiv:1909.08053, arxiv:2110.02861, arxiv:2108.12409, doi:10.57967/hf/0003, transformers, license:bigscience-bloom-rail-1.0, model-index, co2_eq_emissions, has_space]",text-generation,[],False,bigscience,,,62862fbd504d37700308a82e,bigscience/bloom,3592,24011,[],True
92772,WarriorMama777/OrangeMixs,,,"[diffusers, dataset:Nerfgun3/bad_prompt, stable-diffusion, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,WarriorMama777,,,638cac3a61eb5101751a23c4,WarriorMama777/OrangeMixs,3028,6839,[dataset:Nerfgun3/bad_prompt],True
127789,lllyasviel/ControlNet,,,"[license:openrail, has_space]",unknown-tag,[],False,lllyasviel,,,63e3ef298de575a15a63c2b1,lllyasviel/ControlNet,2898,0,[],True
93946,stabilityai/stable-diffusion-2-1,,,"[diffusers, arxiv:2112.10752, arxiv:2202.00512, arxiv:1910.09700, stable-diffusion, text-to-image, license:openrail++, has_space]",text-to-image,[],False,stabilityai,,,638f7ae36c25af4071044105,stabilityai/stable-diffusion-2-1,2598,346410,[],True
82911,prompthero/openjourney,,,"[diffusers, safetensors, en, stable-diffusion, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,prompthero,,,636a251a2ca1d75cb49109ee,prompthero/openjourney,2576,639189,[],True
63885,CompVis/stable-diffusion-v-1-4-original,,,"[arxiv:2207.12598, arxiv:2112.10752, arxiv:2103.00020, arxiv:2205.11487, arxiv:1910.09700, stable-diffusion, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,CompVis,,,63009e8b79c5ddbc6cf69877,CompVis/stable-diffusion-v-1-4-original,2363,0,[],True
112812,andite/anything-v4.0,,,"[diffusers, en, stable-diffusion, stable-diffusion-diffusers, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,andite,,,63c1595e94b28327f0e821e8,andite/anything-v4.0,2241,71668,[],True
149336,THUDM/chatglm-6b,,,"[pytorch, chatglm, zh, en, arxiv:2103.10360, arxiv:2210.02414, transformers, glm, thudm, has_space]",unknown-tag,[],False,THUDM,,,640f4f1409c94e1d9bca3ffc,THUDM/chatglm-6b,2110,736137,[],True


In [30]:
models_slice_df["has_space"].value_counts()

has_space
False    207779
True       8654
Name: count, dtype: int64

In [31]:
models_slice_df["pipeline_tag"].value_counts()

pipeline_tag
unknown-tag                       92254
text-classification               23520
reinforcement-learning            21087
text2text-generation              14101
text-generation                   12601
token-classification               9052
automatic-speech-recognition       8381
fill-mask                          7036
question-answering                 4993
text-to-image                      4354
image-classification               3945
feature-extraction                 3909
conversational                     2178
translation                        2125
sentence-similarity                1958
summarization                      1127
unconditional-image-generation      681
audio-classification                539
object-detection                    429
multiple-choice                     313
text-to-speech                      291
image-segmentation                  217
audio-to-audio                      155
image-to-text                       153
tabular-classification     

In [32]:
# Domain labels. Every value used in the `task2domain` mapping below comes from this list.
domains = [
    "computer-vision",
    "natural-language-processing",
    "audio",
    "tabular",
    "multimodal",
    "reinforcement-learning",
    "unknown-domain",
    "time-series",
    "graph",
    "robotics"
]

In [33]:
# Maps a model's pipeline tag (task) to a coarse domain label.
# Tags missing from this mapping are resolved to "unknown-domain" by the lookup in the next cell.
task2domain = {
    "text-classification":            "natural-language-processing",
    "reinforcement-learning":         "reinforcement-learning",
    "text2text-generation":           "natural-language-processing",
    "text-generation":                "natural-language-processing",
    "token-classification":           "natural-language-processing",
    "automatic-speech-recognition":   "audio",
    "fill-mask":                      "natural-language-processing",
    "question-answering":             "natural-language-processing",
    "text-to-image":                  "multimodal",
    "feature-extraction":             "multimodal",
    "image-classification":           "computer-vision",
    "conversational":                 "natural-language-processing",
    "translation":                    "natural-language-processing",
    "sentence-similarity":            "natural-language-processing",
    "summarization":                  "natural-language-processing",
    "unconditional-image-generation": "computer-vision",
    "audio-classification":           "audio",
    "object-detection":               "computer-vision",
    "multiple-choice":                "unknown-domain",
    "text-to-speech":                 "audio",
    "image-segmentation":             "computer-vision",
    "audio-to-audio":                 "audio",
    "image-to-text":                  "multimodal",
    "tabular-classification":         "tabular",
    "zero-shot-image-classification": "computer-vision",
    "zero-shot-classification":       "natural-language-processing",
    "video-classification":           "computer-vision",
    "image-to-image":                 "computer-vision",
    "tabular-regression":             "tabular",
    "table-question-answering":       "natural-language-processing",
    "depth-estimation":               "computer-vision",
    "document-question-answering":    "multimodal",
    "text-to-video":                  "multimodal",
    "visual-question-answering":      "multimodal",
    "voice-activity-detection":       "audio",
    "robotics":                       "robotics",
    "other":                          "unknown-domain",
    "graph-ml":                       "graph",
    "time-series-forecasting":        "time-series"
}

In [34]:
# Translate each pipeline tag into its domain; tags missing from the mapping become "unknown-domain".
models_slice_df["domain"] = [
    task2domain.get(tag, "unknown-domain") for tag in models_slice_df.pipeline_tag
]

In [35]:
# Stamp every row with the moment this snapshot was taken (one value assigned to the whole column).
# NOTE(review): datetime.now() returns naive local time — confirm the database column expects that rather than UTC.
models_slice_df["slice_datetime"] = datetime.now()

In [36]:
models_slice_df.sort_values(["likes", "downloads"], ascending=False).head(n)

Unnamed: 0,modelId,sha,lastModified,tags,pipeline_tag,siblings,private,author,config,securityStatus,_id,id,likes,downloads,datasets,has_space,domain,slice_datetime
77446,runwayml/stable-diffusion-v1-5,,,"[diffusers, arxiv:2207.12598, arxiv:2112.10752, arxiv:2103.00020, arxiv:2205.11487, arxiv:1910.09700, stable-diffusion, stable-diffusion-diffusers, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,runwayml,,,63508a7bfc22005c1aae0266,runwayml/stable-diffusion-v1-5,7948,2453705,[],True,multimodal,2023-05-31 15:09:04.407835
63900,CompVis/stable-diffusion-v1-4,,,"[diffusers, arxiv:2207.12598, arxiv:2112.10752, arxiv:2103.00020, arxiv:2205.11487, arxiv:1910.09700, stable-diffusion, stable-diffusion-diffusers, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,CompVis,,,6300e0f58a4db6e9052953a8,CompVis/stable-diffusion-v1-4,5560,980801,[],True,multimodal,2023-05-31 15:09:04.407835
43855,bigscience/bloom,,,"[pytorch, tensorboard, safetensors, bloom, text-generation, ak, ar, as, bm, bn, ca, code, en, es, eu, fon, fr, gu, hi, id, ig, ki, kn, lg, ln, ml, mr, ne, nso, ny, or, pa, pt, rn, rw, sn, st, sw, ta, te, tn, ts, tum, tw, ur, vi, wo, xh, yo, zh, zu, arxiv:2211.05100, arxiv:1909.08053, arxiv:2110.02861, arxiv:2108.12409, doi:10.57967/hf/0003, transformers, license:bigscience-bloom-rail-1.0, model-index, co2_eq_emissions, has_space]",text-generation,[],False,bigscience,,,62862fbd504d37700308a82e,bigscience/bloom,3592,24011,[],True,natural-language-processing,2023-05-31 15:09:04.407835
92772,WarriorMama777/OrangeMixs,,,"[diffusers, dataset:Nerfgun3/bad_prompt, stable-diffusion, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,WarriorMama777,,,638cac3a61eb5101751a23c4,WarriorMama777/OrangeMixs,3028,6839,[dataset:Nerfgun3/bad_prompt],True,multimodal,2023-05-31 15:09:04.407835
127789,lllyasviel/ControlNet,,,"[license:openrail, has_space]",unknown-tag,[],False,lllyasviel,,,63e3ef298de575a15a63c2b1,lllyasviel/ControlNet,2898,0,[],True,unknown-domain,2023-05-31 15:09:04.407835
93946,stabilityai/stable-diffusion-2-1,,,"[diffusers, arxiv:2112.10752, arxiv:2202.00512, arxiv:1910.09700, stable-diffusion, text-to-image, license:openrail++, has_space]",text-to-image,[],False,stabilityai,,,638f7ae36c25af4071044105,stabilityai/stable-diffusion-2-1,2598,346410,[],True,multimodal,2023-05-31 15:09:04.407835
82911,prompthero/openjourney,,,"[diffusers, safetensors, en, stable-diffusion, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,prompthero,,,636a251a2ca1d75cb49109ee,prompthero/openjourney,2576,639189,[],True,multimodal,2023-05-31 15:09:04.407835
63885,CompVis/stable-diffusion-v-1-4-original,,,"[arxiv:2207.12598, arxiv:2112.10752, arxiv:2103.00020, arxiv:2205.11487, arxiv:1910.09700, stable-diffusion, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,CompVis,,,63009e8b79c5ddbc6cf69877,CompVis/stable-diffusion-v-1-4-original,2363,0,[],True,multimodal,2023-05-31 15:09:04.407835
112812,andite/anything-v4.0,,,"[diffusers, en, stable-diffusion, stable-diffusion-diffusers, text-to-image, license:creativeml-openrail-m, has_space]",text-to-image,[],False,andite,,,63c1595e94b28327f0e821e8,andite/anything-v4.0,2241,71668,[],True,multimodal,2023-05-31 15:09:04.407835
149336,THUDM/chatglm-6b,,,"[pytorch, chatglm, zh, en, arxiv:2103.10360, arxiv:2210.02414, transformers, glm, thudm, has_space]",unknown-tag,[],False,THUDM,,,640f4f1409c94e1d9bca3ffc,THUDM/chatglm-6b,2110,736137,[],True,unknown-domain,2023-05-31 15:09:04.407835


In [37]:
models_slice_df.dtypes

modelId                   object
sha                      float64
lastModified             float64
tags                      object
pipeline_tag              object
siblings                  object
private                     bool
author                    object
config                   float64
securityStatus           float64
_id                       object
id                        object
likes                      int64
downloads                  int64
datasets                  object
has_space                   bool
domain                    object
slice_datetime    datetime64[ns]
dtype: object

In [38]:
models_slice_df.siblings.value_counts()

siblings
[]    216433
Name: count, dtype: int64

In [39]:
models_slice_df.sha.value_counts()

Series([], Name: count, dtype: int64)

In [40]:
models_slice_df.lastModified.value_counts()

Series([], Name: count, dtype: int64)

In [41]:
models_slice_df.private.value_counts()

private
False    216433
Name: count, dtype: int64

In [42]:
models_slice_df.config.value_counts()

Series([], Name: count, dtype: int64)

In [43]:
models_slice_df.securityStatus.value_counts()

Series([], Name: count, dtype: int64)

In [44]:
for col in models_slice_df.columns:
    print(col)

modelId
sha
lastModified
tags
pipeline_tag
siblings
private
author
config
securityStatus
_id
id
likes
downloads
datasets
has_space
domain
slice_datetime


In [45]:
models_slice_df.isna().sum()

modelId                0
sha               216433
lastModified      216433
tags                   0
pipeline_tag           0
siblings               0
private                0
author                 0
config            216433
securityStatus    216433
_id                    0
id                     0
likes                  0
downloads              0
datasets               0
has_space              0
domain                 0
slice_datetime         0
dtype: int64

In [46]:
# Lower-case every column name (e.g. "modelId" becomes "modelid").
models_slice_df.columns = [column.lower() for column in models_slice_df.columns]

In [47]:
for col in models_slice_df.columns:
    print(col)

modelid
sha
lastmodified
tags
pipeline_tag
siblings
private
author
config
securitystatus
_id
id
likes
downloads
datasets
has_space
domain
slice_datetime


In [57]:
# Count the "unknown-tag" models whose id mentions "gpt" (case-insensitive).
is_untagged = models_slice_df.pipeline_tag == "unknown-tag"
mentions_gpt = models_slice_df.modelid.str.lower().str.contains("gpt", regex=False)
int((is_untagged & mentions_gpt).sum())

2921

In [58]:
# Count the "unknown-tag" models whose id mentions "bert" (case-insensitive).
is_untagged = models_slice_df.pipeline_tag == "unknown-tag"
mentions_bert = models_slice_df.modelid.str.lower().str.contains("bert", regex=False)
int((is_untagged & mentions_bert).sum())

10314

In [59]:
# Total number of models carrying the "unknown-tag" placeholder.
int((models_slice_df.pipeline_tag == "unknown-tag").sum())

92254

In [60]:
# Number of "unknown-tag" models whose id mentions neither "gpt" nor "bert".
# Derived from the frame instead of subtracting pasted-in counts: it stays correct when the data is
# refreshed, and an id that contains both substrings is no longer subtracted twice.
# Re-run the cell to refresh the displayed value.
unknown_tag_ids = models_slice_df.loc[
    models_slice_df.pipeline_tag == "unknown-tag", "modelid"
].str.lower()
mentions_gpt_or_bert = (
    unknown_tag_ids.str.contains("gpt", regex=False)
    | unknown_tag_ids.str.contains("bert", regex=False)
)
int((~mentions_gpt_or_bert).sum())

79019

In [None]:
# Horizontal bar chart of the number of models per domain.
# The font size is set before the axes are created so that tick labels pick it up.
plt.rcParams.update({"font.size": 22})

fig, ax = plt.subplots(figsize=(30, 20))
models_slice_df["domain"].value_counts().plot(kind='barh', ax=ax)

# Title and axis labels so the figure can be read on its own.
ax.set_title("Hugging Face models per domain")
ax.set_xlabel("Number of models")
ax.set_ylabel("Domain")

ax.grid(True)
plt.show()

# Hugging Face Datasets

In [None]:
# Parse the serialised dataset records into a frame.
# Passing a literal JSON string to read_json is deprecated in pandas, so hand it a file-like buffer instead.
datasets_slice_df = pd.read_json(io.StringIO(hugging_face_datasets_json))

In [None]:
datasets_slice_df.sort_values(["likes", "downloads"], ascending=False).head(n)

In [None]:
def _task_categories(tags_array):
    """Return the second ":"-separated field of every tag containing "task_categories:"."""
    return [tag.split(":")[1] for tag in tags_array if "task_categories:" in tag]


datasets_slice_df["tasks"] = datasets_slice_df.tags.map(_task_categories)

In [None]:
datasets_slice_df.sort_values(["likes", "downloads"], ascending=False).head(n)

In [None]:
def _size_categories(tags_array):
    """Return the second ":"-separated field of every tag containing "size_categories:"."""
    return [tag.split(":")[1] for tag in tags_array if "size_categories:" in tag]


datasets_slice_df["size"] = datasets_slice_df.tags.map(_size_categories)

In [None]:
datasets_slice_df.sort_values(["likes", "downloads"], ascending=False).head(n)

In [None]:
# Stamp every row with the moment this snapshot was taken (one value assigned to the whole column).
# NOTE(review): datetime.now() returns naive local time — confirm the database column expects that rather than UTC.
datasets_slice_df["slice_datetime"] = datetime.now()

In [None]:
datasets_slice_df.isna().sum()

In [None]:
for col in datasets_slice_df.columns:
    print(col)

In [None]:
# Lower-case every column name of the datasets frame.
datasets_slice_df.columns = [column.lower() for column in datasets_slice_df.columns]

In [None]:
for col in datasets_slice_df.columns:
    print(col)

In [None]:
# Rename the "id" column to "datasetid"; all other columns keep their names.
datasets_slice_df.columns = [
    "datasetid" if column == "id" else column
    for column in datasets_slice_df.columns
]

In [None]:
for col in datasets_slice_df.columns:
    print(col)

# Check DB connection

In [None]:
# Read the database host, user and password from Prefect secrets (in that order).
db_host, db_user, db_password = [
    Secret(secret_name).get()
    for secret_name in ("DB_HOST", "DB_USER", "DB_PASSWORD")
]

In [None]:
# Open a server-level connection (no database name given).
# Basic psycopg2.connect() keyword arguments:
# - *dbname*: the database name
# - *database*: the database name (only as keyword argument)
# - *user*: user name used to authenticate
# - *password*: password used to authenticate
# - *host*: database host address (defaults to UNIX socket if not provided)
# - *port*: connection port number (defaults to 5432 if not provided)
# Fix: the user name and the password were passed to each other's parameter.
con = psycopg2.connect(user=db_user, password=db_password, host=db_host, port=5432)

# CREATE DATABASE cannot run inside a transaction block, so switch the connection to autocommit.
con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)

cursor = con.cursor()

## Create a database

In [None]:
# PostgreSQL has no CREATE DATABASE IF NOT EXISTS, so look the name up first; this keeps the cell re-runnable.
# The unquoted identifier RPD_5310 is folded to lower case by the server, hence "rpd_5310" in the lookup.
cursor.execute("SELECT 1 FROM pg_database WHERE datname = %s", ("rpd_5310",))
if cursor.fetchone() is None:
    cursor.execute("CREATE DATABASE RPD_5310")

In [None]:
# Print the name of every non-template database on the server.
cursor.execute("SELECT datname FROM pg_database WHERE datistemplate = false")

for database_row in cursor.fetchall():
    print(database_row)

In [None]:
con.close()

## Create tables

In [None]:
# Connect to the rpd_5310 database.
# Fix: the user name and the password were passed to each other's parameter.
con = psycopg2.connect(user=db_user, password=db_password, host=db_host, port=5432, dbname="rpd_5310")

# Autocommit: every statement takes effect immediately, without an explicit commit.
con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)

cursor = con.cursor()

In [None]:
# Load the DDL script from disk, then send it to the server in a single execute call.
with open("create_tables.sql", "r") as ddl_file:
    ddl_script = ddl_file.read()

cursor.execute(ddl_script)

In [None]:
# Print the catalogue entry of every table in the "public" schema.
cursor.execute("select * from pg_catalog.pg_tables where schemaname='public'")

for table_row in cursor.fetchall():
    print(table_row)

In [None]:
# Query listing name, nullability, identity flag and data type of every column of one public table.
# The table name is substituted with str.format by the cells below.
# NOTE(review): because the name is interpolated into the SQL text, only pass trusted, hard-coded table names.
template = "SELECT table_name, column_name, is_nullable, is_identity, data_type FROM information_schema.columns WHERE table_schema = 'public' AND table_name = '{}'"

In [None]:
# Print the column definitions of the hugging_face_models table.
column_query = template.format("hugging_face_models")
cursor.execute(column_query)

for column_row in cursor.fetchall():
    print(column_row)

In [None]:
# Print the column definitions of the hugging_face_models_daily_dynamics table.
column_query = template.format("hugging_face_models_daily_dynamics")
cursor.execute(column_query)

for column_row in cursor.fetchall():
    print(column_row)

In [None]:
# Print the column definitions of the hugging_face_datasets table.
column_query = template.format("hugging_face_datasets")
cursor.execute(column_query)

for column_row in cursor.fetchall():
    print(column_row)

In [None]:
# Print the column definitions of the hugging_face_datasets_daily_dynamics table.
column_query = template.format("hugging_face_datasets_daily_dynamics")
cursor.execute(column_query)

for column_row in cursor.fetchall():
    print(column_row)

In [None]:
con.close()

## Drop a database

In [None]:
# Open a server-level connection (no database name given) for the teardown step.
# Fix: the user name and the password were passed to each other's parameter.
con = psycopg2.connect(user=db_user, password=db_password, host=db_host, port=5432)

# DROP DATABASE cannot run inside a transaction block, so switch the connection to autocommit.
con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)

cursor = con.cursor()

In [None]:
# Destructive step, guarded so that running the notebook top to bottom does not delete the database
# that the "Write data to DB" cells connect to. Set the flag to True only for an intentional teardown.
DROP_DATABASE = False

if DROP_DATABASE:
    cursor.execute("DROP DATABASE RPD_5310")

In [None]:
# Print the name of every non-template database on the server.
cursor.execute("SELECT datname FROM pg_database WHERE datistemplate = false")

for database_row in cursor.fetchall():
    print(database_row)

In [None]:
con.close()

# Write data to DB

In [None]:
from sqlalchemy.engine import URL

# Build the connection URL from its components instead of string formatting, so that a user name or
# password containing characters such as "@", "/" or ":" is escaped correctly.
engine = create_engine(
    URL.create(
        drivername="postgresql",
        username=db_user,
        password=db_password,
        host=db_host,
        port=5432,
        database="rpd_5310",
    )
)

In [None]:
models_df = pd.read_sql('hugging_face_models', engine)

In [None]:
models_df.head()

In [None]:
len(models_slice_df[~models_slice_df.modelid.isin(models_df.modelid)])

In [None]:
datasets_df = pd.read_sql('hugging_face_datasets', engine)

In [None]:
datasets_df.head()

In [None]:
len(datasets_slice_df[~datasets_slice_df.datasetid.isin(datasets_df.datasetid)])

In [None]:
# Column selections for the metadata tables: every column of the table read from the database
# except the two counters.
_counter_columns = ("likes", "downloads")
models_mask = [name for name in models_df.columns if name not in _counter_columns]
datasets_mask = [name for name in datasets_df.columns if name not in _counter_columns]

In [None]:
# Append only the models whose id is not yet present in the hugging_face_models table.
new_models = models_slice_df[~models_slice_df.modelid.isin(models_df.modelid)]

if len(new_models) > 0:
    new_models[models_mask].to_sql('hugging_face_models', engine, if_exists='append', index=False)

In [None]:
# Append only the datasets whose id is not yet present in the hugging_face_datasets table.
new_datasets = datasets_slice_df[~datasets_slice_df.datasetid.isin(datasets_df.datasetid)]

if len(new_datasets) > 0:
    new_datasets[datasets_mask].to_sql('hugging_face_datasets', engine, if_exists='append', index=False)

In [None]:
# Column selections for the daily-dynamics tables: the entity id, the two counters and the snapshot time.
# NOTE(review): this rebinds models_mask / datasets_mask, which the metadata-insert cells above also read;
# re-running those cells after this one would select these four columns instead.
models_mask = ["modelid", "likes", "downloads", "slice_datetime"]
datasets_mask = ["datasetid", "likes", "downloads", "slice_datetime"]

In [None]:
models_slice_df[models_mask].to_sql('hugging_face_models_daily_dynamics', engine, if_exists='append', index=False)

In [None]:
datasets_slice_df[datasets_mask].to_sql('hugging_face_datasets_daily_dynamics', engine, if_exists='append', index=False)

# Register and run Prefect flow

In [None]:
! prefect get projects

In [None]:
! prefect create project "RPD_5310" --description "This is the test project for task RPD-5310 (Getting data from HuggingFace about models and datasets)"

In [None]:
! prefect get projects

In [None]:
! prefect register --project RPD_5310 -p flow.py -l local-agent

In [None]:
! prefect get flows

In [None]:
! prefect describe flows --name "update huggingface data"

In [None]:
! prefect run -n "update huggingface data" --run-name "test run" --log-level DEBUG --watch

In [None]:
! prefect get flow-runs --flow "update huggingface data"

# Update 