# Model and Model Cards Overview

In [21]:
import warnings
# Install a blanket "ignore" filter: no category, message or module is given,
# so every warning raised after this point is suppressed for the session.
warnings.filterwarnings("ignore")

In [22]:
import pickle
import datetime
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

## Model Information Overview

In [26]:
# Read the per-model metadata table from the parquet file under data/.
model_info = pd.read_parquet('data/model_info.parquet')
# Bare last expression so the notebook renders the frame.
model_info

Unnamed: 0,modelId,author,creation_time,downloads,has_modelcard,task_category,task_domain
0,bert-base-uncased,huggingface,2018-11-14 23:35:08+00:00,31924518.0,True,fill-mask,nlp
1,xlm-roberta-base,huggingface,2019-12-18 22:37:14+00:00,26796072.0,True,fill-mask,nlp
2,Jean-Baptiste/camembert-ner,Jean-Baptiste,2021-03-12 14:22:04+00:00,16720459.0,True,token-classification,nlp
3,openai/clip-vit-large-patch14,openai,2022-01-26 11:08:26+00:00,12495361.0,True,zero-shot-image-classification,computer_vision
4,roberta-base,huggingface,2019-08-03 22:16:18+00:00,12472035.0,True,fill-mask,nlp
...,...,...,...,...,...,...,...
74965,huseinzol05/text-to-speech-tacotron-osman,huseinzol05,2022-05-03 14:08:28+00:00,0.0,False,unknown,unknown
74966,huseinzol05/text-to-speech-tacotron-osman-quan...,huseinzol05,2022-05-03 14:08:44+00:00,0.0,False,unknown,unknown
74967,dennishe97/codebert-base-v2,dennishe97,2022-05-03 14:09:52+00:00,0.0,False,feature-extraction,multimodal
74968,netoass/distilbert-base-uncased-finetuned-emotion,netoass,2022-05-03 14:27:42+00:00,0.0,False,unknown,unknown


In [27]:
# Drop rows that have no creation timestamp, then reduce the timestamps to
# calendar dates.
#
# pd.to_datetime(...).dt.date accepts both the original timestamps and values
# that are already datetime.date objects, so this cell can be re-run safely.
# (Calling `.date()` on each element fails on a second run because
# datetime.date has no `.date()` method.)
# .assign builds a new frame instead of writing a column into the dropna() result.
model_info = (
    model_info
    .dropna(subset=['creation_time'])
    .assign(creation_time=lambda frame: pd.to_datetime(frame['creation_time']).dt.date)
)
model_info

Unnamed: 0,modelId,author,creation_time,downloads,has_modelcard,task_category,task_domain
0,bert-base-uncased,huggingface,2018-11-14,31924518.0,True,fill-mask,nlp
1,xlm-roberta-base,huggingface,2019-12-18,26796072.0,True,fill-mask,nlp
2,Jean-Baptiste/camembert-ner,Jean-Baptiste,2021-03-12,16720459.0,True,token-classification,nlp
3,openai/clip-vit-large-patch14,openai,2022-01-26,12495361.0,True,zero-shot-image-classification,computer_vision
4,roberta-base,huggingface,2019-08-03,12472035.0,True,fill-mask,nlp
...,...,...,...,...,...,...,...
74964,egorulz/distilbert-base-uncased-finetuned-cola,egorulz,2022-05-03,0.0,False,unknown,unknown
74965,huseinzol05/text-to-speech-tacotron-osman,huseinzol05,2022-05-03,0.0,False,unknown,unknown
74966,huseinzol05/text-to-speech-tacotron-osman-quan...,huseinzol05,2022-05-03,0.0,False,unknown,unknown
74967,dennishe97/codebert-base-v2,dennishe97,2022-05-03,0.0,False,feature-extraction,multimodal


## Model Card Information Overview

In [29]:
# Read the model-card table from the parquet file under data/.
modelcard_info = pd.read_parquet('data/modelcard_info.parquet')
# Bare last expression so the notebook renders the frame.
modelcard_info

Unnamed: 0,modelId,author,creation_time,downloads,model_card,word_cnt,task_category,task_domain
0,bert-base-uncased,huggingface,2018-11-14 23:35:08+00:00,31924518,\n# BERT base model (uncased)\n\nPretrained mo...,1119.0,fill-mask,nlp
1,xlm-roberta-base,huggingface,2019-12-18 22:37:14+00:00,26796072,\n# XLM-RoBERTa (base-sized model) \n\nXLM-RoB...,584.0,fill-mask,nlp
2,Jean-Baptiste/camembert-ner,Jean-Baptiste,2021-03-12 14:22:04+00:00,16720459,\n# camembert-ner: model fine-tuned from camem...,288.0,token-classification,nlp
3,openai/clip-vit-large-patch14,openai,2022-01-26 11:08:26+00:00,12495361,\n# Model Card: CLIP\n\nDisclaimer: The model ...,1069.0,zero-shot-image-classification,computer_vision
4,roberta-base,huggingface,2019-08-03 22:16:18+00:00,12472035,\n# RoBERTa base model\n\nPretrained model on ...,1063.0,fill-mask,nlp
...,...,...,...,...,...,...,...,...
32106,skops-ci/test-4ac6e53e-616d-41a1-9e68-4c58109d...,skops-ci,2022-09-26 16:16:06+00:00,0,\n# Model description\n\n[More Information Nee...,102.0,tabular-regression,tabular
32107,skops-ci/test-f84870ba-aaae-4fbf-ad64-fd7e536a...,skops-ci,2022-09-26 16:16:03+00:00,0,\n# Model description\n\n[More Information Nee...,102.0,tabular-regression,tabular
32108,skops-ci/test-192b88bc-8412-43b6-965a-7c05c4ab...,skops-ci,2022-09-26 16:16:00+00:00,0,\n# Model description\n\n[More Information Nee...,115.0,tabular-classification,tabular
32109,skops-ci/test-28b17e1a-1c77-47a1-a577-62c40774...,skops-ci,2022-09-26 16:13:38+00:00,0,\n# Model description\n\n[More Information Nee...,114.0,tabular-classification,tabular


## Model Cards Adoption and Downloads Traffic

In [30]:
# Share of models whose has_modelcard flag is True.
n_models = len(model_info)
print('Number of Models:', n_models)

has_card = model_info.loc[model_info['has_modelcard'] == True]
n_with_card = len(has_card)
print('Number of Models with Model Cards:', n_with_card)

print('Percentage of Models with Model Cards:{:.2f}%'.format(n_with_card/n_models*100))

Number of Models: 74797
Number of Models with Model Cards: 31938
Percentage of Models with Model Cards:42.70%


In [31]:
# Download totals: every model vs. the subset that has a model card.
downloads_all = model_info['downloads']
total_downloads = downloads_all.sum()
print('Total Downloads:', total_downloads)

downloads_with_card = has_card['downloads']
total_downloads_has_card = downloads_with_card.sum()
print('Total Downloads of Models with Model Cards:', total_downloads_has_card)

print('Percentage of Downloads of Models with Model Cards:{:.2f}%'.format(total_downloads_has_card/total_downloads*100))

Total Downloads: 285548546.0
Total Downloads of Models with Model Cards: 258363746.0
Percentage of Downloads of Models with Model Cards:90.48%


## Model Number Growth

In [32]:
# Earliest and latest creation dates present in the table.
creation_dates = model_info['creation_time']

min_time = creation_dates.min()
print('min_time', min_time)

max_time = creation_dates.max()
print('max_time', max_time)

min_time 2018-11-14
max_time 2022-09-30


In [33]:
# Reorder the rows chronologically by creation date; the original index labels are kept.
model_info = model_info.sort_values('creation_time')
model_info

Unnamed: 0,modelId,author,creation_time,downloads,has_modelcard,task_category,task_domain
0,bert-base-uncased,huggingface,2018-11-14,31924518.0,True,fill-mask,nlp
40,bert-large-uncased,huggingface,2018-11-14,1027405.0,True,fill-mask,nlp
11,bert-base-chinese,huggingface,2018-11-14,4094246.0,True,fill-mask,nlp
8,bert-base-cased,huggingface,2018-11-14,6643939.0,True,fill-mask,nlp
22,bert-base-multilingual-cased,huggingface,2018-11-30,1858945.0,True,fill-mask,nlp
...,...,...,...,...,...,...,...
31321,m3/m3-experiment-roberta-base-rct-sample-word-...,m3,2022-09-30,2.0,False,text-classification,nlp
27038,henryjiang/distilbert-base-uncased-finetuned-e...,henryjiang,2022-09-30,4.0,True,text-classification,nlp
44347,dhruvs00/test,dhruvs00,2022-09-30,0.0,False,unknown,unknown
34780,bigdino/bart-large-finetuned-cnn-dailymail,bigdino,2022-09-30,2.0,True,text2text-generation,nlp


In [34]:
from dateutil.relativedelta import relativedelta

# Build the weekly grid of dates (`time_range`) used to count models over time.
# The grid starts on the first day of the month of the earliest creation date
# and advances in 7-day steps. The loop stops after appending the first date
# that lies strictly beyond `end_time`, so the last edge is later than every
# creation date.
date_list = model_info['creation_time']
min_time = min(date_list)
max_time = max(date_list)
print(min_time, max_time)
time_delta = relativedelta(days=7)

start_time = datetime.date(min_time.year, min_time.month, 1)
# First day of the month after the latest creation date. relativedelta rolls
# December over into January of the next year; building the date with
# `max_time.month + 1` raised ValueError (month 13) for a December max_time.
end_time = datetime.date(max_time.year, max_time.month, 1) + relativedelta(months=1)

time_range = [start_time]
while True:
    start_time += time_delta
    time_range.append(start_time)
    if start_time > end_time:
        break

2018-11-14 2022-09-30


In [35]:
# For every weekly edge, count the models created strictly before it:
# once over all models and once restricted to models with a model card.
time_range_str = [edge.strftime("%Y-%m-%d") for edge in time_range]

# Hoisted out of the loop: neither value depends on the edge being evaluated.
creation_dates = model_info['creation_time']
has_card_mask = model_info['has_modelcard'] == True

model70k_number = []
modelcard_number = []
# The loop variable is `edge` rather than `time`, so it does not shadow the
# name of the stdlib `time` module.
for edge in time_range:
    created_before = creation_dates < edge
    # Summing a boolean mask counts its True entries without building a
    # filtered copy of the frame for every edge.
    model70k_number.append(int(created_before.sum()))
    modelcard_number.append(int((created_before & has_card_mask).sum()))

In [36]:
# Long-format table with one row per (weekly edge, series). The two series are
# stacked with their own 0..n-1 index, so index labels repeat in the result.
model_number_all_pd = pd.DataFrame({
    'model_time': time_range_str,
    'model_number': model70k_number,
    'model_type': 'all models',
})
model_number_card_pd = pd.DataFrame({
    'model_time': time_range_str,
    'model_number': modelcard_number,
    'model_type': 'models with model card',
})
model_number = pd.concat([model_number_all_pd, model_number_card_pd])
model_number

Unnamed: 0,model_time,model_number,model_type
0,2018-11-01,0,all models
1,2018-11-08,0,all models
2,2018-11-15,4,all models
3,2018-11-22,4,all models
4,2018-11-29,4,all models
...,...,...,...
201,2022-09-08,29149,models with model card
202,2022-09-15,29878,models with model card
203,2022-09-22,30638,models with model card
204,2022-09-29,31788,models with model card


In [39]:
import plotly.graph_objs as go
import numpy as np
from scipy.optimize import curve_fit

# Keep the 'all models' series only, restricted to weekly edges on or after
# 2020-04-30. model_time holds "%Y-%m-%d" strings, so >= is a string comparison.
all_models_data = (
    model_number
    .loc[model_number['model_type'] == 'all models']
    .loc[lambda frame: frame['model_time'] >= '2020-04-30']
)

# Exponential model passed to curve_fit
def exponential(x, a, b):
    """Evaluate ``a * exp(b * x)``.

    Args:
        x: Scalar or array of positions at which to evaluate the model.
        a: Multiplicative scale (the value at ``x == 0``).
        b: Exponential rate per unit of ``x``.

    Returns:
        The model value(s), with the same shape as ``x``.
    """
    growth = np.exp(b * x)
    return a * growth

# Fit the exponential curve to the cumulative counts, using the row position
# (0, 1, 2, ...) of each weekly edge as x.
x_data = np.arange(len(all_models_data))
y_data = all_models_data['model_number']

# curve_fit starts every parameter at 1 when p0 is omitted, which is badly
# scaled for week indices above 100. Seed the optimiser with the exponential
# that passes through the first and last observations instead.
# Requires positive counts at both ends of the series.
y_first = float(y_data.iloc[0])
y_last = float(y_data.iloc[-1])
rate_guess = np.log(y_last / y_first) / max(len(x_data) - 1, 1)
popt, _ = curve_fit(exponential, x_data, y_data, p0=(y_first, rate_guess))

# Create the fitted curve data on a 100-point grid over the fitted x range
x_fit = np.linspace(0, max(x_data), 100)
y_fit = exponential(x_fit, *popt)

# Exponential formula
exp_formula = f'y = {popt[0]:.2f} * exp({popt[1]:.2f} * x)'


In [40]:
import numpy as np
from scipy.optimize import curve_fit
import pandas as pd

# Straight-line model passed to curve_fit
def linear_func(x, a, b):
    """Evaluate the line ``a * x + b``.

    Args:
        x: Scalar or array of positions.
        a: Slope.
        b: Intercept (the value at ``x == 0``).

    Returns:
        The line value(s), with the same shape as ``x``.
    """
    slope_term = a * x
    return slope_term + b

# Least-squares line through (position, log(count)); exponentiating that line
# gives the exponential fit. y_data is whatever the previously run cell left
# in the namespace.
x_data = np.arange(0, len(y_data))
y_log_data = np.log(y_data)
popt, _ = curve_fit(linear_func, x_data, y_log_data)

# Evaluate the fitted line, then map it back from log space.
slope, intercept = popt
y_log_fit = linear_func(x_data, slope, intercept)
y_fit = np.exp(y_log_fit)

# Exponential formula: exp(intercept) is the scale, the slope is the rate.
exp_formula = f'y = {np.exp(popt[1]):.2f} * exp({popt[0]:.2f} * x)'

# Report the fitted values and the formula string
print("Fitted exponential curve:", y_fit)
print("Exponential formula:", exp_formula)


Fitted exponential curve: [ 1584.90676772  1634.91837287  1686.5080902   1739.72571689
  1794.62262149  1851.25179349  1909.66789445  1969.92731075
  2032.08820807  2096.21058749  2162.35634343  2230.58932335
  2300.97538944  2373.58248217  2448.48068586  2525.74229631
  2605.44189065  2687.65639926  2772.46518006  2859.95009508
  2950.19558954  3043.2887733   3139.31950496  3238.3804786
  3340.56731327  3445.97864527  3554.71622334  3666.88500691
  3782.5932674   3901.95269271  4025.07849505  4152.08952215
  4283.10837196  4418.261511    4557.67939644  4701.496602
  4849.85194788  5002.88863473  5160.75438188  5323.60156994
  5491.58738788  5664.87398475  5843.6286262   6028.02385594
  6218.23766227  6414.45364991  6616.8612172   6825.65573895
  7041.03875498  7263.2181647   7492.40842777  7728.83077109
  7972.71340238  8224.29173042  8483.80859229  8751.51448781
  9027.66782125  9312.53515084  9606.39144603  9909.5203529
 10222.21446797 10544.7756206  10877.51516438 11220.7542776
 11

In [38]:
import plotly.graph_objs as go
import numpy as np
from scipy.optimize import curve_fit
import pandas as pd

# Select the rows labelled 'all models' and keep the weekly edges from
# 2020-04-30 onwards (model_time is a "%Y-%m-%d" string column, so the
# comparison is lexicographic).
is_all_models = model_number['model_type'] == 'all models'
all_models_data = model_number[is_all_models]
from_cutoff = all_models_data['model_time'] >= '2020-04-30'
all_models_data = all_models_data[from_cutoff]

# Line model fitted to the log-transformed counts
def linear_func(x, a, b):
    """Return ``a * x + b`` for slope ``a`` and intercept ``b``.

    ``x`` may be a scalar or an array; the result has the same shape.
    """
    scaled = a * x
    return scaled + b

# Fit a straight line to log(count) against the row position of each weekly
# edge; exponentiating the line gives the exponential trend drawn below.
y_data = all_models_data['model_number']
x_data = np.arange(0, len(all_models_data))
y_log_data = np.log(y_data)
popt, _ = curve_fit(linear_func, x_data, y_log_data)

# Fitted line in log space, then mapped back to counts
slope, intercept = popt
y_log_fit = linear_func(x_data, slope, intercept)
y_fit = np.exp(y_log_fit)

# Exponential formula: exp(intercept) is the scale, the slope is the rate.
exp_formula = f'y = {np.exp(popt[1]):.2f} * exp({popt[0]:.2f} * x)'

# Bars carry the observed cumulative counts; the dashed red line overlays the
# values of the fitted curve at the same weekly edges.
observed_bars = go.Bar(
    x=all_models_data['model_time'],
    y=all_models_data['model_number'],
    name="All model repositories",
)
fitted_line = go.Scatter(
    x=all_models_data['model_time'],
    y=y_fit,
    mode='lines',
    name='Exponential Fit',
    line=dict(color='red', dash='dash'),
)

# Traces are drawn in list order: bars first, fit on top.
fig = go.Figure(data=[observed_bars, fitted_line])


# popt[0] is the fitted slope of log(count) per step of x (one weekly edge).
# exp(slope) - 1 is the relative increase per step; ln(2) / slope is the
# number of steps needed to double.
log_slope = popt[0]
growth_rate = 100 * (np.exp(log_slope) - 1)
doubling_time = np.log(2) / log_slope

# Report both figures
print(f"Weekly growth rate: {growth_rate:.2f}%")
print(f"Doubling time: {doubling_time:.0f} weeks")

# Place the weekly growth rate as a text box near the bottom-right corner
# (x/y are given in "paper" coordinates rather than data coordinates).
growth_label = dict(
    x=0.99,
    y=0.01,
    xref="paper",
    yref="paper",
    text=f"Weekly growth rate: {growth_rate:.2f}%",
    showarrow=False,
    font=dict(size=16, color="#000000"),
    bgcolor="#ffffff",
    opacity=0.8,
)
fig.add_annotation(**growth_label)


# Fixed 600x400 canvas, black 14pt text, no x-axis or legend title, and the
# legend anchored inside the top-left corner of the plot.
layout_options = {
    'autosize': False,
    'width': 600,
    'height': 400,
    'font_size': 14,
    'font_color': "black",
    'xaxis_title': '',
    'yaxis_title': 'Total Number of Models',
    'legend_title_text': '',
    'legend': dict(yanchor="top", y=0.99, xanchor="left", x=0.01, itemsizing='constant'),
}
fig.update_layout(**layout_options)

fig.show()


import plotly.io as pio

# Save the figure at high resolution (scale=10 enlarges the 600x400 export tenfold)
# pio.write_image(fig, 'Fig1_exp.jpeg', width=600, height=400, scale=10)


Weekly growth rate: 3.16%
Doubling time: 22 weeks
