In [1]:
import re
import os
import subprocess
from arxiv import Search, Client

In [3]:
root = os.getcwd().split('utils')[0]
DOCS_PATH = os.path.join(root, "docs", "source", "en", "model_doc")
MODELS_PATH = os.path.join(root, "src/transformers/models")

def paper_link(model_name=None, path=None):
    
    if model_name != None and not model_name.endswith(".md"):
        model_name = f"{model_name}.md"
    file_path = path or os.path.join(DOCS_PATH, f"{model_name}")

    with open(file_path, 'r', encoding="utf-8") as f:
        content = f.read()
    paper_ids = re.findall(r"https://huggingface\.co/papers/(\d+\.\d+)", content)
    paper_ids += re.findall(r"https://hf\.co/papers/(\d+\.\d+)", content)
    if len(paper_ids) == 0:
        return None, 0
    return paper_ids[0], len(set(paper_ids))

In [4]:
info = {}

docs_list = os.listdir(DOCS_PATH)
docs_list.sort()
#print(docs_list)
for docs in docs_list:
    if docs.endswith(".md"):
        paper_ids, count = paper_link(docs)
        # if count > 0:
        info[docs] = {"paper_id": paper_ids, "count": count}

no_papers = [k for k, v in info.items() if v["paper_id"] is None]
many_papers = [k for k, v in info.items() if v["count"] > 1]
single_paper = [k for k, v in info.items() if v["count"] == 1]

In [7]:
single_paper

['aimv2.md',
 'albert.md',
 'align.md',
 'altclip.md',
 'aria.md',
 'autoformer.md',
 'aya_vision.md',
 'bart.md',
 'barthez.md',
 'bartpho.md',
 'bert-generation.md',
 'bert.md',
 'bertweet.md',
 'big_bird.md',
 'bigbird_pegasus.md',
 'biogpt.md',
 'bitnet.md',
 'blenderbot-small.md',
 'blip.md',
 'bloom.md',
 'bort.md',
 'bridgetower.md',
 'bros.md',
 'byt5.md',
 'camembert.md',
 'canine.md',
 'chameleon.md',
 'chinese_clip.md',
 'clap.md',
 'clip.md',
 'clipseg.md',
 'clvp.md',
 'code_llama.md',
 'codegen.md',
 'colpali.md',
 'colqwen2.md',
 'conditional_detr.md',
 'convbert.md',
 'convnext.md',
 'convnextv2.md',
 'cpm.md',
 'ctrl.md',
 'cvt.md',
 'd_fine.md',
 'dab-detr.md',
 'dac.md',
 'data2vec.md',
 'deberta-v2.md',
 'deberta.md',
 'decision_transformer.md',
 'deepseek_v2.md',
 'deepseek_v3.md',
 'deformable_detr.md',
 'deplot.md',
 'depth_anything.md',
 'depth_pro.md',
 'deta.md',
 'detr.md',
 'dialogpt.md',
 'diffllama.md',
 'dinat.md',
 'dinov2.md',
 'dinov2_with_registers.md

In [9]:
many_papers

['audio-spectrogram-transformer.md',
 'beit.md',
 'bit.md',
 'blenderbot.md',
 'blip-2.md',
 'deit.md',
 'depth_anything_v2.md',
 'encoder-decoder.md',
 'ernie.md',
 'granite_speech.md',
 'grounding-dino.md',
 'janus.md',
 'jukebox.md',
 'layoutlmv2.md',
 'layoutxlm.md',
 'llava.md',
 'mluke.md',
 'mobilenet_v2.md',
 'mobilevit.md',
 'mobilevitv2.md',
 'mvp.md',
 'nemotron.md',
 'paligemma.md',
 'perceiver.md',
 'pop2piano.md',
 'pvt_v2.md',
 'reformer.md',
 'regnet.md',
 'roberta-prelayernorm.md',
 'sam.md',
 'seamless_m4t_v2.md',
 'tvlt.md',
 'vitpose.md',
 'wav2vec2-conformer.md']

In [26]:
with_arxiv = []

In [36]:

for doc in many_papers:
    with open(os.path.join(DOCS_PATH, doc), 'r', encoding="utf-8") as f:
        content = f.read()
    if "https://arxiv.org" in content:
        with_arxiv.append(doc)

In [35]:
with_arxiv

['t5gemma.md', 'gemma3n.md', 'mixtral.md', 'umt5.md', 'vjepa2.md', 'xmod.md']

In [37]:
with_arxiv

['t5gemma.md',
 'gemma3n.md',
 'mixtral.md',
 'umt5.md',
 'vjepa2.md',
 'xmod.md',
 'nemotron.md']

In [5]:
len(single_paper), len(no_papers), len(many_papers)

(293, 50, 34)

In [6]:
"sam_hq" in single_paper, "sam_hq" in no_papers, "sam_hq" in many_papers

(False, False, False)

In [19]:
def get_first_commit_date(model_name=None):
    
    if model_name.endswith(".md"):
        model_name = f"{model_name[:-3]}"
        
    model_name_src = model_name
    if "-" in model_name:
        model_name_src = model_name.replace("-", "_")
    file_path = os.path.join(MODELS_PATH, model_name_src, "__init__.py")
    if not os.path.exists(file_path):
        file_path = os.path.join(DOCS_PATH, f"{model_name}.md")
    result = subprocess.check_output(
        ["git", "log", "--reverse", "--pretty=format:%ad", "--date=iso", file_path],
        text=True
    )
    return result.strip().split('\n')[0][:10]
# first_result = get_first_commit_date("dinov2.md")

def get_release_date(link):
    if link.startswith("https://huggingface.co/papers/"):
        link = link.replace("https://huggingface.co/papers/", "")
    client = Client()
    search = Search(id_list=[link])
    results = list(client.results(search))
    if len(results) != 0:
        return results[0].published.date().isoformat()
    else:
        return "No info found"

In [21]:
get_first_commit_date("barthez")

'2020-11-27'

In [7]:
no_marker = []

In [9]:
with open(os.path.join(DOCS_PATH, "sam_hq.md"), "r") as f:
    content = f.read()
marker = re.finditer(r"-->", content)
list(marker)

[]

In [54]:
len("*This model was released on")

27

In [21]:
def insert_dates(docs_list):
    for doc in docs_list:
        if not doc.endswith(".md"):
            doc = f"{doc}.md"
        file_path = os.path.join(DOCS_PATH, doc)
        paper_id, count = paper_link(path=file_path)
        if paper_id is not None:
            release_date = get_release_date(paper_id)
        else:
            print("no huggingface paper link found in", doc)
            release_date = "{release_date}"
        hf_commit_date = get_first_commit_date(model_name=doc)

        with open(file_path, 'r', encoding="utf-8") as f:
            content = f.read()
        markers = list(re.finditer(r"-->", content))
        if len(markers) == 0:
            print(f"No marker found in {doc}. Skipping.")
            no_marker.append(doc)
            continue

        # if paper_id is None:
        #     continue
        if doc == "auto.md" or doc == "timm_wrapper":
            continue

        insert_index = markers[0].end()
        date_info = f"\n*This model was released on {release_date} and added to Hugging Face Transformers on {hf_commit_date}.*"

        if date_info not in content:
            pattern = r"\n\*This model was released on .* and added to Hugging Face Transformers on .*\.\*"
            
            if re.search(pattern, content):
                content = re.sub(pattern, "", content)
                content = content[:insert_index] + date_info + content[insert_index:]
                with open(file_path, 'w', encoding="utf-8") as f:
                    f.write(content)
                print(f"Updated {doc} release and commit dates.")

            else:
                content = content[:insert_index] + date_info + content[insert_index:]
                with open(file_path, 'w', encoding="utf-8") as f:
                    f.write(content)
                print(f"Added {doc} release and commit dates.")   # This if else block can be shortened if diff messages are not needed for update and add

        else:
            print(f"{doc} already has the release and commit dates.")



# for the models that have only one paper, we can add the date of the first commit
# just iterate thru the list of these models
# extract the link with paper_link()
# then get_arxiv_info(link) and get first_commit_date(model_name)
# insert the info if it already does not exist

In [35]:
insert_dates(single_paper[180:])

qwen2_moe.md already has the release and commit dates.
qwen2_vl.md already has the release and commit dates.
rag.md already has the release and commit dates.
realm.md already has the release and commit dates.
rembert.md already has the release and commit dates.
resnet.md already has the release and commit dates.
roberta.md already has the release and commit dates.
roformer.md already has the release and commit dates.
rt_detr.md already has the release and commit dates.
rt_detr_v2.md already has the release and commit dates.
No marker found in sam_hq.md. Skipping.
Added seamless_m4t.md release and commit dates.
Added segformer.md release and commit dates.
Added seggpt.md release and commit dates.
Added sew-d.md release and commit dates.
Added sew.md release and commit dates.
Added shieldgemma2.md release and commit dates.
Added siglip.md release and commit dates.
Added siglip2.md release and commit dates.
Added speech-encoder-decoder.md release and commit dates.
Added speech_to_text.md 

In [36]:
insert_dates(many_papers)

Added audio-spectrogram-transformer.md release and commit dates.
Added beit.md release and commit dates.
Added bit.md release and commit dates.
Added blenderbot.md release and commit dates.
Added blip-2.md release and commit dates.
Added deit.md release and commit dates.
Added depth_anything_v2.md release and commit dates.
Added encoder-decoder.md release and commit dates.
Added ernie.md release and commit dates.
Added granite_speech.md release and commit dates.
Added grounding-dino.md release and commit dates.
Added janus.md release and commit dates.
Added jukebox.md release and commit dates.
Added layoutlmv2.md release and commit dates.
Added layoutxlm.md release and commit dates.
Added llava.md release and commit dates.
Added mluke.md release and commit dates.
Added mobilenet_v2.md release and commit dates.
Added mobilevit.md release and commit dates.
Added mobilevitv2.md release and commit dates.
Added mvp.md release and commit dates.
Added nemotron.md release and commit dates.
Add

In [37]:
insert_dates(no_papers)

No marker found in cohere.md. Skipping.
No marker found in cohere2.md. Skipping.


In [41]:
no_marker

['sam_hq.md', 'cohere.md', 'cohere2.md']

In [6]:
no_papers

['arcee.md',
 'auto.md',
 'bamba.md',
 'bark.md',
 'bert-japanese.md',
 'cohere.md',
 'cohere2.md',
 'cpmant.md',
 'csm.md',
 'dbrx.md',
 'dia.md',
 'doge.md',
 'esm.md',
 'falcon3.md',
 'falcon_h1.md',
 'flan-ul2.md',
 'fuyu.md',
 'gemma3n.md',
 'gpt-sw3.md',
 'gpt2.md',
 'gpt_neo.md',
 'gpt_neox_japanese.md',
 'gptj.md',
 'gptsan-japanese.md',
 'granitemoehybrid.md',
 'granitevision.md',
 'helium.md',
 'imagegpt.md',
 'kyutai_speech_to_text.md',
 'lfm2.md',
 'llama4.md',
 'llava_next.md',
 'mistral3.md',
 'mixtral.md',
 'mllama.md',
 'mpt.md',
 'open-llama.md',
 'openai-gpt.md',
 'persimmon.md',
 'retribert.md',
 'roc_bert.md',
 'rwkv.md',
 'smollm3.md',
 'stablelm.md',
 'time_series_transformer.md',
 'timm_wrapper.md',
 'umt5.md',
 'vjepa2.md',
 'voxtral.md',
 'xmod.md']

In [8]:
get_first_commit_date("arcee")

'2025-06-24'

In [7]:
insert_dates(["aimv2"])

Added aimv2.md release and commit dates.


In [20]:
no_marker = []

In [21]:
insert_dates(no_papers)

no huggingface paper link found in arcee.md
Updated arcee.md release and commit dates.
no huggingface paper link found in auto.md
Added auto.md release and commit dates.
no huggingface paper link found in bamba.md
Added bamba.md release and commit dates.
no huggingface paper link found in bark.md
Updated bark.md release and commit dates.
no huggingface paper link found in bert-japanese.md
Added bert-japanese.md release and commit dates.
no huggingface paper link found in bloom.md
Added bloom.md release and commit dates.
no huggingface paper link found in cohere.md
No marker found in cohere.md. Skipping.
no huggingface paper link found in cohere2.md
No marker found in cohere2.md. Skipping.
no huggingface paper link found in cpmant.md
Added cpmant.md release and commit dates.
no huggingface paper link found in csm.md
Added csm.md release and commit dates.
no huggingface paper link found in cvt.md
Added cvt.md release and commit dates.
no huggingface paper link found in dbrx.md
Added dbrx

In [24]:
insert_dates(["seamless_m4t_v2"])

Updated seamless_m4t_v2.md release and commit dates.


In [23]:
no_marker

['cohere.md', 'cohere2.md']

In [25]:
insert_dates(["qwen2_vl"])



Updated qwen2_vl.md release and commit dates.


In [12]:
get_first_commit_date("cohere2")

'2024-12-13'

In [None]:
get_first_commit_date("swin")

'2022-01-21'