# Listing tasks (using filters):

In [13]:
from huggingface_hub import list_datasets, list_models
import pandas as pd
import time

# Mapping the task list to Hugging Face task_categories
task_mapping = {
    "Q&A": "question-answering",
    "Reasoning & Multi-step Thinking": "reasoning",
    "Summarization": "summarization",
    "Cultural Alignment": "cultural-aligned",
    "Dialog/Conversation": "conversational",
    "Personal Ownership/System Prompt": "System Prompt",  
    "Robustness & Safety": "Safety",
    "Function Call": "function-call",  
    "Ethics, Bias, and Fairness": "bias-and-fairness",
    "Code Generation": "Code Generation",
    "Official Documentation": "documentation",
    "Translation": "translation"
}

all_rows = []

# Loop through each task
for user_task, hf_task in task_mapping.items():
    print(f"🔍 Processing task: {user_task} ({hf_task})")
    try:
        datasets_list = list_datasets(task_categories=hf_task, language="ar")
    except Exception as e:
        print(f" Failed to fetch for task {user_task}: {e}")
        continue

    for dataset in datasets_list:
        tags = dataset.tags if hasattr(dataset, "tags") else []

        try:
            license = [tag for tag in dataset.tags if "license" in tag][0].split(":")[-1]
        except:
            license = "none"
        try:
            models = len(list(list_models(filter=f"dataset:{dataset.id}")))
        except:
            models = "none"

        size = next((tag.split(":")[-1] for tag in tags if tag.startswith("size:")), "unknown")
        arxiv_link = next((tag.split(":", 1)[-1] for tag in tags if tag.startswith("arxiv:")), "none")



        all_rows.append({
            "Task": user_task,
            "Dataset ID": dataset.id,
            "Likes": dataset.likes,
            "Downloads": dataset.downloads,
            "Last Modified": dataset.lastModified,
            "License": license,
            "Models": models,
            "Size": size,
            "ArXiv": arxiv_link
        
        })

# Create and show DataFrame
df = pd.DataFrame(all_rows)
df


🔍 Processing task: Q&A (question-answering)
🔍 Processing task: Reasoning & Multi-step Thinking (reasoning)
🔍 Processing task: Summarization (summarization)
🔍 Processing task: Cultural Alignment (cultural-aligned)
🔍 Processing task: Dialog/Conversation (conversational)
🔍 Processing task: Personal Ownership/System Prompt (System Prompt)
🔍 Processing task: Robustness & Safety (Safety)
🔍 Processing task: Function Call (function-call)
🔍 Processing task: Ethics, Bias, and Fairness (bias-and-fairness)
🔍 Processing task: Code Generation (Code Generation)
🔍 Processing task: Official Documentation (documentation)
🔍 Processing task: Translation (translation)


Unnamed: 0,Task,Dataset ID,Likes,Downloads,Last Modified,License,Models,Size,ArXiv
0,Q&A,openai/MMMLU,486,5173,2024-10-16 18:39:00+00:00,mit,94,unknown,2009.03300
1,Q&A,google-research-datasets/tydiqa,31,1488,2024-08-08 05:57:11+00:00,apache-2.0,4,unknown,none
2,Q&A,neulab/PangeaInstruct,84,522,2025-02-02 16:40:32+00:00,apache-2.0,6,unknown,2410.16153
3,Q&A,hsseinmz/arcd,7,300,2024-01-09 12:44:24+00:00,mit,1,unknown,none
4,Q&A,mhardalov/exams,34,1320,2024-02-06 07:20:12+00:00,cc-by-sa-4.0,3,unknown,2011.03080
...,...,...,...,...,...,...,...,...,...
367,Translation,taha-alnasser/ArzEn-CodeMixed,0,19,2025-05-09 03:38:16+00:00,afl-3.0,0,unknown,none
368,Translation,Hamzah-Asadullah/TinyDS-20k,1,131,2025-06-09 09:22:53+00:00,mit,0,unknown,none
369,Translation,Groovy-123/deep-think,1,77,2025-05-22 06:06:36+00:00,other,1,unknown,none
370,Translation,liboaccn/nmt-parallel-corpus,1,74,2025-06-01 14:50:19+00:00,cc-by-nc-4.0,0,unknown,2505.14256


In [14]:
df['Task'].value_counts()

Task
Q&A              167
Translation      157
Summarization     48
Name: count, dtype: int64

# Listing tasks (using keywords):

## Cultural Alignment Task

In [15]:
from huggingface_hub import list_datasets, list_models
import pandas as pd

#  Filters
search_keywords = ["cultural", "culture", "cidar"]
required_tags = ["cultural-aligned"]
required_modality = "modality:text"

#  Get only Arabic datasets
datasets_list = list_datasets(language="ar")

dataset_rows = []

for dataset in datasets_list:
    dataset_id = dataset.id.lower()
    dataset_tags = dataset.tags or []

    #  Check for match in name or tag
    name_match = any(keyword.lower() in dataset_id for keyword in search_keywords)
    tag_match = any(tag in dataset_tags for tag in required_tags)

    #  Make sure it's a text dataset
    has_text_modality = required_modality in dataset_tags

    if has_text_modality and (name_match or tag_match):
        try:
            license = [tag for tag in dataset_tags if "license:" in tag][0].split(":")[-1]
        except:
            license = "none"
        try:
            models = len(list(list_models(filter=f"dataset:{dataset.id}")))
        except:
            models = "none"

        size = next((tag.split(":")[-1] for tag in dataset_tags if tag.startswith("size:")), "unknown")
        arxiv_link = next((tag.split(":", 1)[-1] for tag in dataset_tags if tag.startswith("arxiv:")), "none")

        print(dataset.id, dataset.likes, dataset.lastModified, license, models, size, arxiv_link)

        dataset_rows.append({
            "Dataset ID": dataset.id,
            "Likes": dataset.likes,
            "Downloads": dataset.downloads,
            "Last Modified": dataset.lastModified,
            "License": license,
            "Models": models,
            "Tags": ", ".join(dataset_tags),
            "Size": size,
            "ArXiv": arxiv_link
        })

# Create and show DataFrame
df = pd.DataFrame(dataset_rows)
df

MBZUAI/ArabCulture 11 2025-05-23 08:38:06+00:00 cc-by-nc-sa-4.0 0 unknown 2502.12788
FreedomIntelligence/ACVA-Arabic-Cultural-Value-Alignment 8 2023-09-21 12:39:18+00:00 apache-2.0 2 unknown none
arbml/CIDAR 50 2025-04-03 08:35:36+00:00 apache-2.0 3 unknown 2402.03177
arbml/CIDAR-EVAL-100 2 2024-02-14 15:46:10+00:00 apache-2.0 0 unknown 2402.03177
arbml/CIDAR-MCQ-100 4 2024-04-02 14:48:39+00:00 apache-2.0 0 unknown 2402.03177
QCRI/MultiNativQA 0 2024-10-25 10:59:30+00:00 cc-by-nc-sa-4.0 0 unknown 2407.09823
Omartificial-Intelligence-Space/ILMAAM-Arabic-Culturally-Aligned-MMLU 2 2025-02-20 17:22:43+00:00 apache-2.0 0 unknown none
HabibaAbderrahim/Tunisian-Proverbs-with-Image-Associations-A-Cultural-and-Linguistic-Dataset 0 2025-05-07 09:16:57+00:00 cc-by-4.0 0 unknown none
QCRI/SpokenNativQA 0 2025-05-29 23:03:37+00:00 cc-by-nc-sa-4.0 0 unknown 2505.19163


Unnamed: 0,Dataset ID,Likes,Downloads,Last Modified,License,Models,Tags,Size,ArXiv
0,MBZUAI/ArabCulture,11,822,2025-05-23 08:38:06+00:00,cc-by-nc-sa-4.0,0,"task_categories:multiple-choice, task_categori...",unknown,2502.12788
1,FreedomIntelligence/ACVA-Arabic-Cultural-Value...,8,132,2023-09-21 12:39:18+00:00,apache-2.0,2,"language:ar, license:apache-2.0, size_categori...",unknown,none
2,arbml/CIDAR,50,261,2025-04-03 08:35:36+00:00,apache-2.0,3,"task_categories:text-generation, language:ar, ...",unknown,2402.03177
3,arbml/CIDAR-EVAL-100,2,24,2024-02-14 15:46:10+00:00,apache-2.0,0,"task_categories:text-generation, language:ar, ...",unknown,2402.03177
4,arbml/CIDAR-MCQ-100,4,69,2024-04-02 14:48:39+00:00,apache-2.0,0,"task_categories:multiple-choice, language:ar, ...",unknown,2402.03177
5,QCRI/MultiNativQA,0,293,2024-10-25 10:59:30+00:00,cc-by-nc-sa-4.0,0,"task_categories:question-answering, language:a...",unknown,2407.09823
6,Omartificial-Intelligence-Space/ILMAAM-Arabic-...,2,31,2025-02-20 17:22:43+00:00,apache-2.0,0,"language:ar, license:apache-2.0, size_categori...",unknown,none
7,HabibaAbderrahim/Tunisian-Proverbs-with-Image-...,0,546,2025-05-07 09:16:57+00:00,cc-by-4.0,0,"task_categories:text2text-generation, task_cat...",unknown,none
8,QCRI/SpokenNativQA,0,142,2025-05-29 23:03:37+00:00,cc-by-nc-sa-4.0,0,"task_categories:question-answering, language:a...",unknown,2505.19163


## Reasoning & Multi-step Thinking Task

In [16]:
from huggingface_hub import list_datasets, list_models
import pandas as pd

# Filters
search_keywords = ["Reasoning", "Multi-step reasoning"]
required_modality = "modality:text"

# Get only Arabic datasets
datasets_list = list_datasets(language="ar")

dataset_rows = []

for dataset in datasets_list:
    dataset_id = dataset.id.lower()
    dataset_tags = dataset.tags or []

    # Check for match in name 
    name_match = any(keyword.lower() in dataset_id for keyword in search_keywords)
    

    # Make sure it's a text dataset
    has_text_modality = required_modality in dataset_tags

    if has_text_modality and (name_match ):
        try:
            license = [tag for tag in dataset_tags if "license:" in tag][0].split(":")[-1]
        except:
            license = "none"
        try:
            models = len(list(list_models(filter=f"dataset:{dataset.id}")))
        except:
            models = "none"

        size = next((tag.split(":")[-1] for tag in dataset_tags if tag.startswith("size:")), "unknown")
        arxiv_link = next((tag.split(":", 1)[-1] for tag in dataset_tags if tag.startswith("arxiv:")), "none")

        print(dataset.id, dataset.likes, dataset.lastModified, license, models, size, arxiv_link)

        dataset_rows.append({
            "Dataset ID": dataset.id,
            "Likes": dataset.likes,
            "Downloads": dataset.downloads,
            "Last Modified": dataset.lastModified,
            "License": license,
            "Models": models,
            "Size": size,
            "ArXiv": arxiv_link
        })

# Create and show DataFrame
df = pd.DataFrame(dataset_rows)
df

beetleware/arabic-reasoning-dataset-logic 8 2025-05-21 11:02:11+00:00 mit 0 unknown none
MohammedNasser/ARabic_Reasoning_QA 5 2024-09-07 23:00:07+00:00 apache-2.0 1 unknown none
MohammedNasser/Arabic_Reasoning_Instruct_QA 2 2024-09-10 07:30:48+00:00 apache-2.0 1 unknown none
Omartificial-Intelligence-Space/Arabic_Reasoning_Dataset 9 2024-12-01 08:13:05+00:00 apache-2.0 4 unknown none
lightblue/reasoning-multilingual-R1-Llama-70B-train 36 2025-01-31 07:04:20+00:00 apache-2.0 21 unknown none
Jr23xd23/Arabic-Optimized-Reasoning-Dataset 2 2025-02-25 13:29:24+00:00 apache-2.0 0 unknown none
Pinkstack/OpenHumanreasoning-multilingual-2.2k 2 2025-03-03 17:16:04+00:00 apache-2.0 0 unknown none
miscovery/Math_CoT_Arabic_English_Reasoning 16 2025-05-12 00:14:13+00:00 mit 1 unknown none


Unnamed: 0,Dataset ID,Likes,Downloads,Last Modified,License,Models,Size,ArXiv
0,beetleware/arabic-reasoning-dataset-logic,8,73,2025-05-21 11:02:11+00:00,mit,0,unknown,none
1,MohammedNasser/ARabic_Reasoning_QA,5,13,2024-09-07 23:00:07+00:00,apache-2.0,1,unknown,none
2,MohammedNasser/Arabic_Reasoning_Instruct_QA,2,43,2024-09-10 07:30:48+00:00,apache-2.0,1,unknown,none
3,Omartificial-Intelligence-Space/Arabic_Reasoni...,9,112,2024-12-01 08:13:05+00:00,apache-2.0,4,unknown,none
4,lightblue/reasoning-multilingual-R1-Llama-70B-...,36,102,2025-01-31 07:04:20+00:00,apache-2.0,21,unknown,none
5,Jr23xd23/Arabic-Optimized-Reasoning-Dataset,2,34,2025-02-25 13:29:24+00:00,apache-2.0,0,unknown,none
6,Pinkstack/OpenHumanreasoning-multilingual-2.2k,2,35,2025-03-03 17:16:04+00:00,apache-2.0,0,unknown,none
7,miscovery/Math_CoT_Arabic_English_Reasoning,16,264,2025-05-12 00:14:13+00:00,mit,1,unknown,none


## Dialog/Conversation Task

In [17]:
from huggingface_hub import list_datasets, list_models
import pandas as pd

# Filters
search_keywords = ["Dialog", "Conversation"]  
required_modality = "modality:text"

# Get only Arabic datasets
datasets_list = list_datasets(language="ar")

dataset_rows = []

for dataset in datasets_list:
    dataset_id = dataset.id.lower()
    dataset_tags = dataset.tags or []

    # Check for match in name 
    name_match = any(keyword.lower() in dataset_id for keyword in search_keywords)
    

    # Make sure it's a text dataset
    has_text_modality = required_modality in dataset_tags

    if has_text_modality and (name_match ):
        try:
            license = [tag for tag in dataset_tags if "license:" in tag][0].split(":")[-1]
        except:
            license = "none"
        try:
            models = len(list(list_models(filter=f"dataset:{dataset.id}")))
        except:
            models = "none"

        size = next((tag.split(":")[-1] for tag in dataset_tags if tag.startswith("size:")), "unknown")
        arxiv_link = next((tag.split(":", 1)[-1] for tag in dataset_tags if tag.startswith("arxiv:")), "none")

        print(dataset.id, dataset.likes, dataset.lastModified, license, models, size, arxiv_link)

        dataset_rows.append({
            "Dataset ID": dataset.id,
            "Likes": dataset.likes,
            "Downloads": dataset.downloads,
            "Last Modified": dataset.lastModified,
            "License": license,
            "Models": models,
            "Size": size,
            "ArXiv": arxiv_link
        })

# Create and show DataFrame
df = pd.DataFrame(dataset_rows)
df

mohamedemam/Arabic-samsum-dialogsum 1 2023-09-11 14:35:29+00:00 cc-by-nc-2.0 1 unknown 1911.12237
m-ric/Open_Assistant_Conversation_Chains 6 2023-11-22 14:37:58+00:00 apache-2.0 0 unknown none
premio-ai/TheArabicPile_Conversational 1 2024-03-21 21:42:33+00:00 cc-by-nc-4.0 0 unknown none
Mars203020/arabic_medical_dialogue 3 2024-06-29 11:49:41+00:00 mit 0 unknown none
willwade/AACConversations 0 2025-05-15 10:37:50+00:00 cc-by-4.0 0 unknown none


Unnamed: 0,Dataset ID,Likes,Downloads,Last Modified,License,Models,Size,ArXiv
0,mohamedemam/Arabic-samsum-dialogsum,1,41,2023-09-11 14:35:29+00:00,cc-by-nc-2.0,1,unknown,1911.12237
1,m-ric/Open_Assistant_Conversation_Chains,6,40,2023-11-22 14:37:58+00:00,apache-2.0,0,unknown,none
2,premio-ai/TheArabicPile_Conversational,1,29,2024-03-21 21:42:33+00:00,cc-by-nc-4.0,0,unknown,none
3,Mars203020/arabic_medical_dialogue,3,111,2024-06-29 11:49:41+00:00,mit,0,unknown,none
4,willwade/AACConversations,0,63,2025-05-15 10:37:50+00:00,cc-by-4.0,0,unknown,none


## Personal Ownership/System Prompt Task

In [18]:
from huggingface_hub import list_datasets, list_models
import pandas as pd

# Filters
search_keywords = ["system prompt", "persona"]  
required_modality = "modality:text"

# Get only Arabic datasets
datasets_list = list_datasets(language="ar")

dataset_rows = []

for dataset in datasets_list:
    dataset_id = dataset.id.lower()
    dataset_tags = dataset.tags or []

    # Check for match in name 
    name_match = any(keyword.lower() in dataset_id for keyword in search_keywords)
   

    # Make sure it's a text dataset
    has_text_modality = required_modality in dataset_tags

    if has_text_modality and (name_match):
        try:
            license = [tag for tag in dataset_tags if "license:" in tag][0].split(":")[-1]
        except:
            license = "none"
        try:
            models = len(list(list_models(filter=f"dataset:{dataset.id}")))
        except:
            models = "none"

        size = next((tag.split(":")[-1] for tag in dataset_tags if tag.startswith("size:")), "unknown")
        arxiv_link = next((tag.split(":", 1)[-1] for tag in dataset_tags if tag.startswith("arxiv:")), "none")

        print(dataset.id, dataset.likes, dataset.lastModified, license, models, size, arxiv_link)

        dataset_rows.append({
            "Dataset ID": dataset.id,
            "Likes": dataset.likes,
            "Downloads": dataset.downloads,
            "Last Modified": dataset.lastModified,
            "License": license,
            "Models": models,
            "Size": size,
            "ArXiv": arxiv_link
        })

# Create and show DataFrame
df = pd.DataFrame(dataset_rows)
df

## Robustness & Safety Task

In [19]:
from huggingface_hub import list_datasets, list_models
import pandas as pd

# Filters
search_keywords = ["Robustness", "Safety", "Toxicity", "jailbreak"]  
required_modality = "modality:text"

# Get only Arabic datasets
datasets_list = list_datasets(language="ar")

dataset_rows = []

for dataset in datasets_list:
    dataset_id = dataset.id.lower()
    dataset_tags = dataset.tags or []

    # Check for match in name 
    name_match = any(keyword.lower() in dataset_id for keyword in search_keywords)
  

    #  Make sure it's a text dataset
    has_text_modality = required_modality in dataset_tags

    if has_text_modality and (name_match):
        try:
            license = [tag for tag in dataset_tags if "license:" in tag][0].split(":")[-1]
        except:
            license = "none"
        try:
            models = len(list(list_models(filter=f"dataset:{dataset.id}")))
        except:
            models = "none"

        size = next((tag.split(":")[-1] for tag in dataset_tags if tag.startswith("size:")), "unknown")
        arxiv_link = next((tag.split(":", 1)[-1] for tag in dataset_tags if tag.startswith("arxiv:")), "none")

        print(dataset.id, dataset.likes, dataset.lastModified, license, models, size, arxiv_link)

        dataset_rows.append({
            "Dataset ID": dataset.id,
            "Likes": dataset.likes,
            "Downloads": dataset.downloads,
            "Last Modified": dataset.lastModified,
            "License": license,
            "Models": models,
            "Size": size,
            "ArXiv": arxiv_link
        })

# Create and show DataFrame
df = pd.DataFrame(dataset_rows)
df

textdetox/multilingual_toxicity_dataset 27 2025-03-21 18:52:31+00:00 openrail++ 8 unknown none
ToxicityPrompts/PolygloToxicityPrompts 11 2024-05-16 07:02:28+00:00 none 0 unknown 2405.09373
luizapzbn/from-one-to-many-toxicity-mitigation 0 2024-05-24 17:09:53+00:00 apache-2.0 0 unknown 2403.03893
textdetox/multilingual_toxicity_explained 1 2025-02-04 21:03:23+00:00 openrail++ 1 unknown 2412.11691
ToxicityPrompts/PolyGuardMix 1 2025-05-16 23:35:00+00:00 cc-by-4.0 0 unknown 2504.04377
ToxicityPrompts/PolyGuardPrompts 0 2025-04-18 03:21:04+00:00 none 0 unknown 2504.04377
Malikeh1375/tokenizer-robustness-mmlu 0 2025-05-25 04:40:10+00:00 cc-by-4.0 0 unknown none
Malikeh1375/code-switching-tokenizer-robustness 1 2025-05-24 04:28:43+00:00 cc-by-4.0 0 unknown none
gravitee-io/textdetox-multilingual-toxicity-dataset 0 2025-05-27 12:10:42+00:00 openrail++ 1 unknown none


Unnamed: 0,Dataset ID,Likes,Downloads,Last Modified,License,Models,Size,ArXiv
0,textdetox/multilingual_toxicity_dataset,27,797,2025-03-21 18:52:31+00:00,openrail++,8,unknown,none
1,ToxicityPrompts/PolygloToxicityPrompts,11,261,2024-05-16 07:02:28+00:00,none,0,unknown,2405.09373
2,luizapzbn/from-one-to-many-toxicity-mitigation,0,193,2024-05-24 17:09:53+00:00,apache-2.0,0,unknown,2403.03893
3,textdetox/multilingual_toxicity_explained,1,129,2025-02-04 21:03:23+00:00,openrail++,1,unknown,2412.11691
4,ToxicityPrompts/PolyGuardMix,1,231,2025-05-16 23:35:00+00:00,cc-by-4.0,0,unknown,2504.04377
5,ToxicityPrompts/PolyGuardPrompts,0,134,2025-04-18 03:21:04+00:00,none,0,unknown,2504.04377
6,Malikeh1375/tokenizer-robustness-mmlu,0,309,2025-05-25 04:40:10+00:00,cc-by-4.0,0,unknown,none
7,Malikeh1375/code-switching-tokenizer-robustness,1,249,2025-05-24 04:28:43+00:00,cc-by-4.0,0,unknown,none
8,gravitee-io/textdetox-multilingual-toxicity-da...,0,78,2025-05-27 12:10:42+00:00,openrail++,1,unknown,none


## Ethics, Bias, and Fairness Task

In [20]:
from huggingface_hub import list_datasets, list_models
import pandas as pd

# Filters
search_keywords = ["Ethics", "Bias", "Fairness"]  
required_modality = "modality:text"

# Get only Arabic datasets
datasets_list = list_datasets(language="ar")

dataset_rows = []

for dataset in datasets_list:
    dataset_id = dataset.id.lower()
    dataset_tags = dataset.tags or []

    # Check for match in name 
    name_match = any(keyword.lower() in dataset_id for keyword in search_keywords)
    

    # Make sure it's a text dataset
    has_text_modality = required_modality in dataset_tags

    if has_text_modality and (name_match):
        try:
            license = [tag for tag in dataset_tags if "license:" in tag][0].split(":")[-1]
        except:
            license = "none"
        try:
            models = len(list(list_models(filter=f"dataset:{dataset.id}")))
        except:
            models = "none"

        size = next((tag.split(":")[-1] for tag in dataset_tags if tag.startswith("size:")), "unknown")
        arxiv_link = next((tag.split(":", 1)[-1] for tag in dataset_tags if tag.startswith("arxiv:")), "none")

        print(dataset.id, dataset.likes, dataset.lastModified, license, models, size, arxiv_link)

        dataset_rows.append({
            "Dataset ID": dataset.id,
            "Likes": dataset.likes,
            "Downloads": dataset.downloads,
            "Last Modified": dataset.lastModified,
            "License": license,
            "Models": models,
            "Size": size,
            "ArXiv": arxiv_link
        })

# Create and show DataFrame
df = pd.DataFrame(dataset_rows)
df

LanguageShades/BiasShades 16 2025-05-03 23:25:04+00:00 none 0 unknown none


Unnamed: 0,Dataset ID,Likes,Downloads,Last Modified,License,Models,Size,ArXiv
0,LanguageShades/BiasShades,16,203,2025-05-03 23:25:04+00:00,none,0,unknown,none


## Code Generation Task

In [21]:
from huggingface_hub import list_datasets, list_models
import pandas as pd

#  Filters
search_keywords = ["code generation"]
required_modality = "modality:text"

#  Get only Arabic datasets
datasets_list = list_datasets(language="ar")

dataset_rows = []

for dataset in datasets_list:
    dataset_id = dataset.id.lower()
    dataset_tags = dataset.tags or []

    # Check for match in name 
    name_match = any(keyword.lower() in dataset_id for keyword in search_keywords)
  

    # Make sure it's a text dataset
    has_text_modality = required_modality in dataset_tags

    if has_text_modality and (name_match):
        try:
            license = [tag for tag in dataset_tags if "license:" in tag][0].split(":")[-1]
        except:
            license = "none"
        try:
            models = len(list(list_models(filter=f"dataset:{dataset.id}")))
        except:
            models = "none"

        size = next((tag.split(":")[-1] for tag in dataset_tags if tag.startswith("size:")), "unknown")
        arxiv_link = next((tag.split(":", 1)[-1] for tag in dataset_tags if tag.startswith("arxiv:")), "none")

        print(dataset.id, dataset.likes, dataset.lastModified, license, models, size, arxiv_link)

        dataset_rows.append({
            "Dataset ID": dataset.id,
            "Likes": dataset.likes,
            "Downloads": dataset.downloads,
            "Last Modified": dataset.lastModified,
            "License": license,
            "Models": models,
            "Size": size,
            "ArXiv": arxiv_link
        })

# Create and show DataFrame
df = pd.DataFrame(dataset_rows)
df

## Official Documentation Task

In [22]:
from huggingface_hub import list_datasets, list_models
import pandas as pd

# Filters
search_keywords = ["Documentation", "Official Documentation"]
required_modality = "modality:text"

# Get only Arabic datasets
datasets_list = list_datasets(language="ar")

dataset_rows = []

for dataset in datasets_list:
    dataset_id = dataset.id.lower()
    dataset_tags = dataset.tags or []

    # Check for match in name 
    name_match = any(keyword.lower() in dataset_id for keyword in search_keywords)
   
    # Make sure it's a text dataset
    has_text_modality = required_modality in dataset_tags

    if has_text_modality and (name_match):
        try:
            license = [tag for tag in dataset_tags if "license:" in tag][0].split(":")[-1]
        except:
            license = "none"
        try:
            models = len(list(list_models(filter=f"dataset:{dataset.id}")))
        except:
            models = "none"

        size = next((tag.split(":")[-1] for tag in dataset_tags if tag.startswith("size:")), "unknown")
        arxiv_link = next((tag.split(":", 1)[-1] for tag in dataset_tags if tag.startswith("arxiv:")), "none")

        print(dataset.id, dataset.likes, dataset.lastModified, license, models, size, arxiv_link)

        dataset_rows.append({
            "Dataset ID": dataset.id,
            "Likes": dataset.likes,
            "Downloads": dataset.downloads,
            "Last Modified": dataset.lastModified,
            "License": license,
            "Models": models,
            "Size": size,
            "ArXiv": arxiv_link
        })

# Create and show DataFrame
df = pd.DataFrame(dataset_rows)
df

## Function Call Task

In [23]:
from huggingface_hub import list_datasets, list_models
import pandas as pd

#  Filters
search_keywords = ["Function Call"]
required_modality = "modality:text"

# Get only Arabic datasets
datasets_list = list_datasets(language="ar")

dataset_rows = []

for dataset in datasets_list:
    dataset_id = dataset.id.lower()
    dataset_tags = dataset.tags or []

    # Check for match in name 
    name_match = any(keyword.lower() in dataset_id for keyword in search_keywords)
   

    # Make sure it's a text dataset
    has_text_modality = required_modality in dataset_tags

    if has_text_modality and (name_match):
        try:
            license = [tag for tag in dataset_tags if "license:" in tag][0].split(":")[-1]
        except:
            license = "none"
        try:
            models = len(list(list_models(filter=f"dataset:{dataset.id}")))
        except:
            models = "none"

        size = next((tag.split(":")[-1] for tag in dataset_tags if tag.startswith("size:")), "unknown")
        arxiv_link = next((tag.split(":", 1)[-1] for tag in dataset_tags if tag.startswith("arxiv:")), "none")

        print(dataset.id, dataset.likes, dataset.lastModified, license, models, size, arxiv_link)

        dataset_rows.append({
            "Dataset ID": dataset.id,
            "Likes": dataset.likes,
            "Downloads": dataset.downloads,
            "Last Modified": dataset.lastModified,
            "License": license,
            "Models": models,
            "Size": size,
            "ArXiv": arxiv_link
        })

# Create and show DataFrame
df = pd.DataFrame(dataset_rows)
df