# Listing tasks (using filters):

In [None]:
from huggingface_hub import list_datasets, list_models
import pandas as pd
import time

# Map each user-facing task name to the value passed as the Hub's
# `task_categories` filter.
# NOTE(review): several values ("System Prompt", "Safety", "Code Generation",
# "cultural-aligned", ...) do not look like standard Hub task categories and
# will probably match nothing — confirm against the Hub's task list.
task_mapping = {
    "Q&A": "question-answering",
    "Reasoning & Multi-step Thinking": "reasoning",
    "Summarization": "summarization",
    "Cultural Alignment": "cultural-aligned",
    "Dialog/Conversation": "conversational",
    "Personal Ownership/System Prompt": "System Prompt",
    "Robustness & Safety": "Safety",
    "Function Call": "function-call",
    "Ethics, Bias, and Fairness": "bias-and-fairness",
    "Code Generation": "Code Generation",
    "Official Documentation": "documentation",
    "Translation": "translation",
}

# Column order of the result table. Passing it to the DataFrame constructor
# keeps the headers (notably "Task") even when no dataset is found at all.
result_columns = [
    "Task", "Dataset ID", "Likes", "Downloads", "Last Modified",
    "License", "Models", "Size", "ArXiv",
]


def _tag_value(tags, prefixes, default):
    """Return the text after the first ':' of the first tag starting with *prefixes*.

    *prefixes* is a string or a tuple of strings (as accepted by
    ``str.startswith``); *default* is returned when no tag matches.
    """
    return next(
        (tag.split(":", 1)[1] for tag in tags if tag.startswith(prefixes)),
        default,
    )


all_rows = []

# Loop through each task
for user_task, hf_task in task_mapping.items():
    print(f"🔍 Processing task: {user_task} ({hf_task})")
    try:
        # Materialise the results inside the `try` so that an error raised
        # while the results are being iterated is handled here as well.
        datasets_list = list(list_datasets(task_categories=hf_task, language="ar"))
    except Exception as e:
        print(f" Failed to fetch for task {user_task}: {e}")
        continue

    for dataset in datasets_list:
        # `tags` may be missing or None; normalise to a list so the lookups
        # below never fail.
        tags = getattr(dataset, "tags", None) or []

        license_name = _tag_value(tags, "license:", "none")
        # The Hub exposes dataset size as a "size_categories:<bucket>" tag;
        # the original "size:" prefix is still accepted.
        size = _tag_value(tags, ("size_categories:", "size:"), "unknown")
        arxiv_link = _tag_value(tags, "arxiv:", "none")

        try:
            # One extra Hub request per dataset: count the models that
            # declare this dataset.
            models = sum(1 for _ in list_models(filter=f"dataset:{dataset.id}"))
        except Exception:
            # Best effort: keep the row and mark the count as unavailable.
            # (`Exception` rather than a bare `except` so Ctrl-C still works.)
            models = "none"

        all_rows.append({
            "Task": user_task,
            "Dataset ID": dataset.id,
            "Likes": dataset.likes,
            "Downloads": dataset.downloads,
            "Last Modified": dataset.lastModified,
            "License": license_name,
            "Models": models,
            "Size": size,
            "ArXiv": arxiv_link,
        })

# Create and show DataFrame
df = pd.DataFrame(all_rows, columns=result_columns)
df

In [None]:
# Number of collected rows per task, as counted by pandas' value_counts().
task_column = df["Task"]
task_column.value_counts()

# Listing tasks (using keywords):

## Cultural Alignment Task

In [None]:
from huggingface_hub import list_datasets, list_models
import pandas as pd

# Filters: a dataset is kept when it is a text dataset AND either its id
# contains one of the keywords or it carries one of the required tags.
search_keywords = ["cultural", "culture", "cidar"]
required_tags = ["cultural-aligned"]
required_modality = "modality:text"

# Column order of the result table; passing it to the DataFrame constructor
# keeps the headers even when no dataset matches.
result_columns = [
    "Dataset ID", "Likes", "Downloads", "Last Modified",
    "License", "Models", "Tags", "Size", "ArXiv",
]


def _tag_value(tags, prefixes, default):
    """Return the text after the first ':' of the first tag starting with *prefixes*.

    *prefixes* is a string or a tuple of strings (as accepted by
    ``str.startswith``); *default* is returned when no tag matches.
    """
    return next(
        (tag.split(":", 1)[1] for tag in tags if tag.startswith(prefixes)),
        default,
    )


# Get only Arabic datasets
datasets_list = list_datasets(language="ar")

lowered_keywords = [keyword.lower() for keyword in search_keywords]
dataset_rows = []

for dataset in datasets_list:
    dataset_tags = dataset.tags or []

    # Make sure it's a text dataset
    if required_modality not in dataset_tags:
        continue

    # Check for match in name or tag
    dataset_id = dataset.id.lower()
    name_match = any(keyword in dataset_id for keyword in lowered_keywords)
    tag_match = any(tag in dataset_tags for tag in required_tags)
    if not (name_match or tag_match):
        continue

    license_name = _tag_value(dataset_tags, "license:", "none")
    # The Hub exposes dataset size as a "size_categories:<bucket>" tag; the
    # original "size:" prefix is still accepted.
    size = _tag_value(dataset_tags, ("size_categories:", "size:"), "unknown")
    arxiv_link = _tag_value(dataset_tags, "arxiv:", "none")

    try:
        # One extra Hub request per matching dataset.
        models = sum(1 for _ in list_models(filter=f"dataset:{dataset.id}"))
    except Exception:
        # Best effort: keep the row and mark the count as unavailable.
        # (`Exception` rather than a bare `except` so Ctrl-C still works.)
        models = "none"

    print(dataset.id, dataset.likes, dataset.lastModified, license_name, models, size, arxiv_link)

    dataset_rows.append({
        "Dataset ID": dataset.id,
        "Likes": dataset.likes,
        "Downloads": dataset.downloads,
        "Last Modified": dataset.lastModified,
        "License": license_name,
        "Models": models,
        "Tags": ", ".join(dataset_tags),
        "Size": size,
        "ArXiv": arxiv_link,
    })

# Create and show DataFrame
df = pd.DataFrame(dataset_rows, columns=result_columns)
df

## Reasoning & Multi-step Thinking Task

In [None]:
from huggingface_hub import list_datasets, list_models
import pandas as pd

# Filters: a dataset is kept when it is a text dataset whose id mentions one
# of the keywords.
search_keywords = ["Reasoning", "Multi-step reasoning"]
required_modality = "modality:text"

# Column order of the result table; passing it to the DataFrame constructor
# keeps the headers even when no dataset matches.
result_columns = [
    "Dataset ID", "Likes", "Downloads", "Last Modified",
    "License", "Models", "Size", "ArXiv",
]


def _normalize(text):
    """Lowercase *text* and turn '-', '_', '.' and whitespace runs into single spaces.

    Applied to both keywords and dataset ids so that a multi-word keyword
    such as "Multi-step reasoning" can match an id like "org/multi_step_reasoning".
    """
    for separator in "-_.":
        text = text.replace(separator, " ")
    return " ".join(text.lower().split())


def _tag_value(tags, prefixes, default):
    """Return the text after the first ':' of the first tag starting with *prefixes*.

    *prefixes* is a string or a tuple of strings (as accepted by
    ``str.startswith``); *default* is returned when no tag matches.
    """
    return next(
        (tag.split(":", 1)[1] for tag in tags if tag.startswith(prefixes)),
        default,
    )


# Get only Arabic datasets
datasets_list = list_datasets(language="ar")

normalized_keywords = [_normalize(keyword) for keyword in search_keywords]
dataset_rows = []

for dataset in datasets_list:
    dataset_tags = dataset.tags or []

    # Make sure it's a text dataset
    if required_modality not in dataset_tags:
        continue

    # Check for match in name
    dataset_id = _normalize(dataset.id)
    if not any(keyword in dataset_id for keyword in normalized_keywords):
        continue

    license_name = _tag_value(dataset_tags, "license:", "none")
    # The Hub exposes dataset size as a "size_categories:<bucket>" tag; the
    # original "size:" prefix is still accepted.
    size = _tag_value(dataset_tags, ("size_categories:", "size:"), "unknown")
    arxiv_link = _tag_value(dataset_tags, "arxiv:", "none")

    try:
        # One extra Hub request per matching dataset.
        models = sum(1 for _ in list_models(filter=f"dataset:{dataset.id}"))
    except Exception:
        # Best effort: keep the row and mark the count as unavailable.
        # (`Exception` rather than a bare `except` so Ctrl-C still works.)
        models = "none"

    print(dataset.id, dataset.likes, dataset.lastModified, license_name, models, size, arxiv_link)

    dataset_rows.append({
        "Dataset ID": dataset.id,
        "Likes": dataset.likes,
        "Downloads": dataset.downloads,
        "Last Modified": dataset.lastModified,
        "License": license_name,
        "Models": models,
        "Size": size,
        "ArXiv": arxiv_link,
    })

# Create and show DataFrame
df = pd.DataFrame(dataset_rows, columns=result_columns)
df

## Dialog/Conversation Task

In [None]:
from huggingface_hub import list_datasets, list_models
import pandas as pd

# Filters: a dataset is kept when it is a text dataset whose id contains one
# of the keywords (case-insensitive).
search_keywords = ["Dialog", "Conversation"]
required_modality = "modality:text"

# Column order of the result table; passing it to the DataFrame constructor
# keeps the headers even when no dataset matches.
result_columns = [
    "Dataset ID", "Likes", "Downloads", "Last Modified",
    "License", "Models", "Size", "ArXiv",
]


def _tag_value(tags, prefixes, default):
    """Return the text after the first ':' of the first tag starting with *prefixes*.

    *prefixes* is a string or a tuple of strings (as accepted by
    ``str.startswith``); *default* is returned when no tag matches.
    """
    return next(
        (tag.split(":", 1)[1] for tag in tags if tag.startswith(prefixes)),
        default,
    )


# Get only Arabic datasets
datasets_list = list_datasets(language="ar")

lowered_keywords = [keyword.lower() for keyword in search_keywords]
dataset_rows = []

for dataset in datasets_list:
    dataset_tags = dataset.tags or []

    # Make sure it's a text dataset
    if required_modality not in dataset_tags:
        continue

    # Check for match in name
    dataset_id = dataset.id.lower()
    if not any(keyword in dataset_id for keyword in lowered_keywords):
        continue

    license_name = _tag_value(dataset_tags, "license:", "none")
    # The Hub exposes dataset size as a "size_categories:<bucket>" tag; the
    # original "size:" prefix is still accepted.
    size = _tag_value(dataset_tags, ("size_categories:", "size:"), "unknown")
    arxiv_link = _tag_value(dataset_tags, "arxiv:", "none")

    try:
        # One extra Hub request per matching dataset.
        models = sum(1 for _ in list_models(filter=f"dataset:{dataset.id}"))
    except Exception:
        # Best effort: keep the row and mark the count as unavailable.
        # (`Exception` rather than a bare `except` so Ctrl-C still works.)
        models = "none"

    print(dataset.id, dataset.likes, dataset.lastModified, license_name, models, size, arxiv_link)

    dataset_rows.append({
        "Dataset ID": dataset.id,
        "Likes": dataset.likes,
        "Downloads": dataset.downloads,
        "Last Modified": dataset.lastModified,
        "License": license_name,
        "Models": models,
        "Size": size,
        "ArXiv": arxiv_link,
    })

# Create and show DataFrame
df = pd.DataFrame(dataset_rows, columns=result_columns)
df

## Personal Ownership/System Prompt Task

In [None]:
from huggingface_hub import list_datasets, list_models
import pandas as pd

# Filters: a dataset is kept when it is a text dataset whose id mentions one
# of the keywords.
search_keywords = ["system prompt", "persona"]
required_modality = "modality:text"

# Column order of the result table; passing it to the DataFrame constructor
# keeps the headers even when no dataset matches.
result_columns = [
    "Dataset ID", "Likes", "Downloads", "Last Modified",
    "License", "Models", "Size", "ArXiv",
]


def _normalize(text):
    """Lowercase *text* and turn '-', '_', '.' and whitespace runs into single spaces.

    Applied to both keywords and dataset ids so that a multi-word keyword
    such as "system prompt" can match an id like "org/system-prompt-ar".
    """
    for separator in "-_.":
        text = text.replace(separator, " ")
    return " ".join(text.lower().split())


def _tag_value(tags, prefixes, default):
    """Return the text after the first ':' of the first tag starting with *prefixes*.

    *prefixes* is a string or a tuple of strings (as accepted by
    ``str.startswith``); *default* is returned when no tag matches.
    """
    return next(
        (tag.split(":", 1)[1] for tag in tags if tag.startswith(prefixes)),
        default,
    )


# Get only Arabic datasets
datasets_list = list_datasets(language="ar")

normalized_keywords = [_normalize(keyword) for keyword in search_keywords]
dataset_rows = []

for dataset in datasets_list:
    dataset_tags = dataset.tags or []

    # Make sure it's a text dataset
    if required_modality not in dataset_tags:
        continue

    # Check for match in name
    dataset_id = _normalize(dataset.id)
    if not any(keyword in dataset_id for keyword in normalized_keywords):
        continue

    license_name = _tag_value(dataset_tags, "license:", "none")
    # The Hub exposes dataset size as a "size_categories:<bucket>" tag; the
    # original "size:" prefix is still accepted.
    size = _tag_value(dataset_tags, ("size_categories:", "size:"), "unknown")
    arxiv_link = _tag_value(dataset_tags, "arxiv:", "none")

    try:
        # One extra Hub request per matching dataset.
        models = sum(1 for _ in list_models(filter=f"dataset:{dataset.id}"))
    except Exception:
        # Best effort: keep the row and mark the count as unavailable.
        # (`Exception` rather than a bare `except` so Ctrl-C still works.)
        models = "none"

    print(dataset.id, dataset.likes, dataset.lastModified, license_name, models, size, arxiv_link)

    dataset_rows.append({
        "Dataset ID": dataset.id,
        "Likes": dataset.likes,
        "Downloads": dataset.downloads,
        "Last Modified": dataset.lastModified,
        "License": license_name,
        "Models": models,
        "Size": size,
        "ArXiv": arxiv_link,
    })

# Create and show DataFrame
df = pd.DataFrame(dataset_rows, columns=result_columns)
df

## Robustness & Safety Task

In [None]:
from huggingface_hub import list_datasets, list_models
import pandas as pd

# Filters: a dataset is kept when it is a text dataset whose id contains one
# of the keywords (case-insensitive).
search_keywords = ["Robustness", "Safety", "Toxicity", "jailbreak"]
required_modality = "modality:text"

# Column order of the result table; passing it to the DataFrame constructor
# keeps the headers even when no dataset matches.
result_columns = [
    "Dataset ID", "Likes", "Downloads", "Last Modified",
    "License", "Models", "Size", "ArXiv",
]


def _tag_value(tags, prefixes, default):
    """Return the text after the first ':' of the first tag starting with *prefixes*.

    *prefixes* is a string or a tuple of strings (as accepted by
    ``str.startswith``); *default* is returned when no tag matches.
    """
    return next(
        (tag.split(":", 1)[1] for tag in tags if tag.startswith(prefixes)),
        default,
    )


# Get only Arabic datasets
datasets_list = list_datasets(language="ar")

lowered_keywords = [keyword.lower() for keyword in search_keywords]
dataset_rows = []

for dataset in datasets_list:
    dataset_tags = dataset.tags or []

    # Make sure it's a text dataset
    if required_modality not in dataset_tags:
        continue

    # Check for match in name
    dataset_id = dataset.id.lower()
    if not any(keyword in dataset_id for keyword in lowered_keywords):
        continue

    license_name = _tag_value(dataset_tags, "license:", "none")
    # The Hub exposes dataset size as a "size_categories:<bucket>" tag; the
    # original "size:" prefix is still accepted.
    size = _tag_value(dataset_tags, ("size_categories:", "size:"), "unknown")
    arxiv_link = _tag_value(dataset_tags, "arxiv:", "none")

    try:
        # One extra Hub request per matching dataset.
        models = sum(1 for _ in list_models(filter=f"dataset:{dataset.id}"))
    except Exception:
        # Best effort: keep the row and mark the count as unavailable.
        # (`Exception` rather than a bare `except` so Ctrl-C still works.)
        models = "none"

    print(dataset.id, dataset.likes, dataset.lastModified, license_name, models, size, arxiv_link)

    dataset_rows.append({
        "Dataset ID": dataset.id,
        "Likes": dataset.likes,
        "Downloads": dataset.downloads,
        "Last Modified": dataset.lastModified,
        "License": license_name,
        "Models": models,
        "Size": size,
        "ArXiv": arxiv_link,
    })

# Create and show DataFrame
df = pd.DataFrame(dataset_rows, columns=result_columns)
df

## Ethics, Bias, and Fairness Task

In [None]:
from huggingface_hub import list_datasets, list_models
import pandas as pd

# Filters: a dataset is kept when it is a text dataset whose id contains one
# of the keywords (case-insensitive).
search_keywords = ["Ethics", "Bias", "Fairness"]
required_modality = "modality:text"

# Column order of the result table; passing it to the DataFrame constructor
# keeps the headers even when no dataset matches.
result_columns = [
    "Dataset ID", "Likes", "Downloads", "Last Modified",
    "License", "Models", "Size", "ArXiv",
]


def _tag_value(tags, prefixes, default):
    """Return the text after the first ':' of the first tag starting with *prefixes*.

    *prefixes* is a string or a tuple of strings (as accepted by
    ``str.startswith``); *default* is returned when no tag matches.
    """
    return next(
        (tag.split(":", 1)[1] for tag in tags if tag.startswith(prefixes)),
        default,
    )


# Get only Arabic datasets
datasets_list = list_datasets(language="ar")

lowered_keywords = [keyword.lower() for keyword in search_keywords]
dataset_rows = []

for dataset in datasets_list:
    dataset_tags = dataset.tags or []

    # Make sure it's a text dataset
    if required_modality not in dataset_tags:
        continue

    # Check for match in name
    dataset_id = dataset.id.lower()
    if not any(keyword in dataset_id for keyword in lowered_keywords):
        continue

    license_name = _tag_value(dataset_tags, "license:", "none")
    # The Hub exposes dataset size as a "size_categories:<bucket>" tag; the
    # original "size:" prefix is still accepted.
    size = _tag_value(dataset_tags, ("size_categories:", "size:"), "unknown")
    arxiv_link = _tag_value(dataset_tags, "arxiv:", "none")

    try:
        # One extra Hub request per matching dataset.
        models = sum(1 for _ in list_models(filter=f"dataset:{dataset.id}"))
    except Exception:
        # Best effort: keep the row and mark the count as unavailable.
        # (`Exception` rather than a bare `except` so Ctrl-C still works.)
        models = "none"

    print(dataset.id, dataset.likes, dataset.lastModified, license_name, models, size, arxiv_link)

    dataset_rows.append({
        "Dataset ID": dataset.id,
        "Likes": dataset.likes,
        "Downloads": dataset.downloads,
        "Last Modified": dataset.lastModified,
        "License": license_name,
        "Models": models,
        "Size": size,
        "ArXiv": arxiv_link,
    })

# Create and show DataFrame
df = pd.DataFrame(dataset_rows, columns=result_columns)
df

## Code Generation Task

In [None]:
from huggingface_hub import list_datasets, list_models
import pandas as pd

# Filters: a dataset is kept when it is a text dataset whose id mentions one
# of the keywords.
search_keywords = ["code generation"]
required_modality = "modality:text"

# Column order of the result table; passing it to the DataFrame constructor
# keeps the headers even when no dataset matches.
result_columns = [
    "Dataset ID", "Likes", "Downloads", "Last Modified",
    "License", "Models", "Size", "ArXiv",
]


def _normalize(text):
    """Lowercase *text* and turn '-', '_', '.' and whitespace runs into single spaces.

    Applied to both keywords and dataset ids so that a multi-word keyword
    such as "code generation" can match an id like "org/code-generation-ar".
    """
    for separator in "-_.":
        text = text.replace(separator, " ")
    return " ".join(text.lower().split())


def _tag_value(tags, prefixes, default):
    """Return the text after the first ':' of the first tag starting with *prefixes*.

    *prefixes* is a string or a tuple of strings (as accepted by
    ``str.startswith``); *default* is returned when no tag matches.
    """
    return next(
        (tag.split(":", 1)[1] for tag in tags if tag.startswith(prefixes)),
        default,
    )


# Get only Arabic datasets
datasets_list = list_datasets(language="ar")

normalized_keywords = [_normalize(keyword) for keyword in search_keywords]
dataset_rows = []

for dataset in datasets_list:
    dataset_tags = dataset.tags or []

    # Make sure it's a text dataset
    if required_modality not in dataset_tags:
        continue

    # Check for match in name
    dataset_id = _normalize(dataset.id)
    if not any(keyword in dataset_id for keyword in normalized_keywords):
        continue

    license_name = _tag_value(dataset_tags, "license:", "none")
    # The Hub exposes dataset size as a "size_categories:<bucket>" tag; the
    # original "size:" prefix is still accepted.
    size = _tag_value(dataset_tags, ("size_categories:", "size:"), "unknown")
    arxiv_link = _tag_value(dataset_tags, "arxiv:", "none")

    try:
        # One extra Hub request per matching dataset.
        models = sum(1 for _ in list_models(filter=f"dataset:{dataset.id}"))
    except Exception:
        # Best effort: keep the row and mark the count as unavailable.
        # (`Exception` rather than a bare `except` so Ctrl-C still works.)
        models = "none"

    print(dataset.id, dataset.likes, dataset.lastModified, license_name, models, size, arxiv_link)

    dataset_rows.append({
        "Dataset ID": dataset.id,
        "Likes": dataset.likes,
        "Downloads": dataset.downloads,
        "Last Modified": dataset.lastModified,
        "License": license_name,
        "Models": models,
        "Size": size,
        "ArXiv": arxiv_link,
    })

# Create and show DataFrame
df = pd.DataFrame(dataset_rows, columns=result_columns)
df

## Official Documentation Task

In [None]:
from huggingface_hub import list_datasets, list_models
import pandas as pd

# Filters: a dataset is kept when it is a text dataset whose id mentions one
# of the keywords.
search_keywords = ["Documentation", "Official Documentation"]
required_modality = "modality:text"

# Column order of the result table; passing it to the DataFrame constructor
# keeps the headers even when no dataset matches.
result_columns = [
    "Dataset ID", "Likes", "Downloads", "Last Modified",
    "License", "Models", "Size", "ArXiv",
]


def _normalize(text):
    """Lowercase *text* and turn '-', '_', '.' and whitespace runs into single spaces.

    Applied to both keywords and dataset ids so that a multi-word keyword
    such as "Official Documentation" can match an id like
    "org/official_documentation".
    """
    for separator in "-_.":
        text = text.replace(separator, " ")
    return " ".join(text.lower().split())


def _tag_value(tags, prefixes, default):
    """Return the text after the first ':' of the first tag starting with *prefixes*.

    *prefixes* is a string or a tuple of strings (as accepted by
    ``str.startswith``); *default* is returned when no tag matches.
    """
    return next(
        (tag.split(":", 1)[1] for tag in tags if tag.startswith(prefixes)),
        default,
    )


# Get only Arabic datasets
datasets_list = list_datasets(language="ar")

normalized_keywords = [_normalize(keyword) for keyword in search_keywords]
dataset_rows = []

for dataset in datasets_list:
    dataset_tags = dataset.tags or []

    # Make sure it's a text dataset
    if required_modality not in dataset_tags:
        continue

    # Check for match in name
    dataset_id = _normalize(dataset.id)
    if not any(keyword in dataset_id for keyword in normalized_keywords):
        continue

    license_name = _tag_value(dataset_tags, "license:", "none")
    # The Hub exposes dataset size as a "size_categories:<bucket>" tag; the
    # original "size:" prefix is still accepted.
    size = _tag_value(dataset_tags, ("size_categories:", "size:"), "unknown")
    arxiv_link = _tag_value(dataset_tags, "arxiv:", "none")

    try:
        # One extra Hub request per matching dataset.
        models = sum(1 for _ in list_models(filter=f"dataset:{dataset.id}"))
    except Exception:
        # Best effort: keep the row and mark the count as unavailable.
        # (`Exception` rather than a bare `except` so Ctrl-C still works.)
        models = "none"

    print(dataset.id, dataset.likes, dataset.lastModified, license_name, models, size, arxiv_link)

    dataset_rows.append({
        "Dataset ID": dataset.id,
        "Likes": dataset.likes,
        "Downloads": dataset.downloads,
        "Last Modified": dataset.lastModified,
        "License": license_name,
        "Models": models,
        "Size": size,
        "ArXiv": arxiv_link,
    })

# Create and show DataFrame
df = pd.DataFrame(dataset_rows, columns=result_columns)
df

## Function Call Task

In [None]:
from huggingface_hub import list_datasets, list_models
import pandas as pd

# Filters: a dataset is kept when it is a text dataset whose id mentions one
# of the keywords.
search_keywords = ["Function Call"]
required_modality = "modality:text"

# Column order of the result table; passing it to the DataFrame constructor
# keeps the headers even when no dataset matches.
result_columns = [
    "Dataset ID", "Likes", "Downloads", "Last Modified",
    "License", "Models", "Size", "ArXiv",
]


def _normalize(text):
    """Lowercase *text* and turn '-', '_', '.' and whitespace runs into single spaces.

    Applied to both keywords and dataset ids so that a multi-word keyword
    such as "Function Call" can match an id like "org/function-calling-ar".
    """
    for separator in "-_.":
        text = text.replace(separator, " ")
    return " ".join(text.lower().split())


def _tag_value(tags, prefixes, default):
    """Return the text after the first ':' of the first tag starting with *prefixes*.

    *prefixes* is a string or a tuple of strings (as accepted by
    ``str.startswith``); *default* is returned when no tag matches.
    """
    return next(
        (tag.split(":", 1)[1] for tag in tags if tag.startswith(prefixes)),
        default,
    )


# Get only Arabic datasets
datasets_list = list_datasets(language="ar")

normalized_keywords = [_normalize(keyword) for keyword in search_keywords]
dataset_rows = []

for dataset in datasets_list:
    dataset_tags = dataset.tags or []

    # Make sure it's a text dataset
    if required_modality not in dataset_tags:
        continue

    # Check for match in name
    dataset_id = _normalize(dataset.id)
    if not any(keyword in dataset_id for keyword in normalized_keywords):
        continue

    license_name = _tag_value(dataset_tags, "license:", "none")
    # The Hub exposes dataset size as a "size_categories:<bucket>" tag; the
    # original "size:" prefix is still accepted.
    size = _tag_value(dataset_tags, ("size_categories:", "size:"), "unknown")
    arxiv_link = _tag_value(dataset_tags, "arxiv:", "none")

    try:
        # One extra Hub request per matching dataset.
        models = sum(1 for _ in list_models(filter=f"dataset:{dataset.id}"))
    except Exception:
        # Best effort: keep the row and mark the count as unavailable.
        # (`Exception` rather than a bare `except` so Ctrl-C still works.)
        models = "none"

    print(dataset.id, dataset.likes, dataset.lastModified, license_name, models, size, arxiv_link)

    dataset_rows.append({
        "Dataset ID": dataset.id,
        "Likes": dataset.likes,
        "Downloads": dataset.downloads,
        "Last Modified": dataset.lastModified,
        "License": license_name,
        "Models": models,
        "Size": size,
        "ArXiv": arxiv_link,
    })

# Create and show DataFrame
df = pd.DataFrame(dataset_rows, columns=result_columns)
df