In [1]:
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tydiqa = load_dataset("tydiqa", "primary_task")

print(f"\nTrain size: {len(tydiqa['train'])}")
print(f"Validation size: {len(tydiqa['validation'])}")


Train size: 166916
Validation size: 18670


In [3]:
train_split = tydiqa['train']

print(f"\nTotal examples: {len(train_split)}")
print(f"\nFeatures: {list(train_split.features.keys())}")


print("Language Distribution")
print("-" * 60)
language_counts = {}
for example in train_split:
    lang = example['language']
    language_counts[lang] = language_counts.get(lang, 0) + 1

for lang, count in sorted(language_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"{lang}: {count} ({count/len(train_split)*100:.2f}%)")


Total examples: 166916

Features: ['passage_answer_candidates', 'question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url']
Language Distribution
------------------------------------------------------------
telugu: 24558 (14.71%)
arabic: 23092 (13.83%)
swahili: 17613 (10.55%)
japanese: 16288 (9.76%)
finnish: 15285 (9.16%)
indonesian: 14952 (8.96%)
russian: 12803 (7.67%)
thai: 11365 (6.81%)
korean: 10981 (6.58%)
bengali: 10768 (6.45%)
english: 9211 (5.52%)
telugu: 24558 (14.71%)
arabic: 23092 (13.83%)
swahili: 17613 (10.55%)
japanese: 16288 (9.76%)
finnish: 15285 (9.16%)
indonesian: 14952 (8.96%)
russian: 12803 (7.67%)
thai: 11365 (6.81%)
korean: 10981 (6.58%)
bengali: 10768 (6.45%)
english: 9211 (5.52%)


In [4]:
print("Example Data Structure")
print("-" * 60)
example = train_split[0]

print("\nTop-level keys:", list(example.keys()))
print("\nAnnotations keys:", list(example['annotations'].keys()))
print("\nSample annotations structure:")

Example Data Structure
------------------------------------------------------------

Top-level keys: ['passage_answer_candidates', 'question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url']

Annotations keys: ['passage_answer_candidate_index', 'minimal_answers_start_byte', 'minimal_answers_end_byte', 'yes_no_answer']

Sample annotations structure:


In [5]:
print("SAMPLE EXAMPLES")

# Show 3 examples: one from each language (if available)
seen_languages = set()
sample_count = 0
max_samples = 3

for example in train_split:
    lang = example['language']
    if lang not in seen_languages and sample_count < max_samples:
        seen_languages.add(lang)
        sample_count += 1
        
        print(f"\n{'='*60}")
        print(f"Example {sample_count} - Language: {lang}")
        print(f"{'='*60}")
        
        print(f"\nQuestion: {example['question_text'][:200]}...")
        print(f"\nContext (first 300 chars): {example['document_plaintext'][:300]}...")
        
        # Check if there's an answer - handle different annotation structures
        annotations = example['annotations']
        
        # Try different possible keys for answer location
        if 'minimal_answer' in annotations:
            start = annotations['minimal_answer']['plaintext_start_byte']
            end = annotations['minimal_answer']['plaintext_end_byte']
        elif 'plaintext_start_byte' in annotations:
            start = annotations['plaintext_start_byte']
            end = annotations['plaintext_end_byte']
        else:
            # If structure is different, check what we have

            start = -1
            end = -1
        
        if start >= 0 and end >= 0:
            answer_text = example['document_plaintext'][start:end]
            print(f"\nAnswer: {answer_text}")
        else:
            print("\nAnswer: [No answer in passage]")
        
        print(f"\nDocument Title: {example['document_title']}")


SAMPLE EXAMPLES

Example 1 - Language: indonesian

Question: berapakah jenis ras yang ada didunia?...

Context (first 300 chars): 
transl.

Ras (dari bahasa Prancis race, yang sendirinya dari bahasa Latin radix, "akar") adalah suatu sistem klasifikasi yang digunakan untuk mengkategorikan manusia dalam populasi atau kelompok besar dan berbeda melalui ciri fenotipe, asal usul geografis, tampang jasmani dan kesukuan yang terwaris...

Answer: [No answer in passage]

Document Title: Ras manusia

Example 2 - Language: japanese

Question: 2018年アメリカで一番治安の悪い州はどこ...

Context (first 300 chars): 


デトロイト（ /dɨˈtrɔɪt/）は、アメリカ合衆国ミシガン州南東部にある都市。南北をエリー湖とヒューロン湖に挟まれており、東はカナダのウィンザー市に接する。アメリカ中西部有数の世界都市。
人口は、2000年国勢調査では951,270人、2012年では701,475人と減り続けている。デトロイトの都市圏（大都市統計地域：MSA）の人口は4,452,559人であり、全米第9位の規模。フリントなどを含めた広域都市圏（合同統計地域：CSA）の人口は5,357,538人（いずれも2000年国勢調査）で、やはり全米9位の規模である。また、失業率、貧困率が高く、犯罪都市としても知られていた。主要産...

Answer: [No answer in passage]

Document Title: デトロイト

Example 3 - Language: swahili

Question: Je,Ngami