In [1]:
# Install the necessary libraries
!pip install datasets -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.7/311.7 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25h

## Dataset Curation


In [2]:
from datasets import get_dataset_config_names

# Fetch the name of every configuration available under the "xtreme" dataset
xtreme_subsets = get_dataset_config_names("xtreme")

n_configs = len(xtreme_subsets)
print("There are a total of {} combinations in XTREME Benchmark".format(n_configs))

Downloading builder script:   0%|          | 0.00/37.5k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/593k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/105k [00:00<?, ?B/s]

There are a total of 183 combinations in XTREME Benchmark


In [3]:
# Keep only the "xtreme" configurations whose name starts with "PAN" (the PAN-X subsets)
panx_subsets = list(filter(lambda name: name.startswith("PAN"), xtreme_subsets))

# Preview the first five matches
panx_subsets[:5]

['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg', 'PAN-X.bn', 'PAN-X.de']

In [4]:
from datasets import load_dataset

# Load the English PAN-X configuration and display its splits and columns
english = load_dataset(path="xtreme", name="PAN-X.en")
english

Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

Each language corpus in the PAN-X dataset contains 20,000 training samples. To simulate a high-/low-resource setting, we sample English (the high-resource language) at 50% of its PAN-X data, and German, French, Spanish and Italian (the low-resource languages) at 5% of their PAN-X data each.

In [5]:
# Build the sampled corpora: English plays the role of the high-resource language,
# while German, French, Spanish and Italian play the role of low-resource languages.

from collections import defaultdict
from datasets import DatasetDict

langs = ['en', 'de', 'fr', 'es', 'it']
fracs = [0.5, 0.05, 0.05, 0.05, 0.05]  # fraction of every split kept, aligned with `langs`

# A missing language key is created as an empty DatasetDict on first access
panx_raw = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
    ds = load_dataset('xtreme', name=f"PAN-X.{lang}")

    for split in ds:
        # Shuffle with a fixed seed, then keep the leading `frac` share of the rows
        shuffled = ds[split].shuffle(seed=0)
        n_keep = int(frac * ds[split].num_rows)
        panx_raw[lang][split] = shuffled.select(range(n_keep))

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [6]:
# Tabulate the number of training rows kept for each sampled language
import pandas as pd

train_counts = {lang: [panx_raw[lang]["train"].num_rows] for lang in langs}
pd.DataFrame(train_counts, index=['Number of training examples'])

Unnamed: 0,en,de,fr,es,it
Number of training examples,10000,1000,1000,1000,1000


In [7]:
# Display the sampled English splits and their row counts
panx_raw['en']

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 5000
    })
})

In [8]:
# Display the sampled German splits and their row counts
panx_raw['de']

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
})

In [9]:
# Display the sampled French splits and their row counts
panx_raw['fr']

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
})

In [10]:
# Display the sampled Spanish splits and their row counts
panx_raw['es']

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
})

In [11]:
# Display the sampled Italian splits and their row counts
panx_raw['it']

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
})

In [12]:
# Write each sampled corpus to its own directory (relative to the working directory)
for lang_code, out_dir in [
    ('en', "english"),
    ('de', "german"),
    ('fr', "french"),
    ('es', "spanish"),
    ('it', "italian"),
]:
    panx_raw[lang_code].save_to_disk(out_dir)

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

In [13]:
from datasets import load_from_disk

# Reload every saved corpus from its directory, keyed by language code
panx_main = defaultdict(DatasetDict)

for lang_code, src_dir in [
    ('en', "english"),
    ('de', "german"),
    ('fr', "french"),
    ('es', "spanish"),
    ('it', "italian"),
]:
    panx_main[lang_code] = load_from_disk(src_dir)

In [20]:
# Display the English splits reloaded from disk
panx_main['en']

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 5000
    })
})

In [21]:
# Display the German splits reloaded from disk
panx_main['de']

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
})

In [22]:
# Display the French splits reloaded from disk
panx_main['fr']

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
})

In [23]:
# Display the Spanish splits reloaded from disk
panx_main['es']

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
})

In [24]:
# Display the Italian splits reloaded from disk
panx_main['it']

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
})

In [14]:
# Recursively archive the saved English dataset directory into english.zip.
# NOTE(review): the absolute /content paths assume the working directory is /content,
# because the dataset was saved to the relative path "english" -- confirm before running elsewhere.
!zip -r /content/english.zip /content/english

  adding: content/english/ (stored 0%)
  adding: content/english/test/ (stored 0%)
  adding: content/english/test/data-00000-of-00001.arrow (deflated 72%)
  adding: content/english/test/dataset_info.json (deflated 57%)
  adding: content/english/test/state.json (deflated 38%)
  adding: content/english/dataset_dict.json (deflated 5%)
  adding: content/english/train/ (stored 0%)
  adding: content/english/train/data-00000-of-00001.arrow (deflated 72%)
  adding: content/english/train/dataset_info.json (deflated 57%)
  adding: content/english/train/state.json (deflated 38%)
  adding: content/english/validation/ (stored 0%)
  adding: content/english/validation/data-00000-of-00001.arrow (deflated 72%)
  adding: content/english/validation/dataset_info.json (deflated 57%)
  adding: content/english/validation/state.json (deflated 37%)


In [15]:
# Recursively archive the saved German dataset directory into german.zip.
# NOTE(review): the absolute /content paths assume the working directory is /content,
# because the dataset was saved to the relative path "german" -- confirm before running elsewhere.
!zip -r /content/german.zip /content/german

  adding: content/german/ (stored 0%)
  adding: content/german/test/ (stored 0%)
  adding: content/german/test/data-00000-of-00001.arrow (deflated 72%)
  adding: content/german/test/dataset_info.json (deflated 57%)
  adding: content/german/test/state.json (deflated 37%)
  adding: content/german/dataset_dict.json (deflated 5%)
  adding: content/german/train/ (stored 0%)
  adding: content/german/train/data-00000-of-00001.arrow (deflated 72%)
  adding: content/german/train/dataset_info.json (deflated 57%)
  adding: content/german/train/state.json (deflated 38%)
  adding: content/german/validation/ (stored 0%)
  adding: content/german/validation/data-00000-of-00001.arrow (deflated 71%)
  adding: content/german/validation/dataset_info.json (deflated 57%)
  adding: content/german/validation/state.json (deflated 38%)


In [16]:
# Recursively archive the saved French dataset directory into french.zip.
# NOTE(review): the absolute /content paths assume the working directory is /content,
# because the dataset was saved to the relative path "french" -- confirm before running elsewhere.
!zip -r /content/french.zip /content/french

  adding: content/french/ (stored 0%)
  adding: content/french/test/ (stored 0%)
  adding: content/french/test/data-00000-of-00001.arrow (deflated 73%)
  adding: content/french/test/dataset_info.json (deflated 57%)
  adding: content/french/test/state.json (deflated 38%)
  adding: content/french/dataset_dict.json (deflated 5%)
  adding: content/french/train/ (stored 0%)
  adding: content/french/train/data-00000-of-00001.arrow (deflated 72%)
  adding: content/french/train/dataset_info.json (deflated 57%)
  adding: content/french/train/state.json (deflated 37%)
  adding: content/french/validation/ (stored 0%)
  adding: content/french/validation/data-00000-of-00001.arrow (deflated 73%)
  adding: content/french/validation/dataset_info.json (deflated 57%)
  adding: content/french/validation/state.json (deflated 38%)


In [17]:
# Recursively archive the saved Spanish dataset directory into spanish.zip.
# NOTE(review): the absolute /content paths assume the working directory is /content,
# because the dataset was saved to the relative path "spanish" -- confirm before running elsewhere.
!zip -r /content/spanish.zip /content/spanish

  adding: content/spanish/ (stored 0%)
  adding: content/spanish/test/ (stored 0%)
  adding: content/spanish/test/data-00000-of-00001.arrow (deflated 73%)
  adding: content/spanish/test/dataset_info.json (deflated 57%)
  adding: content/spanish/test/state.json (deflated 38%)
  adding: content/spanish/dataset_dict.json (deflated 5%)
  adding: content/spanish/train/ (stored 0%)
  adding: content/spanish/train/data-00000-of-00001.arrow (deflated 72%)
  adding: content/spanish/train/dataset_info.json (deflated 57%)
  adding: content/spanish/train/state.json (deflated 38%)
  adding: content/spanish/validation/ (stored 0%)
  adding: content/spanish/validation/data-00000-of-00001.arrow (deflated 73%)
  adding: content/spanish/validation/dataset_info.json (deflated 57%)
  adding: content/spanish/validation/state.json (deflated 38%)


In [18]:
# Recursively archive the saved Italian dataset directory into italian.zip.
# NOTE(review): the absolute /content paths assume the working directory is /content,
# because the dataset was saved to the relative path "italian" -- confirm before running elsewhere.
!zip -r /content/italian.zip /content/italian

  adding: content/italian/ (stored 0%)
  adding: content/italian/test/ (stored 0%)
  adding: content/italian/test/data-00000-of-00001.arrow (deflated 72%)
  adding: content/italian/test/dataset_info.json (deflated 57%)
  adding: content/italian/test/state.json (deflated 38%)
  adding: content/italian/dataset_dict.json (deflated 5%)
  adding: content/italian/train/ (stored 0%)
  adding: content/italian/train/data-00000-of-00001.arrow (deflated 72%)
  adding: content/italian/train/dataset_info.json (deflated 57%)
  adding: content/italian/train/state.json (deflated 38%)
  adding: content/italian/validation/ (stored 0%)
  adding: content/italian/validation/data-00000-of-00001.arrow (deflated 72%)
  adding: content/italian/validation/dataset_info.json (deflated 57%)
  adding: content/italian/validation/state.json (deflated 38%)
