# Install required libraries

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:0

# Import required libraries

In [3]:
from datasets import load_dataset
import random

In [4]:
def split_dataset(dataset, train_ratio=0.8, seed=42):
    """
    Split a dataset into train and dev sets based on a given ratio.

    Args:
        dataset: Object supporting ``len()`` and ``select(indices)``; these
            are the only two operations this function relies on.
        train_ratio: Fraction of examples assigned to the train split.
            Must lie in the closed interval [0, 1].
        seed: Seed for the shuffle, so the same split is produced on
            every run.

    Returns:
        A ``(train_dataset, dev_dataset)`` tuple, each being whatever
        ``dataset.select`` returns for the chosen indices.

    Raises:
        ValueError: If ``train_ratio`` is outside [0, 1].
    """
    # A ratio outside [0, 1] would otherwise be accepted silently (a negative
    # value slices from the end of the index list), giving a misleading split.
    if not 0 <= train_ratio <= 1:
        raise ValueError(
            f"train_ratio must be between 0 and 1, got {train_ratio!r}"
        )

    # A private generator gives the same shuffle as seeding the module-level
    # RNG, but leaves the global random state untouched for other code.
    rng = random.Random(seed)

    # Shuffle the positions rather than the data itself
    indices = list(range(len(dataset)))
    rng.shuffle(indices)

    # The first `split_point` shuffled positions go to train, the rest to dev
    split_point = int(len(indices) * train_ratio)

    train_dataset = dataset.select(indices[:split_point])
    dev_dataset = dataset.select(indices[split_point:])

    return train_dataset, dev_dataset

In [5]:
def export_to_conll(dataset, output_file, format_type='simple'):
    """
    Write a dataset to a CoNLL-style text file, one token per line.

    Each example's tokens are followed by a blank line. Integer tag ids are
    translated to label strings through the table defined below.

    Args:
        dataset: Iterable of examples; each example is indexed with the keys
            ``'tokens'`` and ``'ner_tags'``, which are zipped pairwise.
        output_file: Path of the file to create (overwritten if it exists),
            written as UTF-8.
        format_type: ``'simple'`` writes ``token<TAB>label`` lines;
            ``'detailed'`` writes a ``-DOCSTART-`` header followed by
            ``token -X- _ label`` lines.

    Raises:
        ValueError: If ``format_type`` is not ``'simple'`` or ``'detailed'``.
        KeyError: If a tag id is not one of the ids in the label table.
    """
    tag_map = {
        0: 'O',
        1: 'B-PER',
        2: 'I-PER',
        3: 'B-ORG',
        4: 'I-ORG',
        5: 'B-LOC',
        6: 'I-LOC',
        7: 'B-MISC',
        8: 'I-MISC'
    }

    # One line template per supported format; tokens are passed as format
    # arguments, so braces inside a token are written out verbatim.
    line_templates = {
        'simple': "{token}\t{label}\n",
        'detailed': "{token} -X- _ {label}\n",
    }

    # Validate before opening the file: an unknown format used to produce a
    # file containing nothing but blank lines, with no error.
    if format_type not in line_templates:
        raise ValueError(
            f"Unknown format_type {format_type!r}; "
            f"expected one of {sorted(line_templates)}"
        )
    template = line_templates[format_type]

    with open(output_file, 'w', encoding='utf-8') as f:
        if format_type == 'detailed':
            f.write("-DOCSTART- -X- O O\n\n")

        for example in dataset:
            tokens = example['tokens']
            ner_tags = example['ner_tags']

            for token, tag in zip(tokens, ner_tags):
                f.write(template.format(token=token, label=tag_map[tag]))

            # Blank line separates consecutive examples
            f.write("\n")

In [6]:
def print_first_lines(filename, n=5):
    """
    Print a header, then up to the first ``n`` lines of a UTF-8 text file.

    Trailing whitespace (including the newline) is stripped from each line
    before printing, and an empty line is printed at the end.
    """
    print(f"First {n} lines of {filename}:")
    with open(filename, 'r', encoding='utf-8') as handle:
        shown = 0
        while shown < n:
            text = handle.readline()
            if not text:
                # readline() returns '' only at end of file
                break
            print(text.rstrip())
            shown += 1
    print()

In [7]:
# Main execution
if __name__ == "__main__":
    # Load the dataset
    ds = load_dataset("tomaarsen/conll2002", "es")

    # Split the training data
    train_ratio = 0.8  # You can adjust this ratio
    train_dataset, dev_dataset = split_dataset(ds['train'], train_ratio=train_ratio)

    # Export both splits in both formats. File names are built once here and
    # reused for verification below, so the two lists cannot drift apart.
    output_files = []
    for split_name, split_data in (("train", train_dataset), ("dev", dev_dataset)):
        for format_type in ("simple", "detailed"):
            filename = f"{split_name}_{format_type}.conll"
            export_to_conll(split_data, filename, format_type=format_type)
            output_files.append(filename)

    # Print statistics. The '.1%' format spec scales and rounds the ratio,
    # avoiding float noise such as "19.999999999999996%".
    print(f"Original dataset size: {len(ds['train'])}")
    print(f"Train set size: {len(train_dataset)} ({train_ratio:.1%})")
    print(f"Dev set size: {len(dev_dataset)} ({1 - train_ratio:.1%})")

    # Verify outputs
    for filename in output_files:
        print_first_lines(filename)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


0000.parquet:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/262k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/247k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8323 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1915 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1517 [00:00<?, ? examples/s]

Original dataset size: 8323
Train set size: 6658 (80.0%)
Dev set size: 1665 (19.999999999999996%)
First 5 lines of train_simple.conll:
Hulls	B-PER
señaló	O
que	O
en	O
el	O

First 5 lines of train_detailed.conll:
-DOCSTART- -X- O O

Hulls -X- _ B-PER
señaló -X- _ O
que -X- _ O

First 5 lines of dev_simple.conll:
Intentó	O
un	O
sistema	O
de	O
centro	O

First 5 lines of dev_detailed.conll:
-DOCSTART- -X- O O

Intentó -X- _ O
un -X- _ O
sistema -X- _ O

