# Data Preparation for NER Label Generation

This notebook prepares the CoNLL-2003 dataset for instruction tuning. We'll convert the traditional token-level NER tags into a text generation format (`word: tag` pairs) suitable for training the Qwen 0.5B model.


In [1]:
# Install required packages
# %pip (rather than !pip) installs into the environment of the running kernel;
# -q keeps pip's own output quiet. Versions are not pinned here.
%pip install datasets transformers peft accelerate bitsandbytes trl -q


Note: you may need to restart the kernel to use updated packages.


In [2]:
# Imports
import json
import os
from datasets import load_dataset
from transformers import AutoTokenizer
import pandas as pd
from collections import Counter


In [3]:
# Load CoNLL-2003 dataset
print("Loading CoNLL-2003 dataset...")
dataset = load_dataset("conll2003")

# Report which splits came back and how many examples each one holds
print(f"Dataset splits: {dataset.keys()}")
print(
    f"Train size: {len(dataset['train'])}",
    f"Validation size: {len(dataset['validation'])}",
    f"Test size: {len(dataset['test'])}",
    sep="\n",
)

# Peek at the first training example (its tokens and the matching tag ids)
sample = dataset['train'][0]
print(
    "\nSample from training set:",
    f"Tokens: {sample['tokens']}",
    f"NER tags: {sample['ner_tags']}",
    sep="\n",
)

# The 'ner_tags' feature carries the list of tag names; later cells index
# into it with the tag ids to decode them.
tag_names = dataset['train'].features['ner_tags'].feature.names
print(f"\nNER tag names: {tag_names}")


Loading CoNLL-2003 dataset...


conll2003.py: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


README.md: 0.00B [00:00, ?B/s]

Dataset splits: dict_keys(['train', 'validation', 'test'])
Train size: 14041
Validation size: 3250
Test size: 3453

Sample from training set:
Tokens: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
NER tags: [3, 0, 7, 0, 0, 0, 7, 0, 0]

NER tag names: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [4]:
# Create instruction format data
def create_instruction_format(example, label_names=None):
    """
    Convert tokens and NER tags to instruction format.

    Input: sentence
    Output: word: tag, word: tag, ...

    Args:
        example: Mapping with a 'tokens' entry (sequence of strings) and an
            'ner_tags' entry (sequence of indices into the label list), one
            tag per token.
        label_names: Optional sequence mapping a tag index to its tag name.
            When omitted, the notebook-level ``tag_names`` list is used, so
            existing single-argument calls behave as before.

    Returns:
        dict with the keys 'instruction', 'input', 'output' and 'text', where
        'text' is the full prompt with the response appended.

    Raises:
        ValueError: If the number of tokens and the number of tags differ.
    """
    if label_names is None:
        # Fall back to the tag list defined by the dataset-loading cell.
        label_names = tag_names

    tokens = example['tokens']
    ner_tags = example['ner_tags']

    # zip() would silently drop the unmatched tail, producing an 'output'
    # that no longer covers every word of 'input'; fail loudly instead.
    if len(tokens) != len(ner_tags):
        raise ValueError(
            f"Token/tag length mismatch: {len(tokens)} tokens vs {len(ner_tags)} tags"
        )

    # Convert tag indices to tag names
    tags = [label_names[tag_id] for tag_id in ner_tags]

    # Create the input sentence
    input_text = " ".join(tokens)

    # Create the output format: "word: tag" pairs separated by ", "
    output_text = ", ".join(f"{token}: {tag}" for token, tag in zip(tokens, tags))

    # Create instruction format
    instruction = "Given the following sentence, identify and label each word with its named entity tag (PER for person, LOC for location, ORG for organization, MISC for miscellaneous, or O for no entity)."

    return {
        'instruction': instruction,
        'input': input_text,
        'output': output_text,
        'text': f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output_text}"
    }

# Process a sample: convert the first training example and show the result
sample_processed = create_instruction_format(dataset['train'][0])
print(
    "Processed sample:",
    f"Instruction: {sample_processed['instruction']}",
    f"Input: {sample_processed['input']}",
    # Only the first 100 characters of the output are shown
    f"Output: {sample_processed['output'][:100]}...",
    sep="\n",
)


Processed sample:
Instruction: Given the following sentence, identify and label each word with its named entity tag (PER for person, LOC for location, ORG for organization, MISC for miscellaneous, or O for no entity).
Input: EU rejects German call to boycott British lamb .
Output: EU: B-ORG, rejects: O, German: B-MISC, call: O, to: O, boycott: O, British: B-MISC, lamb: O, .: O...


In [5]:
# Process all splits: each example becomes one instruction-format record
print("Processing training data...")
train_data = [create_instruction_format(row) for row in dataset['train']]

print("Processing validation data...")
val_data = [create_instruction_format(row) for row in dataset['validation']]

print("Processing test data...")
test_data = [create_instruction_format(row) for row in dataset['test']]

# Summarise how many records each split produced
print(
    "\nProcessed data sizes:",
    f"Train: {len(train_data)}",
    f"Validation: {len(val_data)}",
    f"Test: {len(test_data)}",
    sep="\n",
)


Processing training data...
Processing validation data...
Processing test data...

Processed data sizes:
Train: 14041
Validation: 3250
Test: 3453


In [6]:
# Save processed data as JSON files
os.makedirs("outputs/data", exist_ok=True)

# Destination file paired with the records of the split written to it
split_outputs = (
    ("outputs/data/train_instruction_data.json", train_data),
    ("outputs/data/val_instruction_data.json", val_data),
    ("outputs/data/test_instruction_data.json", test_data),
)

# Write every split as indented UTF-8 JSON; ensure_ascii=False keeps
# non-ASCII characters as-is instead of \u escapes.
for out_path, records in split_outputs:
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

print("Data saved to outputs/data/")


Data saved to outputs/data/


In [7]:
# Analyze the data
# Tag distribution: decode every tag id across train, validation and test
all_tags = [
    tag_names[tag_id]
    for split_name in ('train', 'validation', 'test')
    for row in dataset[split_name]
    for tag_id in row['ner_tags']
]

tag_counter = Counter(all_tags)
print("Tag distribution:")
for tag, count in tag_counter.most_common():
    # Share of this tag among all tags, as a percentage
    share = count/len(all_tags)*100
    print(f"{tag}: {count} ({share:.2f}%)")

# Sequence lengths: number of tokens per example, training split only
lengths = [len(row['tokens']) for row in dataset['train']]

# The value sitting 95% of the way through the sorted lengths
p95_index = int(len(lengths)*0.95)
p95_length = sorted(lengths)[p95_index]

print(
    "\nSequence length statistics:",
    f"Min length: {min(lengths)}",
    f"Max length: {max(lengths)}",
    f"Average length: {sum(lengths)/len(lengths):.2f}",
    f"95th percentile: {p95_length}",
    sep="\n",
)


Tag distribution:
O: 250660 (83.16%)
B-LOC: 10645 (3.53%)
B-PER: 10059 (3.34%)
B-ORG: 9323 (3.09%)
I-PER: 6991 (2.32%)
I-ORG: 5290 (1.76%)
B-MISC: 5062 (1.68%)
I-MISC: 1717 (0.57%)
I-LOC: 1671 (0.55%)

Sequence length statistics:
Min length: 1
Max length: 113
Average length: 14.50
95th percentile: 37
