# Data Preparation for NER Label Generation

This notebook prepares the CoNLL-2003 dataset for instruction tuning. We'll convert the traditional BIO-style NER tags into a text-generation format (a token list as input, a JSON object of entity mentions as output) suitable for training a Qwen 0.5B model.


In [15]:
# Install the notebook's dependencies into the active kernel's environment (-q keeps pip quiet).
# NOTE(review): the visible cells only import `datasets`; the other packages are
# presumably needed by the later fine-tuning steps — TODO confirm, and consider
# pinning versions so a fresh run reproduces the same environment.
%pip install datasets transformers peft accelerate bitsandbytes trl -q


Note: you may need to restart the kernel to use updated packages.


In [16]:
# Imports
import json
import os
from datasets import load_dataset


In [17]:
# Fetch the CoNLL-2003 corpus; the result is indexed by split name
# ('train', 'validation', 'test').
dataset = load_dataset("conll2003")

# Names of the NER tags, indexed by tag id (tag_names[tag_id] -> e.g. 'B-PER').
ner_feature = dataset['train'].features['ner_tags']
tag_names = ner_feature.feature.names

# Quick sanity check: split sizes, the tag vocabulary, and the first sentence.
for summary_line in (
    f"Dataset loaded: Train={len(dataset['train'])}, Val={len(dataset['validation'])}, Test={len(dataset['test'])}",
    f"Tag names: {tag_names}",
    f"Sample tokens: {dataset['train'][0]['tokens']}",
):
    print(summary_line)


Dataset loaded: Train=14041, Val=3250, Test=3453
Tag names: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
Sample tokens: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']


In [18]:
# Show example: Convert BIO tags to JSON format (worked through on the first training sentence)
sample = dataset['train'][0]
tokens = sample['tokens']
ner_tags = sample['ner_tags']

# Convert tag indices to tag names (e.g., 0->O, 3->B-ORG)
tags = [tag_names[tag_id] for tag_id in ner_tags]

# One list of mentions per entity type; each mention is itself a list of tokens
result = {"PER": [], "LOC": [], "ORG": [], "MISC": []}
current_entity = None   # tokens of the mention currently being built, or None
current_type = None     # entity type of that mention, or None

# Walk the sentence once, opening/extending/closing mentions as the tags dictate
for token, tag in zip(tokens, tags):
    is_entity_tag = tag.startswith('B-') or tag.startswith('I-')
    # Entity type (PER, LOC, ORG, or MISC) is whatever follows the 'B-'/'I-' prefix
    entity_type = tag[2:] if is_entity_tag else None

    if tag.startswith('I-') and current_entity is not None and current_type == entity_type:
        # Inside: the tag continues the open mention of the same type
        current_entity.append(token)
        continue

    # Every other tag (O, B-*, or an I-* that does not match the open mention)
    # ends the open mention, if there is one
    if current_entity:
        result[current_type].append(current_entity)
        current_entity = None
        current_type = None

    if is_entity_tag:
        # B-* starts a new mention. A stray I-* (no open mention, or a different
        # type) also starts one, so its token is not silently dropped or glued
        # onto an earlier mention across a gap.
        current_type = entity_type
        current_entity = [token]

# Handle any remaining entity at the end
if current_entity:
    result[current_type].append(current_entity)

# Create instruction
instruction = 'From TOKENS, return JSON: {"PER":[[...]],"LOC":[[...]],"ORG":[[...]],"MISC":[[...]]}; each mention is a list of input tokens; JSON only.'
input_text = str(tokens)
# Compact separators keep the target string free of optional whitespace
output_text = json.dumps(result, separators=(',', ':'))

print("Example output:")
print(f"Input: {input_text}")
print(f"Output: {output_text}")
print(f"\nResult dict: {result}")


Example output:
Input: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
Output: {"PER":[],"LOC":[],"ORG":[["EU"]],"MISC":[["German"],["British"]]}

Result dict: {'PER': [], 'LOC': [], 'ORG': [['EU']], 'MISC': [['German'], ['British']]}


In [19]:
# Prompt shared by every record: asks for a JSON object keyed by entity type,
# where each mention is given as a list of the input tokens it covers.
instruction = (
    'From TOKENS, return JSON: '
    '{"PER":[[...]],"LOC":[[...]],"ORG":[[...]],"MISC":[[...]]}; '
    'each mention is a list of input tokens; JSON only.'
)

# Helper function: Convert BIO tags to JSON format
def bio_to_json(tokens, ner_tags, label_names=None):
    """Group BIO-tagged tokens into entity mentions keyed by entity type.

    Args:
        tokens: Sequence of token strings for one sentence.
        ner_tags: Sequence of integer tag ids, parallel to ``tokens``.
        label_names: Optional sequence mapping a tag id to its tag string
            (e.g. ``'B-PER'``). When omitted, the notebook-level ``tag_names``
            is used, so existing two-argument calls behave as before.

    Returns:
        A dict with the keys ``"PER"``, ``"LOC"``, ``"ORG"`` and ``"MISC"``.
        Each value is a list of mentions in sentence order, and each mention
        is a list of the tokens it covers.

    Raises:
        IndexError: If a tag id is out of range for the label list.
        KeyError: If a tag names an entity type other than the four above.

    Notes:
        An ``I-X`` tag that does not continue an open ``X`` mention (there is
        no open mention, or the open one has a different type) closes the open
        mention and starts a new ``X`` mention. Previously such a token was
        skipped while the old mention stayed open, which could drop tokens or
        join non-adjacent tokens into one mention.
    """
    # Only touch the global when the caller did not supply a label list
    names = tag_names if label_names is None else label_names

    result = {"PER": [], "LOC": [], "ORG": [], "MISC": []}
    current_entity = None   # tokens of the mention being built, or None
    current_type = None     # entity type of that mention, or None

    for token, tag_id in zip(tokens, ner_tags):
        tag = names[tag_id]
        is_begin = tag.startswith('B-')
        is_inside = tag.startswith('I-')
        # Entity type (PER, LOC, ORG, or MISC) follows the two-character prefix
        entity_type = tag[2:] if (is_begin or is_inside) else None

        if is_inside and current_entity is not None and entity_type == current_type:
            # Inside tag: continue the open mention of the same type
            current_entity.append(token)
            continue

        # Any other tag (O, B-*, or a non-matching I-*) ends the open mention
        if current_entity is not None:
            result[current_type].append(current_entity)
            current_entity = None
            current_type = None

        if is_begin or is_inside:
            # B-* opens a mention; a stray I-* is treated the same way
            current_type = entity_type
            current_entity = [token]

    # Handle a mention that runs to the end of the sentence
    if current_entity is not None:
        result[current_type].append(current_entity)

    return result

# Process all datasets
def _make_record(example):
    """Build one instruction-tuning record from a dataset example.

    Reads ``example['tokens']`` and ``example['ner_tags']`` and returns a dict
    with the keys 'instruction', 'input', 'output' and 'text'. The input and
    output strings are computed once and reused inside 'text'.
    """
    # NOTE(review): the input is the Python list repr (single quotes) while the
    # output is JSON; kept as-is since consumers of these files may rely on it.
    input_text = str(example['tokens'])
    entities = bio_to_json(example['tokens'], example['ner_tags'])
    # Compact separators keep the target string free of optional whitespace
    output_text = json.dumps(entities, separators=(',', ':'))
    return {
        'instruction': instruction,
        'input': input_text,
        'output': output_text,
        'text': f"{instruction}\nInput: {input_text}\nOutput: {output_text}"
    }

print("Processing training data...")
train_data = [_make_record(example) for example in dataset['train']]

print("Processing validation data...")
val_data = [_make_record(example) for example in dataset['validation']]

print("Processing test data...")
test_data = [_make_record(example) for example in dataset['test']]

print(f"\nProcessed: Train={len(train_data)}, Val={len(val_data)}, Test={len(test_data)}")


Processing training data...
Processing validation data...
Processing test data...

Processed: Train=14041, Val=3250, Test=3453


In [20]:
# Write each processed split to its own JSON file under outputs/data/
os.makedirs("outputs/data", exist_ok=True)

# (destination path, records) for the train, validation and test splits, in that order
split_outputs = (
    ("outputs/data/train_instruction_data.json", train_data),
    ("outputs/data/val_instruction_data.json", val_data),
    ("outputs/data/test_instruction_data.json", test_data),
)

for out_path, records in split_outputs:
    # ensure_ascii=False writes non-ASCII characters as-is; the file is UTF-8
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

print("Data saved to outputs/data/")


Data saved to outputs/data/


In [21]:
# Peek at one randomly chosen training record: the first 200 characters of its
# combined prompt text. No seed is set, so the pick differs from run to run.
import random
sample_idx = random.randint(0, len(train_data) - 1)
sample_preview = train_data[sample_idx]['text'][:200]
print(f"\nSample {sample_idx}:")
print(sample_preview + "...")



Sample 5735:
From TOKENS, return JSON: {"PER":[[...]],"LOC":[[...]],"ORG":[[...]],"MISC":[[...]]}; each mention is a list of input tokens; JSON only.
Input: ['0-0', '.']
Output: {"PER":[],"LOC":[],"ORG":[],"MISC":...
