In [None]:
!pip install transformers datasets torch flask accelerate fsspec==2025.3.2

In [4]:
!pip install datasets

Collecting datasets
  Using cached datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloa

In [7]:
!pip install transformers datasets seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=cad2a5cb8ce6e2d5b38ac6ce6283d25009bd25f3d85d4813d83402b255760f22
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [19]:
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, pipeline

# Load your data
with open('training_600.jsonl', 'r') as f:
    data = [json.loads(line) for line in f]

# Build label list
all_labels = set()
for item in data:
     if 'labels' in item:
        for ent in item['labels']:
            all_labels.add(ent['type'])
label_types = sorted(list(all_labels))
bio_labels = ['O'] + [f'B-{l}' for l in label_types] + [f'I-{l}' for l in label_types]
label2id = {l: i for i, l in enumerate(bio_labels)}
id2label = {i: l for l, i in label2id.items()}

# Helper to create BIO tags for a sentence
def create_bio_tags(sentence, entities):
    words = sentence.split()
    tags = ['O'] * len(words)
    for ent in entities:
        ent_words = ent['text'].split()
        for i in range(len(words) - len(ent_words) + 1):
            if words[i:i+len(ent_words)] == ent_words:
                tags[i] = f'B-{ent["type"]}'
                for j in range(1, len(ent_words)):
                    tags[i+j] = f'I-{ent["type"]}'
    return tags

# Prepare examples
# ...existing code...
examples = []
for item in data:
    if 'labels' in item:
        words = item['queries'].split()
        tags = create_bio_tags(item['queries'], item['labels'])
        examples.append({'tokens': words, 'ner_tags': [label2id[tag] for tag in tags]})
# ...existing code...

# Create Hugging Face Dataset
dataset = Dataset.from_dict({
    'tokens': [ex['tokens'] for ex in examples],
    'ner_tags': [ex['ner_tags'] for ex in examples]
})

# Tokenize
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], is_split_into_words=True, truncation=True, padding='max_length', max_length=64)
    labels = []
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(example["ner_tags"][word_idx])
        else:
            labels.append(-100)
        previous_word_idx = word_idx
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False)

# Model
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(bio_labels),
    id2label=id2label,
    label2id=label2id
)

# Data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# Training arguments
args = TrainingArguments(
    output_dir="ner_out",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_dir="ner_logs",
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

# Train
trainer.train()

# Save model and tokenizer
model.save_pretrained("my_trained_ner_model")
tokenizer.save_pretrained("my_trained_ner_model")

# Inference pipeline
ner_pipeline = pipeline("ner", model="my_trained_ner_model", tokenizer="my_trained_ner_model", aggregation_strategy="simple")

# Example prediction
test_sentence = "Alice Smith adopted a cat named Leo in Paris."
result = ner_pipeline(test_sentence)
print(result)

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


Device set to use cuda:0


[{'entity_group': 'PERSON', 'score': 0.9954779, 'word': 'alice smith', 'start': 0, 'end': 11}, {'entity_group': 'ANIMAL', 'score': 0.9920065, 'word': 'cat', 'start': 22, 'end': 25}]


In [20]:
# Inference pipeline
ner_pipeline = pipeline("ner", model="my_trained_ner_model", tokenizer="my_trained_ner_model", aggregation_strategy="simple")

# Example queries covering all entity types
example_queries = [
    "Alice Smith adopted a cat named Leo in Paris.",  # PERSON, ANIMAL, CITY
    "Tesla launched its new Taj Mahal in New York.",  # ORG, THING, CITY
    "Emma Davis, originally from Australia, now lives in Paris.",  # PERSON, COUNTRY, CITY
    "A wild lion was spotted near the Great Wall of China in Canada.",  # ANIMAL, THING, COUNTRY
    "During the Comic-Con, John Doe gave a speech at the Great Wall of China.",  # EVENT, PERSON, THING
    "The Amazon headquarters are located in Paris, Italy.",  # ORG, CITY, COUNTRY
    "In Japan, people celebrate Olympic Games with great enthusiasm.",  # COUNTRY, EVENT
    "The Statue of Liberty has become a symbol of New York's history.",  # THING, CITY
    "Researchers at IBM discovered a new species of dolphin in Italy.",  # ORG, ANIMAL, COUNTRY
    "Every year, the Cannes Festival is held in Sydney, attracting visitors worldwide."  # EVENT, CITY
]

for query in example_queries:
    result = ner_pipeline(query)
    print(f"Query: {query}")
    if result:
        for ent in result:
            print(f"  Entity: '{ent['word']}' | Type: {ent['entity_group']} | Score: {ent['score']:.2f}")
    else:
        print("  No entities found.")
    print("-")


Device set to use cuda:0


Query: Alice Smith adopted a cat named Leo in Paris.
  Entity: 'alice smith' | Type: PERSON | Score: 1.00
  Entity: 'cat' | Type: ANIMAL | Score: 0.99
-
Query: Tesla launched its new Taj Mahal in New York.
  Entity: 'tesla' | Type: ORG | Score: 0.99
  Entity: 'ta' | Type: THING | Score: 0.99
  Entity: '##j mahal' | Type: THING | Score: 0.92
-
Query: Emma Davis, originally from Australia, now lives in Paris.
  No entities found.
-
Query: A wild lion was spotted near the Great Wall of China in Canada.
  Entity: 'lion' | Type: ANIMAL | Score: 0.99
  Entity: 'great wall of china' | Type: THING | Score: 0.99
-
Query: During the Comic-Con, John Doe gave a speech at the Great Wall of China.
  Entity: 'john doe' | Type: PERSON | Score: 1.00
-
Query: The Amazon headquarters are located in Paris, Italy.
  Entity: 'amazon' | Type: ORG | Score: 0.99
-
Query: In Japan, people celebrate Olympic Games with great enthusiasm.
  Entity: 'olympic games' | Type: EVENT | Score: 0.99
-
Query: The Statue of 