<a href="https://colab.research.google.com/github/Tus4ar819/delta-ai-model-training/blob/main/delta_ai_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets torch flask accelerate fsspec==2025.3.2

In [None]:
!pip install datasets

In [None]:
!pip install transformers datasets seqeval

In [None]:
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, pipeline

# Load the training data: one JSON object per line (JSONL).
# UTF-8 is requested explicitly so decoding does not depend on the platform's
# default locale, and blank lines are skipped so they do not crash json.loads.
with open('training_600.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# Build the label vocabulary.
# Collect every distinct entity type found in a record's 'labels' list;
# records without a 'labels' key contribute nothing.
all_labels = {
    ent['type']
    for item in data
    if 'labels' in item
    for ent in item['labels']
}
# Sorting makes the label -> id mapping deterministic across runs.
label_types = sorted(all_labels)
# Index 0 is 'O', followed by all B-* tags, then all I-* tags.
bio_labels = ['O'] + [f'B-{l}' for l in label_types] + [f'I-{l}' for l in label_types]
label2id = {l: i for i, l in enumerate(bio_labels)}
id2label = {i: l for l, i in label2id.items()}

# Helper to create BIO tags for a sentence
def create_bio_tags(sentence, entities):
    """Assign a BIO tag to every whitespace-separated word of ``sentence``.

    Every occurrence of an entity's word sequence in the sentence is tagged.
    Words are compared with surrounding punctuation ignored, so the token
    ``"Paris."`` matches the entity text ``"Paris"``. Entities are applied in
    order, so a later entity overwrites tags written by an earlier one.

    Args:
        sentence: Raw text; it is split on whitespace to obtain the words.
        entities: Iterable of dicts with a ``'text'`` key (the entity surface
            form) and a ``'type'`` key (label name without the B-/I- prefix).

    Returns:
        A list of tag strings, one per word: ``'B-<type>'`` on the first word
        of each match, ``'I-<type>'`` on the remaining words of the match and
        ``'O'`` everywhere else.
    """
    import string  # local import: the block stays self-contained

    def _normalize(word):
        # Drop leading/trailing punctuation; keep the word unchanged when it
        # consists only of punctuation so it never collapses to ''.
        return word.strip(string.punctuation) or word

    words = sentence.split()
    tags = ['O'] * len(words)
    normalized_words = [_normalize(word) for word in words]

    for ent in entities:
        ent_words = [_normalize(word) for word in ent['text'].split()]
        span = len(ent_words)
        if span == 0:
            # Empty or whitespace-only entity text would "match" at every
            # offset and index one past the end of `tags`; skip it.
            continue
        for start in range(len(words) - span + 1):
            if normalized_words[start:start + span] == ent_words:
                tags[start] = f'B-{ent["type"]}'
                for offset in range(1, span):
                    tags[start + offset] = f'I-{ent["type"]}'
    return tags

# Prepare examples
# Every record that carries a 'labels' key becomes one training example:
# its whitespace-split words plus the integer ids of their BIO tags.
# Records without 'labels' are left out.
examples = [
    {
        'tokens': item['queries'].split(),
        'ner_tags': [
            label2id[tag]
            for tag in create_bio_tags(item['queries'], item['labels'])
        ],
    }
    for item in data
    if 'labels' in item
]

# Create Hugging Face Dataset (column-oriented: one list per field)
token_column = [ex['tokens'] for ex in examples]
tag_column = [ex['ner_tags'] for ex in examples]
dataset = Dataset.from_dict({'tokens': token_column, 'ner_tags': tag_column})

# Tokenizer used by tokenize_and_align_labels and saved alongside the model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_and_align_labels(example):
    """Tokenize one pre-split example and align its word tags to sub-tokens.

    Args:
        example: Mapping with ``"tokens"`` (list of words) and ``"ner_tags"``
            (one integer tag id per word).

    Returns:
        The tokenizer output (truncated/padded to 64 positions) with an added
        ``"labels"`` entry. The first sub-token of each word carries that
        word's tag id; every other position (special/padding tokens and
        continuation sub-tokens) is set to -100, the value the training loss
        is expected to ignore.
    """
    encoding = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=64,
    )

    aligned = []
    last_word = None
    for current_word in encoding.word_ids():
        # Only the first sub-token of a real word gets the word's label.
        starts_new_word = current_word is not None and current_word != last_word
        aligned.append(example["ner_tags"][current_word] if starts_new_word else -100)
        last_word = current_word

    encoding["labels"] = aligned
    return encoding

# Apply the alignment function example by example (batched=False), matching
# tokenize_and_align_labels, which indexes a single example's fields.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False)

# Model
# Token-classification head sized to the BIO label set; the id<->label maps
# are stored in the model config so predictions come back with label names.
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(bio_labels),
    id2label=id2label,
    label2id=label2id
)

# Data collator
# Builds the training batches from the tokenized examples.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Training arguments
# Checkpoints go to "ner_out", logs to "ner_logs"; report_to="none" is set so
# no external experiment tracker is configured.
args = TrainingArguments(
    output_dir="ner_out",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_dir="ner_logs",
    report_to="none"
)

# Trainer
# The whole tokenized dataset is used for training; no evaluation split is
# passed here.
# NOTE(review): recent transformers releases deprecate the `tokenizer=`
# argument of Trainer in favour of `processing_class=` - confirm against the
# installed version.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

# Train
trainer.train()

# Save model and tokenizer
# Both are written to the same directory so it can be loaded by name below.
model.save_pretrained("my_trained_ner_model")
tokenizer.save_pretrained("my_trained_ner_model")

# Inference pipeline
# Reloads the model and tokenizer from the directory saved above.
ner_pipeline = pipeline("ner", model="my_trained_ner_model", tokenizer="my_trained_ner_model", aggregation_strategy="simple")

# Example prediction
test_sentence = "Alice Smith adopted a cat named Leo in Paris."
result = ner_pipeline(test_sentence)
print(result)

In [None]:
# Inference pipeline: load the saved model and tokenizer from disk
ner_pipeline = pipeline(
    "ner",
    model="my_trained_ner_model",
    tokenizer="my_trained_ner_model",
    aggregation_strategy="simple",
)

# Example queries covering all entity types
# (the trailing comment on each line lists the entity types it is meant to exercise)
example_queries = [
    "Alice Smith adopted a cat named Leo in Paris.",  # PERSON, ANIMAL, CITY
    "Tesla launched its new Taj Mahal in New York.",  # ORG, THING, CITY
    "Emma Davis, originally from Australia, now lives in Paris.",  # PERSON, COUNTRY, CITY
    "A wild lion was spotted near the Great Wall of China in Canada.",  # ANIMAL, THING, COUNTRY
    "During the Comic-Con, John Doe gave a speech at the Great Wall of China.",  # EVENT, PERSON, THING
    "The Amazon headquarters are located in Paris, Italy.",  # ORG, CITY, COUNTRY
    "In Japan, people celebrate Olympic Games with great enthusiasm.",  # COUNTRY, EVENT
    "The Statue of Liberty has become a symbol of New York's history.",  # THING, CITY
    "Researchers at IBM discovered a new species of dolphin in Italy.",  # ORG, ANIMAL, COUNTRY
    "Every year, the Cannes Festival is held in Sydney, attracting visitors worldwide.",  # EVENT, CITY
]

# Print one report per query: the query text, then either each detected
# entity (text, type, score to two decimals) or a "none found" line,
# followed by a "-" separator.
for query in example_queries:
    result = ner_pipeline(query)
    print(f"Query: {query}")
    if not result:
        print("  No entities found.")
    else:
        for entity in result:
            print(f"  Entity: '{entity['word']}' | Type: {entity['entity_group']} | Score: {entity['score']:.2f}")
    print("-")
