In [1]:
import json
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict, concatenate_datasets
from sklearn.model_selection import train_test_split
import re
import numpy as np
from transformers import AutoModelForTokenClassification,AutoTokenizer
from transformers import pipeline
import time
from transformers import DataCollatorForTokenClassification
import evaluate
from transformers import TrainingArguments , Trainer

  from .autonotebook import tqdm as notebook_tqdm


# Function to take chatgpt given data and convert it to json

In [2]:
def pos(inp):
    out = []
    for i in inp.split():
        if i in tags.keys():
            out.append(tags[i])
        else:
            out.append(tags['O'])
    return out

def replace_words_with_adjacent(inp):
    pattern = r'\[(.*?)\]\s*\((.*?)\)'
    matches = re.findall(pattern, inp)

    for match in matches:
        words_inside_brackets = match[0]
        word_inside_parentheses = match[1]
        replacement = ' '.join([word_inside_parentheses] * words_inside_brackets.count(' ') + [word_inside_parentheses])
        inp = inp.replace(f"[{words_inside_brackets}] ({word_inside_parentheses})", f"[{replacement}] ({word_inside_parentheses})")
    inp = re.sub(r'\([^)]*\)', '', inp)
    inp = inp.replace('[', '').replace(']', '')
    inp = prefix_words(inp)
    inp = pos(inp)
    out = [key for val in inp for key, value in tags.items() if value == val]
    return (inp,out)

def tokenize(inp):
    tokens = inp.split()
    tokens = [i for i in tokens if i[0]!='(']
    tokens = [item.replace('[', '').replace(']', '') for item in tokens]
    return tokens

def prefix_words(sentence):
    words = sentence.split()
    word_counts = {}
    prefixed_words = []

    for word in words:
        if word in word_counts:
            word_counts[word] += 1
            prefixed_words.append(f"I-{word}")
        else:
            word_counts[word] = 1
            prefixed_words.append(f"B-{word}")

    return " ".join(prefixed_words)

In [3]:
with open('/workspace/ner_data.txt','r+') as f:
    inp = f.readlines()
tags={'O': 0,'B-skew' : 1,'I-skew':2,'B-price':3,'I-price':4,'B-comparison':5,'I-comparison':6}
rows = []

for index,i in enumerate(inp):
    token = tokenize(i)
    ner_ids,ner_tags = replace_words_with_adjacent(i)
    row_data = {'id':index,'token': token,'ner_ids':ner_ids,'ner_tags':ner_tags}
    rows.append(row_data)

json_object = json.dumps(rows)

with open('/workspace/new_ner_data.txt', "w") as file:
    file.write(json_object)

# Convert JSON data to dataset object for training

In [3]:
with open('/workspace/new_ner_data.txt','r+') as f:
    json_data = json.load(f)
label_names=['O','B-skew','I-skew','B-price','I-price','B-comparison','I-comparison']

# Step 2: Split the data into train-validation-test sets (80-10-10 split)
train_data, temp_data = train_test_split(json_data, test_size=0.2, random_state=42)
validation_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Step 3: Convert the data into a suitable format for DatasetDict
def create_dataset_from_data(data):
    dataset_dict = {
        "id": [item["id"] for item in data],
        "tokens": [item["token"] for item in data],
        "ner_tags": [item["ner_tags"] for item in data],
        "ner_ids": [item["ner_ids"] for item in data],
        # Add other features (pos_tags, chunk_tags, etc.) if available in your JSON
    }
    return Dataset.from_dict(dataset_dict)

# Step 4: Create the DatasetDict object and populate it with the converted data
raw_datasets = DatasetDict({
    "train": create_dataset_from_data(train_data),
    "validation": create_dataset_from_data(validation_data),
    "test": create_dataset_from_data(test_data),
})

# Print the number of rows in each split
print(f"Number of rows in train split: {len(raw_datasets['train'])}")
print(f"Number of rows in validation split: {len(raw_datasets['validation'])}")
print(f"Number of rows in test split: {len(raw_datasets['test'])}")


Number of rows in train split: 3848
Number of rows in validation split: 481
Number of rows in test split: 481


In [4]:
raw_datasets['train']

Dataset({
    features: ['id', 'tokens', 'ner_tags', 'ner_ids'],
    num_rows: 3848
})

# Functions for training

In [5]:
model_checkpoint = "/workspace/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [6]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [7]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_ids"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [8]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = evaluate.load("seqeval")

Map:   0% 0/3848 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100% 3848/3848 [00:00<00:00, 18313.14 examples/s]
Map: 100% 481/481 [00:00<00:00, 16498.15 examples/s]
Map: 100% 481/481 [00:00<00:00, 18692.65 examples/s]
Downloading builder script: 100% 6.34k/6.34k [00:00<00:00, 4.10MB/s]


In [9]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [10]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [11]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)
args = TrainingArguments(
    output_dir="/workspace/results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
)

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at /workspace/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.003532,0.998111,0.999055,0.998583,0.999626
2,No log,0.003016,0.998111,0.999055,0.998583,0.99944
3,0.110200,0.002915,0.99811,0.99811,0.99811,0.999066
4,0.110200,0.001358,0.999055,0.999055,0.999055,0.999813
5,0.002300,0.001175,0.998111,0.999055,0.998583,0.999626
6,0.002300,0.001635,0.996223,0.997164,0.996693,0.99944
7,0.001000,0.000311,1.0,1.0,1.0,1.0
8,0.001000,0.000255,1.0,1.0,1.0,1.0
9,0.000600,0.000753,0.998111,0.999055,0.998583,0.999626
10,0.000600,0.000488,0.999056,1.0,0.999528,0.999813


TrainOutput(global_step=2410, training_loss=0.023753847997960213, metrics={'train_runtime': 886.974, 'train_samples_per_second': 43.383, 'train_steps_per_second': 2.717, 'total_flos': 189876533342496.0, 'train_loss': 0.023753847997960213, 'epoch': 10.0})

# Save the model

In [12]:
trainer.save_model('/workspace/Distilbert_NER')