# Data Processing

Import all the necessary libraries

In [1]:
from transformers import DistilBertForTokenClassification
from transformers import DistilBertTokenizer
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm

Import the dataset for training

In [3]:
sentence_df = pd.read_csv('/Sentences_Data.csv')

In [4]:
sentence_df

Unnamed: 0,text,label
0,Denali is North America's peak.,Denali
1,Kilimanjaro rises in Tanzania.,Kilimanjaro
2,Matterhorn graces the Alpine range.,Matterhorn
3,Pikes Peak offers stunning views.,Pikes Peak
4,Fuji dominates Japan's skyline.,Fuji
...,...,...
1444,Denali is now officially McKinley.,Denali
1445,"Shasta, spiritual landmark in California.",Shasta
1446,"Cook, New Zealand's crown jewel.",Cook
1447,"Logan, part of majestic St. Elias Range.",Logan


Define a function to preprocess the created DataFrame, tokenize the text and convert the labels into the BIO format

In [5]:
def preprocess_data(df, tokenizer):

    tokenized_texts = []
    tokenized_labels = []

    for _, row in df.iterrows():
        sentence = row['text']
        label = row['label']

        # Tokenize the sentence
        tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sentence)))
        input_ids = tokenizer.encode(sentence, return_tensors='pt').squeeze().tolist()

        # Initialize labels with 'O' for each token
        labels = ['O'] * len(input_ids)

        # Tokenize the label and find the indices where it matches the sentence tokens
        label_tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(label)))

        # Find the starting index of the label in the sentence
        start_index = tokens.index(label_tokens[0]) if label_tokens[0] in tokens else -1

        if start_index != -1:
            # Mark the starting index with 'B' (Beginning)
            labels[start_index] = 'B'

            # Mark the subsequent indices with 'I' (Inside)
            for i in range(1, len(label_tokens)):
                if start_index + i < len(labels):
                    labels[start_index + i] = 'I'

        # Append tokenized sentence and labels
        tokenized_texts.append(input_ids)
        tokenized_labels.append(labels)

    return tokenized_texts, tokenized_labels

A function to convert tokenized text and labels to model input

In [6]:
def convert_to_model_input(tokenizer, tokenized_texts, tokenized_labels, max_length=128, label2id=None):
    input_ids = []
    attention_masks = []
    new_labels = []

    for tokens, labels in zip(tokenized_texts, tokenized_labels):
        # Truncate or pad the tokens to the specified max length
        tokens = tokens[:max_length - 2]
        labels = labels[:max_length - 2]

        # Add [CLS] and [SEP] tokens
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        labels = ['O'] + labels + ['O']

        # Convert tokens and labels to input IDs
        input_id = tokenizer.convert_tokens_to_ids(tokens)

        # Use label2id dictionary, or default to 0 for 'O' if not found
        label_id = [label2id.get(label, label2id['O']) for label in labels]

        # Generate attention mask
        attn_mask = [1] * len(input_id)

        # Pad up to the maximum length
        padding_length = max_length - len(input_id)
        input_id = input_id + [0] * padding_length
        attn_mask = attn_mask + [0] * padding_length
        label_id = label_id + [label2id['O']] * padding_length

        input_ids.append(input_id)
        attention_masks.append(attn_mask)
        new_labels.append(label_id)

    return torch.tensor(input_ids), torch.tensor(attention_masks), torch.tensor(new_labels)

Define labels mapping

In [7]:
label2id = {'O': 0, 'B': 1, 'I': 2}

Split the data for training and for validation

In [8]:
train_df, valid_df = train_test_split(sentence_df, test_size=0.1, random_state=42)

Initialize the tokenizer and add the mountain names to the vocabulary

In [9]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
tokenizer.add_tokens(sentence_df['label'].unique().tolist(), special_tokens=True)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

28

Preprocessing data for training

In [10]:
tokenized_texts, tokenized_labels = preprocess_data(train_df, tokenizer)

# Model Training

Create a dataloader for training

In [11]:
input_ids, attention_masks, labels = convert_to_model_input(tokenizer, tokenized_texts, tokenized_labels,
                                                            label2id=label2id)
train_dataset = TensorDataset(input_ids, attention_masks, labels)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

Preprocess validation data

In [12]:
tokenized_texts_val, tokenized_labels_val = preprocess_data(valid_df, tokenizer)
input_ids_val, attention_masks_val, labels_val = convert_to_model_input(tokenizer, tokenized_texts_val,
                                                                        tokenized_labels_val, label2id=label2id)
valid_dataset = TensorDataset(input_ids_val, attention_masks_val, labels_val)
valid_dataloader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

Initialize the model

In [13]:
model = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label2id))
model.resize_token_embeddings(len(tokenizer))

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(30550, 768)

Start the training process

In [None]:

optimizer = AdamW(model.parameters(), lr=3e-5)

# Set up a learning rate scheduler
scheduler = StepLR(optimizer, step_size=1, gamma=0.9)  # You can adjust step_size and gamma as needed

for epoch in range(20):
    model.train()
    train_dataloader_iterator = tqdm(enumerate(train_dataloader), total=len(train_dataloader),
                                     desc=f"Epoch {epoch + 1}")
    for batch_num, batch in train_dataloader_iterator:
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Update the progress bar description with the current loss
        train_dataloader_iterator.set_postfix({'Loss': loss.item()}, refresh=True)

    # Step the learning rate scheduler
    scheduler.step()

    # Access the current learning rate
    current_lr = optimizer.param_groups[0]['lr']
    print(f"Epoch {epoch + 1}, Learning Rate: {current_lr}")

    # Model Validation
    model.eval()
    val_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for val_batch_num, val_batch in enumerate(valid_dataloader):
            val_inputs = {'input_ids': val_batch[0], 'attention_mask': val_batch[1], 'labels': val_batch[2]}
            val_outputs = model(**val_inputs)
            val_loss += val_outputs.loss.item()
            num_batches += 1

    avg_val_loss = val_loss / num_batches
    print(f"Epoch {epoch + 1}, Validation Loss: {avg_val_loss}")


Save the model and the tokenizer

In [None]:
model.save_pretrained('./ner_model')
tokenizer.save_pretrained('./ner_tokenizer')

# Model Inference

Define a function for named entity prediction

In [17]:
def tokenize_and_predict(text, model, tokenizer, label2id):
    # Tokenize input text
    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text)))
    input_ids = tokenizer.encode(text, return_tensors='pt').squeeze().tolist()

    # Make predictions
    with torch.no_grad():
        outputs = model(input_ids=torch.tensor([input_ids]))

    # Convert predicted label indices to labels
    predicted_labels = [list(label2id.keys())[i] for i in torch.argmax(outputs.logits, dim=2).squeeze().tolist()]

    # Extract entities from the predicted labels
    entities = []
    current_entity = {"text": "", "start": None, "end": None, "label": None}
    for i, (token, label) in enumerate(zip(tokens, predicted_labels)):
        if label == 'B':
            if current_entity["text"]:
                entities.append(current_entity)
            current_entity = {"text": token, "start": i, "end": i + 1, "label": label}
        elif label == 'I':
            current_entity["text"] += " " + token
            current_entity["end"] = i + 1

    if current_entity["text"]:
        entities.append(current_entity)

    return entities

Load the trained model and tokenizer

In [14]:
model = DistilBertForTokenClassification.from_pretrained('/content/drive/MyDrive/wrk/Task-1_Smilianets_NER/ner_model')
tokenizer = DistilBertTokenizer.from_pretrained('/content/drive/MyDrive/wrk/Task-1_Smilianets_NER/ner_tokenizer')
label2id = {'O': 0, 'B': 1, 'I': 2}

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Set an example text

In [15]:
input_text = "Mount Everest is the highest peak in the world."

Predict entities and show results

In [18]:
predicted_entities = tokenize_and_predict(input_text, model, tokenizer, label2id)

print("Predicted Entities:")
for entity in predicted_entities:
    print(f"Text: {entity['text']}, Label: {entity['label']}, Start: {entity['start']}, End: {entity['end']}")

Predicted Entities:


As we can see, the model doesn't seem to predict entities correctly