In [2]:
%%capture
!pip install transformers
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
import pandas as pd
import torch 
import numpy as np
from transformers import AutoTokenizer, AutoModelForMaskedLM
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.optim import SGD

# Read Spacy Data

In [4]:
import spacy
import en_core_web_sm
from spacy.tokens import DocBin
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)

unique_labels = ['', 'CALLSIGN']

labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
ids_to_labels = {v: k for v, k in enumerate(sorted(unique_labels))}

label_all_tokens = False

def load_dataset(data_path):
    db = DocBin().from_disk(data_path)
    docs = list(db.get_docs(nlp.vocab))
    dataset = []

    for doc in docs:
        #convert doc to list of tokens
        tokens = [token.text for token in doc]
        #convert doc to list of tags
        tags = [token.ent_type_ for token in doc]
        # print the dependency tree

        utterance = []
        utterance.append(tokens)
        utterance.append(tags)

        dataset.append(utterance)
    
    return dataset


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

# Initialize Tokenizer

# Create Dataset Class 

In [5]:


def align_label(tokens, labels):
    # tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)
    tokenized_inputs = tokenizer(tokens, is_split_into_words=True, padding='max_length', max_length=128, truncation=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]])
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

class DataSequence(torch.utils.data.Dataset):

    def __init__(self, df):
        #for each row in the dataframe, get the text and the label
        txt = [d[0] for d in df]
        lb = [d[1] for d in df]

        self.texts = [tokenizer(str(i),
                               padding='max_length', max_length = 128, truncation=True, return_tensors="pt") for i in txt]
        self.labels = [align_label(i,j) for i,j in zip(txt, lb)]

    def __len__(self):

        return len(self.labels)

    def get_batch_data(self, idx):

        return self.texts[idx]

    def get_batch_labels(self, idx):

        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):

        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)

        return batch_data, batch_labels

# Split Data and Define Unique Labels

In [65]:
df_train = load_dataset("/kaggle/input/all-751515-lowercase/train.spacy")
df_val = load_dataset("/kaggle/input/all-751515-lowercase/validation.spacy")
df_test = load_dataset("/kaggle/input/all-751515-lowercase/test.spacy")

train_dataset = DataSequence(df_train)

#print the first row of the dataset and the original text
print(train_dataset[0])



# df = df[0:1000]

# labels = [i.split() for i in df['labels'].values.tolist()]
# unique_labels = set()

# for lb in labels:
#         [unique_labels.add(i) for i in lb if i not in unique_labels]
# labels_to_ids = {k: v for v, k in enumerate(unique_labels)}
# ids_to_labels = {v: k for v, k in enumerate(unique_labels)}

# df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
#                             [int(.8 * len(df)), int(.9 * len(df))])

({'input_ids': tensor([[    0, 47052,   366,  5901,  3934,   128,   330, 11313,  3934,   128,
         21466, 11483, 12179,  3934,   128,   611, 11278,   324,  3695,  3934,
           128, 29135,  3934,   128,  6968,  3934,   128,  1322,  3934,   128,
           459, 13286,  3934,   128,    90,  1916,  3934,   128,   642,   763,
          1999,  3934,   128, 43067,  3934,   128,   560,  3934,   128,   642,
           763,  1999,  3934,   128, 23999,  3934,   128,  1264,  3934,   128,
          7109,  3934,   128, 13664,  3934,   128, 11127, 16980,  3934,   128,
          1264,  3934,   128,  3695,  3934,   128, 22118,   352,  7305,  1438,
         44403,     2,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1, 

# Build Model

In [66]:
class BertModel(torch.nn.Module):

    def __init__(self):

        super(BertModel, self).__init__()
        self.bert = AutoModelForMaskedLM.from_pretrained("roberta-base")
        # self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(unique_labels))

    def forward(self, input_id, mask, label):

        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

        return output

# Model Training

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [67]:

def train_loop(model, df_train, df_val):

    train_dataset = DataSequence(df_train)
    val_dataset = DataSequence(df_val)

    train_dataloader = DataLoader(train_dataset, num_workers=2, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, num_workers=2, batch_size=BATCH_SIZE)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    optimizer = SGD(model.parameters(), lr=LEARNING_RATE)

    if use_cuda:
        model = model.cuda()

    best_acc = 0
    best_loss = 1000

    for epoch_num in range(EPOCHS):

        total_acc_train = 0
        total_loss_train = 0

        model.train()

        for train_data, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            mask = train_data['attention_mask'].squeeze(1).to(device)
            input_id = train_data['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)

            for i in range(logits.shape[0]):

              logits_clean = logits[i][train_label[i] != -100]
              label_clean = train_label[i][train_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              total_acc_train += acc
              total_loss_train += loss.item()

            loss.backward()
            optimizer.step()

        model.eval()

        total_acc_val = 0
        total_loss_val = 0

        for val_data, val_label in val_dataloader:

            val_label = val_label.to(device)
            mask = val_data['attention_mask'].squeeze(1).to(device)
            input_id = val_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, val_label)

            for i in range(logits.shape[0]):

              logits_clean = logits[i][val_label[i] != -100]
              label_clean = val_label[i][val_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              total_acc_val += acc
              total_loss_val += loss.item()

        val_accuracy = total_acc_val / len(df_val)
        val_loss = total_loss_val / len(df_val)

        print(
            f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(df_train): .3f} | Accuracy: {total_acc_train / len(df_train): .3f} | Val_Loss: {total_loss_val / len(df_val): .3f} | Accuracy: {total_acc_val / len(df_val): .3f}')

LEARNING_RATE = 5e-3
EPOCHS = 16
BATCH_SIZE = 2

model = BertModel()
train_loop(model, df_train, df_val)

100%|██████████| 985/985 [00:53<00:00, 18.38it/s]


Epochs: 1 | Loss:  0.868 | Accuracy:  0.553 | Val_Loss:  0.657 | Accuracy:  0.608


100%|██████████| 985/985 [00:53<00:00, 18.31it/s]


Epochs: 2 | Loss:  0.659 | Accuracy:  0.606 | Val_Loss:  0.619 | Accuracy:  0.651


100%|██████████| 985/985 [00:53<00:00, 18.35it/s]


Epochs: 3 | Loss:  0.645 | Accuracy:  0.620 | Val_Loss:  0.588 | Accuracy:  0.657


100%|██████████| 985/985 [00:53<00:00, 18.40it/s]


Epochs: 4 | Loss:  0.610 | Accuracy:  0.629 | Val_Loss:  0.635 | Accuracy:  0.608


100%|██████████| 985/985 [00:53<00:00, 18.42it/s]


Epochs: 5 | Loss:  0.524 | Accuracy:  0.716 | Val_Loss:  0.667 | Accuracy:  0.608


100%|██████████| 985/985 [00:53<00:00, 18.37it/s]


Epochs: 6 | Loss:  0.325 | Accuracy:  0.859 | Val_Loss:  0.229 | Accuracy:  0.903


100%|██████████| 985/985 [00:53<00:00, 18.45it/s]


Epochs: 7 | Loss:  0.237 | Accuracy:  0.903 | Val_Loss:  0.182 | Accuracy:  0.927


100%|██████████| 985/985 [00:53<00:00, 18.39it/s]


Epochs: 8 | Loss:  0.180 | Accuracy:  0.930 | Val_Loss:  0.196 | Accuracy:  0.925


100%|██████████| 985/985 [00:53<00:00, 18.43it/s]


Epochs: 9 | Loss:  0.157 | Accuracy:  0.939 | Val_Loss:  0.151 | Accuracy:  0.942


100%|██████████| 985/985 [00:53<00:00, 18.36it/s]


Epochs: 10 | Loss:  0.137 | Accuracy:  0.946 | Val_Loss:  0.154 | Accuracy:  0.945


100%|██████████| 985/985 [00:53<00:00, 18.40it/s]


Epochs: 11 | Loss:  0.129 | Accuracy:  0.952 | Val_Loss:  0.155 | Accuracy:  0.951


100%|██████████| 985/985 [00:53<00:00, 18.43it/s]


Epochs: 12 | Loss:  0.112 | Accuracy:  0.958 | Val_Loss:  0.142 | Accuracy:  0.946


100%|██████████| 985/985 [00:53<00:00, 18.46it/s]


Epochs: 13 | Loss:  0.100 | Accuracy:  0.963 | Val_Loss:  0.142 | Accuracy:  0.952


100%|██████████| 985/985 [00:53<00:00, 18.50it/s]


Epochs: 14 | Loss:  0.089 | Accuracy:  0.967 | Val_Loss:  0.172 | Accuracy:  0.951


100%|██████████| 985/985 [00:53<00:00, 18.39it/s]


Epochs: 15 | Loss:  0.087 | Accuracy:  0.968 | Val_Loss:  0.143 | Accuracy:  0.952


100%|██████████| 985/985 [00:53<00:00, 18.42it/s]


Epochs: 16 | Loss:  0.077 | Accuracy:  0.973 | Val_Loss:  0.154 | Accuracy:  0.954


## Save/Load Model

In [9]:
model.eval()
torch.save(model, "model.pt")

In [None]:
model2 = torch.load("model.pt", map_location=torch.device('cuda'))
model2.eval()

# Evaluate Model

In [69]:
def evaluate(model, df_test):

    test_dataset = DataSequence(df_test)

    test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    total_acc_test = 0.0

    iterator = 0

    for test_data, test_label in test_dataloader:
            

        test_label = test_label.to(device)
        mask = test_data['attention_mask'].squeeze(1).to(device)

        input_id = test_data['input_ids'].squeeze(1).to(device)

        loss, logits = model(input_id, mask, test_label)

        for i in range(logits.shape[0]):

            logits_clean = logits[i][test_label[i] != -100]
            label_clean = test_label[i][test_label[i] != -100]

            predictions = logits_clean.argmax(dim=1)
            acc = (predictions == label_clean).float().mean()
            total_acc_test += acc
            if iterator < 10:
              print("-----------------")
              print('Predictions: ', predictions)
              print('Labels: ', label_clean)
              print('Accuracy: ', acc)
              iterator += 1

        iterator += 1

    val_accuracy = total_acc_test / len(df_test)
    print(f'Test Accuracy: {total_acc_test / len(df_test): .3f}')


evaluate(model, df_test)

-----------------
Predictions:  tensor([1, 1, 1, 1, 1, 0, 0, 0], device='cuda:0')
Labels:  tensor([1, 1, 1, 1, 1, 0, 0, 0], device='cuda:0')
Accuracy:  tensor(1., device='cuda:0')
-----------------
Predictions:  tensor([0, 0, 0, 1, 1, 1, 1, 1], device='cuda:0')
Labels:  tensor([0, 0, 0, 1, 1, 1, 1, 1], device='cuda:0')
Accuracy:  tensor(1., device='cuda:0')
-----------------
Predictions:  tensor([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
Labels:  tensor([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
Accuracy:  tensor(1., device='cuda:0')
-----------------
Predictions:  tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], device='cuda:0')
Labels:  tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], device='cuda:0')
Accuracy:  tensor(1., device='cuda:0')
-----------------
Predictions:  tensor([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
Labels:  tensor([1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
Accuracy:  tensor(0.9231, device='cud

# Predict One Sentence

In [75]:
def evaluate_return(model, df_test):

    test_dataset = DataSequence(df_test)

    test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()
    
    results = []

    for test_data, test_label in test_dataloader:
            

        test_label = test_label.to(device)
        mask = test_data['attention_mask'].squeeze(1).to(device)

        input_id = test_data['input_ids'].squeeze(1).to(device)

        loss, logits = model(input_id, mask, test_label)

        for i in range(logits.shape[0]):
            
            result = []

            logits_clean = logits[i][test_label[i] != -100]
            label_clean = test_label[i][test_label[i] != -100]

            predictions = logits_clean.argmax(dim=1)
            
            original_sentence = ""
            for d in df_test:
                original_sentence = ' '.join(d[0])
            
            result.append(original_sentence)
            result.append(predictions)
            
            results.append(result)
    return results

def evaluate_single(model,text):
    tokens = text.split()
    tags=[''] * len(tokens)

    test_data = []
    test_data.append(tokens)
    test_data.append(tags)

    test_df = []
    test_df.append(test_data)

    return evaluate_return(model, test_df)[0]

print(evaluate_single(model,"Foxtrot Charlie Kilo Two Charlie Bravo Descent Level Two Hundred"))

['Foxtrot Charlie Kilo Two Charlie Bravo Descent Level Two Hundred', tensor([1, 1, 1, 1, 1, 1, 0, 0, 0, 0], device='cuda:0')]
