In [1]:
import re
import torch
from transformers import BertTokenizer
from transformers import BertModel
from torch.optim import Adam
from tqdm import tqdm
import nltk
import numpy as np
import torch.nn as nn

In [2]:
def read_data(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        line = f.readline()
        if not line:
            return None
        sentence = []
        while line and (line != "\n"):
            line = line.strip()
            sentence.append(line)
            line = f.readline()
    return sentence

In [3]:
def preprocess_text_data(train_data, train_labels, test_data, test_labels):

    # ~ Encode labels
    labels = np.unique(np.array(train_labels)).tolist() + ['UNK']
    labels_mapping = dict(zip(labels, np.arange(len(labels))))

    # Normalize
    normalized_train_data = normalize_data(train_data)

    # Get labels
    train_labels = [labels_mapping[label] for label in train_labels]

    # Get train data
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    train_data = [tokenizer(text,
                            padding='max_length',
                            max_length=512,
                            truncation=True,
                            return_tensors='pt')
                  for text in normalized_train_data]


    # TEST Related
    normalized_test_data = normalize_data(test_data)
    test_labels = ['UNK' if label not in labels else label for label in test_labels]
    test_labels = [labels_mapping[label] for label in test_labels]
    test_data = [tokenizer(text,
                           padding='max_length',
                           max_length=512,
                           truncation=True,
                           return_tensors='pt')
                 for text in normalized_test_data]

    return train_data, train_labels, test_data, test_labels


In [4]:
def normalize_data(list_of_texts):
    list_of_normalized_texts = []

    # Normalize each of the tweets
    for text in list_of_texts:
        # Lower the text
        text_lower = text.lower()

        # Remove punctuation
        text_no_punctuation = re.sub(r'[^\w\s]', '', text_lower)

        # Split to tokens
        splitted_text = text_no_punctuation.split(' ')

        # Replace numbers with <number> token
        text_number_token = ['<number>' if bool(re.search(r'\d', x)) else x for x in splitted_text]

        # Remove stop words
        text_no_stop_words = [x for x in text_number_token if x not in nltk.corpus.stopwords.words('english')]

        # Join to string
        joined_text = ' '.join(text_no_stop_words)

        # Append to the list
        list_of_normalized_texts.append(joined_text)

    return list_of_normalized_texts


In [5]:
def train(model, train_data, train_labels, test_data, test_labels, learning_rate, epochs):

    train = Dataset(train_data, train_labels)
    test = Dataset(test_data, test_labels)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=16, shuffle=True)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=16, shuffle=True)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

        model = model.cuda()
        criterion = criterion.cuda()

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()

            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():
            for test_input, test_label in test_dataloader:

                test_label = test_label.to(device)
                mask = test_input['attention_mask'].to(device)
                input_id = test_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, test_label)
                total_loss_val += batch_loss.item()

                acc = (output.argmax(dim=1) == test_label).sum().item()
                total_acc_val += acc

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(test_data): .3f} \
                | Val Accuracy: {total_acc_val / len(test_data): .3f}')


In [6]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self, data, labels):

        self.labels = torch.LongTensor(labels)
        self.texts = data

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [7]:
class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 22)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [None]:
# Load data and labels
train_data = read_data(r'./atis_data/train/seq.in')
train_labels = read_data(r'./atis_data/train/label')
test_data = read_data(r'./atis_data/test/seq.in')
test_labels = read_data(r'./atis_data/test/label')

train_data, train_labels, test_data, test_labels = preprocess_text_data(train_data, train_labels, test_data, test_labels)

EPOCHS = 10
model = BertClassifier()
LR = 1e-4

train(model, train_data, train_labels, test_data, test_labels, LR, EPOCHS)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 280/280 [06:15<00:00,  1.34s/it]


Epochs: 1 | Train Loss:  0.039                 | Train Accuracy:  0.864                 | Val Loss:  0.038                 | Val Accuracy:  0.889


100%|██████████| 280/280 [06:34<00:00,  1.41s/it]


Epochs: 2 | Train Loss:  0.017                 | Train Accuracy:  0.943                 | Val Loss:  0.031                 | Val Accuracy:  0.887


 22%|██▏       | 61/280 [01:25<05:08,  1.41s/it]