#### Import Packages

In [30]:
import transformers
import numpy as np
import torch
from torch import nn
from transformers import BertModel
from torch.optim import Adam
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.layers import Dropout


#### Load Data Labels

In [31]:
##label dictionary
f = open('atis/intent_label.txt', 'r', encoding="utf8") # opening a file
labels = f.readlines()
labels = [str(i)[:-1] for i in labels]
num_labels = len(labels)
labels = dict(zip(labels, range(0, 22)))

# load train text
f = open('atis/train/seq.in')
content = f.readlines()
train_text = [str(x)[:-1] for x in content]
f = open('atis/train/label')
content = f.readlines()
train_labels = [str(x)[:-1] for x in content]
train_labels = [labels.get(label, 0) for label in train_labels]

# load dev text
f = open('atis/dev/seq.in')
content = f.readlines()
dev_text = [str(x)[:-1] for x in content]
f = open('atis/dev/label')
content = f.readlines()
dev_labels = [str(x)[:-1] for x in content]
dev_labels = [labels.get(label, 0) for label in dev_labels]

# load test text
f = open('atis/test/seq.in')
content = f.readlines()
test_text = [str(x)[:-1] for x in content]
f = open('atis/test/label')
content = f.readlines()
test_labels = [str(x)[:-1] for x in content]
test_labels = [labels.get(label, 0) for label in test_labels]



### 1-A: Intent Classification with LSTM

#### Pre-Processing

In [32]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
num_words = 1000
oov_token = '<UNK>'
pad_type = 'post'
trunc_type = 'post'

tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
tokenizer.fit_on_texts(train_text)

word_index = tokenizer.word_index
train_sequences = tokenizer.texts_to_sequences(train_text)

maxlen = max([len(x) for x in train_sequences])
train_padded = pad_sequences(train_sequences, padding=pad_type, truncating=trunc_type, maxlen=maxlen)
train_padded = np.array(train_padded)

val_sequence = tokenizer.texts_to_sequences(dev_text)
val_padded = pad_sequences(val_sequence, padding=pad_type, truncating=trunc_type, maxlen=maxlen)
val_padded = np.array(val_padded)

test_sequence = tokenizer.texts_to_sequences(test_text)
test_padded = pad_sequences(test_sequence, padding=pad_type, truncating=trunc_type, maxlen=maxlen)
test_padded = np.array(test_padded)

train_labels = np.array(train_labels)
dev_labels = np.array(dev_labels)
test_labels = np.array(test_labels)

In [33]:
max_sentence_length = maxlen
embedding_vector_length = len(train_padded)
model = Sequential()
model.add(Embedding(input_length=max_sentence_length, output_dim=num_labels, input_dim=embedding_vector_length))
model.add(Dropout(0.2))
model.add(LSTM(num_labels))
model.add(Dropout(0.2))
model.add(Dense(1, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

num_epochs = 3
batch_size = 64
model.fit(train_padded, train_labels.reshape(-1,1), validation_data=(val_padded, dev_labels.reshape(-1,1)), epochs=num_epochs, batch_size=batch_size, verbose=1)

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 46, 22)            98516     
                                                                 
 dropout_8 (Dropout)         (None, 46, 22)            0         
                                                                 
 lstm_4 (LSTM)               (None, 22)                3960      
                                                                 
 dropout_9 (Dropout)         (None, 22)                0         
                                                                 
 dense_4 (Dense)             (None, 1)                 23        
                                                                 
Total params: 102,499
Trainable params: 102,499
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1dbae955070>

#### Evaluate LSTM

In [34]:
scores = model.evaluate(test_padded, test_labels.reshape(-1, 1), verbose=1)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 3.70%


### 1-B: Inent Classification with BERT

#### Functions and Classes

In [36]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

class Dataset(torch.utils.data.Dataset):

    def __init__(self, text, text_labels):

        self.labels = text_labels
        self.texts = [tokenizer(t, padding='max_length',max_length=maxlen,  truncation=True, return_tensors="pt") for t in text]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        # return torch.Tensor(self.labels[idx])
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.bert = self.bert
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, num_labels)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer


def train(model, train_data, train_labels, val_data, val_labels, learning_rate, epochs):
    train, val = Dataset(train_data, train_labels), Dataset(val_data, val_labels)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    optimizer = Adam(model.parameters(), lr=learning_rate)

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            train_labelTensor = train_label.type(torch.LongTensor).to(device)
            batch_loss = criterion(output, train_labelTensor)
            total_loss_train += batch_loss.item()

            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():

            for val_input, val_label in val_dataloader:
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                val_labelTensor = val_label.type(torch.LongTensor).to(device)
                batch_loss = criterion(output, val_labelTensor)
                total_loss_val += batch_loss.item()

                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')


def evaluate(model, test_data, test_labels):
    test = Dataset(test_data, test_labels)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)


            output = model(input_id, mask)

            acc = (output.argmax(dim=1) == test_label).sum().item()
            total_acc_test += acc

    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    print(f'Model Predictions: {output.argmax(dim=1)}')


#### Train BERT

In [37]:

EPOCHS = 5
model = BertClassifier()
LR = 1e-6
train(model, train_text, train_labels, dev_text, dev_labels, LR, EPOCHS)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: CUDA out of memory. Tried to allocate 90.00 MiB (GPU 0; 11.00 GiB total capacity; 418.80 MiB already allocated; 0 bytes free; 472.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

#### Evaluate BERT

In [None]:
def evaluate(model, test_data, test_labels):
    test = Dataset(test_data, test_labels)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)
    device = "cpu"
    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            acc = (output.argmax(dim=1) == test_label).sum().item()
            total_acc_test += acc

    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    print(f'Model Predictions: {output.argmax(dim=1)}')
evaluate(model, test_text, test_labels)
