# Set Seed and CUDA

In [1]:
import torch
import torchtext
import gensim.downloader
import numpy as np
import torch
import torchtext
import tensorflow as tf
from sklearn.svm import SVC

SEED = 1234
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
WORD2VEC_VECTORS = gensim.downloader.load("word2vec-google-news-300")
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

print("PyTorch Version: ", torch.__version__)
print("torchtext Version: ", torchtext.__version__)
print(f"Using {'GPU' if str(DEVICE) == 'cuda' else 'CPU'}.")

  from .autonotebook import tqdm as notebook_tqdm


PyTorch Version:  1.11.0+cu113
torchtext Version:  0.12.0
Using GPU.


# Dataset load and prep

In [2]:
from datasets import load_dataset

dataset = load_dataset("surrey-nlp/PLOD-CW")

training_set = dataset["train"]
print(len(training_set))
validation_set = dataset["validation"]
print(len(validation_set))
testing_set = dataset["test"]
print(len(testing_set))

1072
126
153


# Labels prep

In [3]:
label_list = ["B-O", "B-AC", "B-LF", "I-LF"]
labels_vocab = {
    "B-O": 0,
    "B-AC": 1,
    "B-LF": 2,
    "I-LF": 3,
}

# Word2Vec

In [4]:
def text_to_embedding(text):
    vectors = []
    for word in text:
        try:
            vector = WORD2VEC_VECTORS[word]
            vectors.append(vector)
        except:
            vectors.append(np.zeros(300,))
    return vectors


X_raw = training_set["tokens"]
y_raw = training_set["ner_tags"]

X = [word for sublist in X_raw for word in sublist]
y = [label for sublist in y_raw for label in sublist]

X_embeddings = text_to_embedding(X)

# SVM with Word2Vec

## Training

In [5]:
from sklearn.svm import SVC

print("Converting labels")
integer_labels_2d = np.array([labels_vocab[label] for label in y])

print("Fitting SVM model")
print(np.shape(X_embeddings), np.shape(y))
clf = SVC(kernel='linear', decision_function_shape="ovr")
clf.fit(X_embeddings, y)

Converting labels
Fitting SVM model
(40000, 300) (40000,)


## Testing

In [6]:
from sklearn import metrics
X_test = [word for sublist in testing_set["tokens"] for word in sublist]
X_embeddings_test = text_to_embedding(X_test)
y_pred = clf.predict(X_embeddings_test)

y = [label for sublist in testing_set["ner_tags"] for label in sublist]
accuracy = metrics.accuracy_score(y, y_pred)
precision = metrics.precision_score(y, y_pred, average='micro')
recall = metrics.recall_score(y, y_pred, average='micro')
f1 = metrics.f1_score(y, y_pred, average='micro')
print(f"Accuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nf1 score: {f1}")

Accuracy: 0.8792
Precision: 0.8792
Recall: 0.8792
f1 score: 0.8792


# RNN

In [7]:
X_tensors = []
for X in X_embeddings:
    X_tensors.append(torch.Tensor(X))

print(np.shape(X_embeddings))
print(np.shape(X_tensors))

X_tensor = torch.stack(X_tensors, dim=0)
print(np.shape(X_tensor))

  X_tensors.append(torch.Tensor(X))


(40000, 300)
(40000,)
torch.Size([40000, 300])


  result = asarray(a).shape
  result = asarray(a).shape


In [8]:
labels_tensors = [torch.as_tensor([label]).to(DEVICE) for label in integer_labels_2d]

integer_labels_2d_test = [labels_vocab[label] for sublist in testing_set["ner_tags"] for label in sublist]

labels_tensors_test = [torch.as_tensor([label]).to(DEVICE) for label in integer_labels_2d_test]

In [18]:

# Should be 1D int64 CPU tensor
lengths = tf.reshape(tf.convert_to_tensor(np.array([len(label) for label in X_embeddings], dtype=np.int64)), [-1])
lengths_test = torch.Tensor([len(label) for label in X_embeddings_test]).cpu()


In [19]:
from torch.utils.data import DataLoader

training_set_loader = zip(X_embeddings, labels_tensors, lengths)

testing_set_loader = zip(X_embeddings_test, labels_tensors_test, lengths_test)


In [20]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):        
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, texts, lengths):
        embedded = self.embedding(texts)                          # VV note that lengths need to be on the CPU
        embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)

        output, hidden = self.rnn(embedded)

        return self.fc(hidden.squeeze(0))

INPUT_DIM = len(X_tensors)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

print(f"{sum(p.numel() for p in model.parameters() if p.requires_grad)} parameters")

4091905 parameters


## Training

In [22]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)

criterion = nn.BCEWithLogitsLoss()

model = model.to(DEVICE)
criterion = criterion.to(DEVICE)

def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

from tqdm import tqdm

def train(model, iterator, optimizer, criterion):    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in tqdm(iterator, desc="\tTraining"):
        optimizer.zero_grad()
                
        labels, texts, lengths = batch  # Note that this has to match the order in collate_batch
        predictions = model(texts, lengths).squeeze(1)
        loss = criterion(predictions, labels)
        acc = binary_accuracy(predictions, labels)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

from tqdm import tqdm

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in tqdm(iterator, desc="\tEvaluation"):
            labels, texts, lengths = batch  # Note that this has to match the order in collate_batch
            predictions = model(texts, lengths).squeeze(1)
            loss = criterion(predictions, labels)
            acc = binary_accuracy(predictions, labels)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

N_EPOCHS = 5

best_valid_loss = float('inf')
print(f"Using {'GPU' if str(DEVICE) == 'cuda' else 'CPU'} for training.")

for epoch in range(N_EPOCHS):
    print(f'Epoch: {epoch+1:02}')
    start_time = time.time()
    
    train_loss, train_acc = train(model, training_set_loader, optimizer, criterion)
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    
    valid_loss, valid_acc = evaluate(model, testing_set_loader, criterion)
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')

Using GPU for training.
Epoch: 01


	Training: 0it [00:00, ?it/s]


RuntimeError: 'lengths' argument should be a 1D CPU int64 tensor, but got 0D cpu Long tensor

## Testing

In [None]:
# Evaluate on test data
test_loader = DataLoader(X_tensors, batch_size=32)
model.eval()
correct = 0
total = 0

with torch.no_grad():
  for data in test_loader:
    text, label = data.text, data.label
    prediction = model(text)
    predicted = (torch.round(prediction.squeeze()) > 0.5).float()
    correct += (predicted == label).sum().item()
    total += label.shape[0]

# Calculate accuracy
accuracy = correct / total

print(accuracy)
