# Set Seed and CUDA

In [60]:
import torch
import torchtext
import gensim.downloader
import numpy as np
import torch
import torchtext
import tensorflow as tf
import nltk
import matplotlib.pyplot as plt
import os
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter
from collections import OrderedDict



SEED = 1234
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
WORD2VEC_VECTORS = gensim.downloader.load("word2vec-google-news-300")
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stopwords = set(stopwords.words('english'))

print("PyTorch Version: ", torch.__version__)
print("torchtext Version: ", torchtext.__version__)
print(f"Using {'GPU' if str(DEVICE) == 'cuda' else 'CPU'}.")

# Dataset load and prep

In [2]:
from datasets import load_dataset

dataset = load_dataset("surrey-nlp/PLOD-CW")

training_set = dataset["train"]
print(len(training_set))
validation_set = dataset["validation"]
print(len(validation_set))
testing_set = dataset["test"]
print(len(testing_set))

X_raw = training_set["tokens"]
y_raw = training_set["ner_tags"]

X = [word for sublist in X_raw for word in sublist]
y = [label for sublist in y_raw for label in sublist]

X_validation = [word for sublist in validation_set["tokens"] for word in sublist]
y_validation  = [label for sublist in validation_set["ner_tags"] for label in sublist]

y_test_tags = [label for sublist in testing_set["ner_tags"] for label in sublist]


  from .autonotebook import tqdm as notebook_tqdm


1072
126
153


# Dataset Analysis

In [None]:
def analyse_dataset(dataset, folder):
    # Count the number of instances of each tag
    ner_tags = [tag for record in dataset["ner_tags"] for tag in record]
    ner_tag_freq = Counter(ner_tags)

    # Create pie chart for ner tag frequency
    plt.figure(figsize=(4, 3))
    plt.pie(ner_tag_freq.values(), labels=ner_tag_freq.keys())
    plt.title("NER Distribution")
    plt.savefig(f"{folder}/ner-tag-distribution.png")
    plt.close()

    
    # Analysis of which pos tag each ner tag belongs to
    pos_tags = [tag for record in dataset["pos_tags"] for tag in record]
    tags_combined = zip(ner_tags, pos_tags)
    tags_freq = Counter(tags_combined)

    tags_freq_BAC = {}
    for item, counter in tags_freq.items():
        if item[0] == "B-AC":
            tags_freq_BAC[item] = counter
    
    plt.figure(figsize=(5, 4))
    plt.pie(tags_freq_BAC.values(), labels=tags_freq_BAC.keys())
    plt.title("POS Tag Distribution for B-AC NER")
    plt.savefig(f"{folder}/B-AC-POS.png")
    plt.close()

    tags_freq_BOC = {}
    for item, counter in tags_freq.items():
        if item[0] == "B-O":
            tags_freq_BOC[item[1]] = counter

    tags_freq_BOC_filtered = OrderedDict(sorted(tags_freq_BOC.items(), key=lambda x: x[1], reverse=True))
    plt.figure(figsize=(8,4))
    plt.bar(tags_freq_BOC_filtered.keys(), tags_freq_BOC_filtered.values())
    plt.xticks(rotation=65)
    plt.title("Non-Abbreviation POS distribution")
    plt.savefig(f"{folder}/non-abbrv.png")
    plt.close()
    
    
    

    
analyse_dataset(training_set, "training-plots")

# Labels prep

In [4]:
label_list = ["B-O", "B-AC", "B-LF", "I-LF"]
labels_vocab = {
    "B-O": 0,
    "B-AC": 1,
    "B-LF": 2,
    "I-LF": 3,
}
print("Converting labels")
integer_labels_2d = np.array([labels_vocab[label] for label in y])
y_validation_integers = np.array([labels_vocab[label] for label in y_validation])
y_test_integers = np.array([labels_vocab[label] for label in y_test_tags])

Converting labels


# Experiement 1: Preprocessing

In [11]:
def preprocess(list, labels):
    print(np.shape(list))
    print(np.shape(labels))
    X_processed = []
    labels_processed = []

    ps = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    for i, word in enumerate(list):
        if (word in stopwords):
            continue

        lemmat = lemmatizer.lemmatize(word)
        
        X_processed.append(ps.stem(word.lower()))
        # X_processed.append(lemmat.lower())
        labels_processed.append(labels[i])

    return X_processed, labels_processed

X_processed, integer_labels_2d_processed  = preprocess(X, integer_labels_2d)
X_validation_processed, y_validation_integer_2d_processed = preprocess(X_validation, y_validation_integers)

print(X_processed)

(40000,)
(40000,)
(5000,)
(5000,)
['for', 'purpos', 'gothenburg', 'young', 'person', 'empower', 'scale', '(', 'gype', ')', 'develop', '.', 'the', 'follow', 'physiolog', 'trait', 'measur', ':', 'stomat', 'conduct', '(', 'gs', ',', 'mol', 'h2o', 'm-2', 's-1', ')', ',', 'transpir', 'rate', '(', 'e', ',', 'mmol', 'h2o', 'm-2', 's-1', ')', ',', 'net', 'photosynthet', 'rate', '(', 'pn', ',', 'μmol', 'm-2', 's-1', ')', 'intercellular', 'co2', 'concentr', 'co2', '(', 'ci', ',', 'μmol', 'm-2', 's-1', ')', '.', 'minor', 'h', 'antigen', 'alloimmun', 'respons', 'readili', 'occur', 'set', 'human', 'leukocyt', 'antigen', '(', 'hla)–match', 'allogen', 'solid', 'organ', 'stem', 'cell', 'transplant', '(', 'sct', ')', '[', '3,4', ']', '.', 'epi', '=', 'echo', 'planar', 'imag', '.', 'furthermor', ',', 'eno', '-', 'deriv', 'no', 's', '-', 'nitrosyl', 'β', '-', 'actin', 'cys374', 'impair', 'actin', 'bind', 'profilin-1', '(', 'pfn1', ')', ',', 'confirm', 'transnitrosyl', 'agent', 's', '-', 'nitroso', '-', '

# Word2Vec

In [12]:
def text_to_embedding(text):
    vectors = []
    for word in text:
        try:
            vector = WORD2VEC_VECTORS[word]
            vectors.append(vector)
        except:
            vectors.append(np.zeros(300,))
    return vectors

X_embeddings = text_to_embedding(X)
X_embeddings_preprocessed = text_to_embedding(X_processed)

X_embeddings_validation = text_to_embedding(X_validation)

X_test = [word for sublist in testing_set["tokens"] for word in sublist]
X_embeddings_test = text_to_embedding(X_test)

# tf-idf

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X)
X_tfidf_validation = tfidf.transform(X_validation)
X_tfidf_test = tfidf.transform(X_test)

# Training and Evaluation

## tf-idf

In [8]:
print("Fitting SVM model")
svm_tf_idf = SVC(kernel="linear", decision_function_shape="ovr")
svm_tf_idf.fit(X_tfidf, integer_labels_2d)

Fitting SVM model


In [9]:

y_pred_tf_idf = svm_tf_idf.predict(X_tfidf_test)

f1 = metrics.f1_score(y_test_integers, y_pred_tf_idf, average='micro')
print(f"f1 score: {f1}")

f1 score: 0.8632


## Word2Vec

In [10]:
# Without preprocessing
print("Fitting SVM model")
print(np.shape(X_embeddings), np.shape(integer_labels_2d))
svm_word2vec = SVC(kernel='linear', decision_function_shape="ovr")
svm_word2vec.fit(X_embeddings, integer_labels_2d)

svm_word2vec_predictions = svm_word2vec.predict(X_embeddings_validation)
svm_word2vec_f1 = metrics.f1_score(y_test_integers, svm_word2vec_predictions, average="micro")
print(svm_word2vec_f1)

Fitting SVM model
(40000, 300) (40000,)
0.8322


In [13]:
# With preprocessing
print("Converting labels")
integer_labels_2d = np.array([labels_vocab[label] for label in y])

print("Fitting SVM model")
print(np.shape(X_embeddings_preprocessed), np.shape(integer_labels_2d_processed))
svm_word2vec = SVC(kernel='linear', decision_function_shape="ovr")
svm_word2vec.fit(X_embeddings_preprocessed, integer_labels_2d_processed)

Converting labels
Fitting SVM model
(32109, 300) (32109,)


In [14]:
svm_word2vec_predictions = svm_word2vec.predict(X_embeddings_validation)
svm_word2vec_f1 = metrics.f1_score(y_test_integers, svm_word2vec_predictions, average="micro")
print(svm_word2vec_f1)

0.8518000000000001


In [20]:
svm_squared_hinge = make_pipeline(StandardScaler(), SGDClassifier(loss="log_loss"))
svm_squared_hinge.fit(X_embeddings, integer_labels_2d)

In [21]:
svm_perceptron = make_pipeline(StandardScaler(), SGDClassifier(loss="perceptron"))
svm_perceptron.fit(X_embeddings, integer_labels_2d)

In [22]:
svm_modified_huber = make_pipeline(StandardScaler(), SGDClassifier(loss="modified_huber"))
svm_modified_huber.fit(X_embeddings, integer_labels_2d)

In [23]:

y_pred = svm_word2vec.predict(X_embeddings_test)
f1 = metrics.f1_score(y_validation_integers, y_pred, average='micro')
print(f"\nHinge loss (default) f1 score: {f1}")

svm_squared_hinge_predictions = svm_squared_hinge.predict(X_embeddings_validation)
svm_squared_hinge_f1 = metrics.f1_score(y_validation_integers, svm_squared_hinge_predictions, average="micro")
print(f"Logistic (cross-entropy) loss f1: {svm_squared_hinge_f1}")

svm_perceptron_predictions = svm_perceptron.predict(X_embeddings_validation)
svm_perceptron_f1 = metrics.f1_score(y_validation_integers, svm_perceptron_predictions, average="micro")
print(f"Perceptron loss f1: {svm_perceptron_f1}")

svm_modified_huber_predictions = svm_modified_huber.predict(X_embeddings_validation)
svm_modified_huber_f1 = metrics.f1_score(y_validation_integers, svm_modified_huber_predictions, average="micro")
print(f"Modified Huber loss f1: {svm_modified_huber_f1}")



Hinge loss (default) f1 score: 0.8234
Logistic (cross-entropy) loss f1: 0.8688
Perceptron loss f1: 0.8276
Modified Huber loss f1: 0.8554


In [14]:
import optuna
def objective(trial):
    alpha = trial.suggest_float('alpha', 1e-5, 1e5, log=True)
    eta0 = trial.suggest_float('eta0', 1e-4, 1e4, log=True)
    
    svm_model = make_pipeline(StandardScaler(), SGDClassifier(loss="log_loss", alpha=alpha, learning_rate="adaptive", eta0=eta0))

    svm_model.fit(X_embeddings, integer_labels_2d)

    optim_pred = svm_model.predict(X_embeddings_validation)
    optim_f1 = metrics.f1_score(y_test_integers, optim_pred, average="micro")

    return optim_f1

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

best_params = study.best_params

svm_optimized = make_pipeline(StandardScaler(), SGDClassifier(loss="log_loss", alpha=best_params["alpha"], learning_rate="adaptive", eta0=best_params["eta0"]))
svm_optimized.fit(X_embeddings, integer_labels_2d)

svm_optimized_predictions = svm_optimized.predict(X_embeddings_test)
svm_optimized_f1 = metrics.f1_score(y_test_integers, svm_optimized_predictions, average="micro")
print(f"SVM optimized f1: {svm_optimized_f1}")

[I 2024-04-11 09:38:29,709] A new study created in memory with name: no-name-39a9ed28-d1b8-4350-bb8d-46c55b5e2cc3
[I 2024-04-11 09:38:43,691] Trial 0 finished with value: 0.8584 and parameters: {'alpha': 88693.42799268964, 'eta0': 3.011406985590487}. Best is trial 0 with value: 0.8584.
[I 2024-04-11 09:38:50,858] Trial 1 finished with value: 0.8119999999999999 and parameters: {'alpha': 2.8278800639035372e-05, 'eta0': 0.0032688048009385487}. Best is trial 0 with value: 0.8584.
[I 2024-04-11 09:39:00,658] Trial 2 finished with value: 0.8119999999999999 and parameters: {'alpha': 0.0006411173512018305, 'eta0': 0.04729088056436915}. Best is trial 0 with value: 0.8584.
[I 2024-04-11 09:39:24,049] Trial 3 finished with value: 0.8104 and parameters: {'alpha': 1.5106935753491572e-05, 'eta0': 61.62153944633049}. Best is trial 0 with value: 0.8584.
[I 2024-04-11 09:39:31,551] Trial 4 finished with value: 0.8115999999999999 and parameters: {'alpha': 4.0352655562422593e-05, 'eta0': 0.01435734520101

SVM optimized f1: 0.8584


In [15]:
print(f"l2 penalty, alpha=0.0001, max_iter=1000, learning_rate=\"optimal\" f1: {svm_squared_hinge_f1}")


svm_l1 = make_pipeline(StandardScaler(), SGDClassifier(loss="log_loss", penalty="l1"))
svm_l1.fit(X_embeddings, y)

svm_l1_predictions = svm_l1.predict(X_embeddings_test)
svm_l1_f1 = metrics.f1_score(svm_l1_predictions, y_pred, average="micro")
print(f"l1 penalty f1: {svm_l1_f1}")


svm_alpha = make_pipeline(StandardScaler(), SGDClassifier(loss="log_loss", alpha=0.0005))
svm_alpha.fit(X_embeddings, y)

svm_alpha_predictions = svm_alpha.predict(X_embeddings_test)
svm_alpha_f1 = metrics.f1_score(svm_alpha_predictions, y_pred, average="micro")
print(f"alpha=0.0005 f1: {svm_alpha_f1}")


svm_learning_rate = make_pipeline(StandardScaler(), SGDClassifier(loss="log_loss", learning_rate="adaptive", eta0=0.01))
svm_learning_rate.fit(X_embeddings, y)

svm_learning_rate_predictions = svm_learning_rate.predict(X_embeddings_test)
svm_learning_rate_f1 = metrics.f1_score(svm_learning_rate_predictions, y_pred, average="micro")
print(f"learning_rate=\"adaptive\" f1: {svm_learning_rate_f1}")


svm_final = make_pipeline(StandardScaler(), SGDClassifier(loss="log_loss", penalty="l1", learning_rate="adaptive", eta0=0.01))
svm_final.fit(X_embeddings, y)

svm_final_predictions = svm_final.predict(X_embeddings_test)
svm_final_f1 = metrics.f1_score(svm_final_predictions, y_pred, average="micro")
print(f"l1 penalty, learning_rate=\"adaptive\": {svm_final_f1}")


l2 penalty, alpha=0.0001, max_iter=1000, learning_rate="optimal" f1: 0.8671999999999999


ValueError: Mix of label input types (string and number)

# RNN

In [None]:
X_tensors = []
for X in X_embeddings:
    X_tensors.append(torch.Tensor(X))

print(np.shape(X_embeddings))
print(np.shape(X_tensors))

X_tensor = torch.stack(X_tensors, dim=0)
print(np.shape(X_tensor))

  X_tensors.append(torch.Tensor(X))


(40000, 300)
(40000,)
torch.Size([40000, 300])


  result = asarray(a).shape
  result = asarray(a).shape


In [None]:
labels_tensors = [torch.as_tensor([label]).to(DEVICE) for label in integer_labels_2d]

integer_labels_2d_test = [labels_vocab[label] for sublist in testing_set["ner_tags"] for label in sublist]

labels_tensors_test = [torch.as_tensor([label]).to(DEVICE) for label in integer_labels_2d_test]

In [None]:
# Should be 1D int64 CPU tensor
lengths = tf.reshape(tf.convert_to_tensor([np.array([len(label) for label in X_embeddings], dtype=np.int64)]), [-1])
lengths_test = tf.reshape(tf.convert_to_tensor(np.array([len(label) for label in X_embeddings_test], dtype=np.int64)), [-1])

print(lengths[0])
print(len(lengths[0]))

tf.Tensor(300, shape=(), dtype=int32)


TypeError: Scalar tensor has no `len()`

In [None]:
from torch.utils.data import DataLoader

training_set_loader = zip(X_embeddings, labels_tensors, lengths)

testing_set_loader = zip(X_embeddings_test, labels_tensors_test, lengths_test)


In [None]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):        
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, texts, lengths):
        embedded = self.embedding(texts)                          # VV note that lengths need to be on the CPU
        embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)

        output, hidden = self.rnn(embedded)

        return self.fc(hidden.squeeze(0))

INPUT_DIM = len(X_tensors)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

print(f"{sum(p.numel() for p in model.parameters() if p.requires_grad)} parameters")

4091905 parameters


## Training

In [None]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)

criterion = nn.BCEWithLogitsLoss()

model = model.to(DEVICE)
criterion = criterion.to(DEVICE)

def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

from tqdm import tqdm

def train(model, iterator, optimizer, criterion):    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in tqdm(iterator, desc="\tTraining"):
        optimizer.zero_grad()
                
        labels, texts, lengths = batch  # Note that this has to match the order in collate_batch
        predictions = model(texts, lengths).squeeze(1)
        loss = criterion(predictions, labels)
        acc = binary_accuracy(predictions, labels)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

from tqdm import tqdm

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in tqdm(iterator, desc="\tEvaluation"):
            labels, texts, lengths = batch  # Note that this has to match the order in collate_batch
            predictions = model(texts, lengths).squeeze(1)
            loss = criterion(predictions, labels)
            acc = binary_accuracy(predictions, labels)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

N_EPOCHS = 5

best_valid_loss = float('inf')
print(f"Using {'GPU' if str(DEVICE) == 'cuda' else 'CPU'} for training.")

for epoch in range(N_EPOCHS):
    print(f'Epoch: {epoch+1:02}')
    start_time = time.time()
    
    train_loss, train_acc = train(model, training_set_loader, optimizer, criterion)
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    
    valid_loss, valid_acc = evaluate(model, testing_set_loader, criterion)
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')

Using GPU for training.
Epoch: 01


	Training: 0it [00:00, ?it/s]


TypeError: Scalar tensor has no `len()`

## Testing

In [None]:
# Evaluate on test data
test_loader = DataLoader(X_tensors, batch_size=32)
model.eval()
correct = 0
total = 0

with torch.no_grad():
  for data in test_loader:
    text, label = data.text, data.label
    prediction = model(text)
    predicted = (torch.round(prediction.squeeze()) > 0.5).float()
    correct += (predicted == label).sum().item()
    total += label.shape[0]

# Calculate accuracy
accuracy = correct / total

print(accuracy)
