# Set Seed and CUDA

In [1]:
import torch
import torchtext
import gensim.downloader
import numpy as np
import torch
import torchtext
from sklearn.svm import SVC

import random

# --- Notebook-wide configuration -------------------------------------------
# SEED is shared by every random number generator seeded below.
SEED = 1234
# Use the GPU when torch reports one as available, otherwise the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Pretrained "word2vec-google-news-300" vectors obtained through gensim's
# downloader API. NOTE(review): this is a large artifact; expect the first
# run on a fresh machine to spend a while downloading it.
WORD2VEC_VECTORS = gensim.downloader.load("word2vec-google-news-300")

# Seed every RNG this notebook can reach, not just torch, so that a
# Restart-and-Run-All reproduces the same numbers.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN for deterministic kernels and disable its auto-tuner, which can
# otherwise select different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

print("PyTorch Version: ", torch.__version__)
print("torchtext Version: ", torchtext.__version__)
# Compare the device *type* rather than its string form so that an indexed
# device such as "cuda:0" is still reported as a GPU.
print(f"Using {'GPU' if DEVICE.type == 'cuda' else 'CPU'}.")

  from .autonotebook import tqdm as notebook_tqdm


PyTorch Version:  1.11.0+cu113
torchtext Version:  0.12.0
Using GPU.


# Dataset load and prep

In [2]:
from datasets import load_dataset

# Load the "surrey-nlp/PLOD-CW" dataset through `load_dataset` and bind each
# of its three splits to its own module-level name.
dataset = load_dataset("surrey-nlp/PLOD-CW")

training_set, validation_set, testing_set = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

# Print the number of examples per split, one per line, in
# train / validation / test order.
print(len(training_set), len(validation_set), len(testing_set), sep="\n")

1072
126
153


# Labels prep

In [3]:
# Tag inventory; a tag's position in this list is its integer id.
label_list = ["B-O", "B-AC", "B-LF", "I-LF"]
# Tag -> integer id lookup, built from `label_list` so the two structures
# cannot drift apart.
labels_vocab = {tag: tag_id for tag_id, tag in enumerate(label_list)}

# Word2Vec

In [4]:
def text_to_embedding(text, keyed_vectors=None, embedding_dim=None):
    """Map each token in ``text`` to its embedding vector.

    Tokens that are missing from the embedding table are represented by an
    all-zero vector, so the output always has one entry per input token.

    Args:
        text: Iterable of tokens (one lookup key per element).
        keyed_vectors: Any object supporting ``obj[token]`` that raises
            ``KeyError`` for unknown tokens. Defaults to the notebook-wide
            ``WORD2VEC_VECTORS``.
        embedding_dim: Length of the zero vector used for unknown tokens.
            Defaults to ``keyed_vectors.vector_size`` when that attribute
            exists, otherwise 300.

    Returns:
        A list of vectors, in the same order as ``text``.

    Raises:
        Any exception other than ``KeyError`` raised by the lookup is
        propagated instead of being silently turned into a zero vector.
    """
    if keyed_vectors is None:
        keyed_vectors = WORD2VEC_VECTORS
    if embedding_dim is None:
        embedding_dim = getattr(keyed_vectors, "vector_size", 300)

    vectors = []
    for word in text:
        try:
            vectors.append(keyed_vectors[word])
        except KeyError:
            # Out-of-vocabulary token: fall back to a fresh zero vector so
            # callers never share (and accidentally mutate) one instance.
            vectors.append(np.zeros(embedding_dim))
    return vectors

# SVM with Word2Vec

## Training

In [5]:
from sklearn.svm import SVC

# Train a linear SVM that classifies each token independently from its
# word embedding.
X_raw = training_set["tokens"]
y_raw = training_set["ner_tags"]

# Flatten the per-example token / tag lists into one flat list each.
X = [word for sublist in X_raw for word in sublist]
y = [label for sublist in y_raw for label in sublist]
# Fail fast, before the expensive embedding lookup and fit, if tokens and
# tags do not line up one-to-one.
assert len(X) == len(y), f"token/label count mismatch: {len(X)} vs {len(y)}"

print("Loading vectors")

print("Getting word embeddings")
X_embeddings = text_to_embedding(X)
# Stack the per-token vectors once; the same matrix is reused for the shape
# report and for fitting instead of being rebuilt from the list each time.
X_train_matrix = np.asarray(X_embeddings)
print("Converting labels")
# NOTE(review): despite its name this array is 1-D, and it is not what the
# classifier is fitted on below (the fit uses the raw tags in `y`). It is
# kept because later cells may read it.
integer_labels_2d = np.array([labels_vocab[label] for label in y])

print("Fitting SVM model")
print(X_train_matrix.shape, np.shape(y))
clf = SVC(kernel='linear', decision_function_shape="ovr")
clf.fit(X_train_matrix, y)

Loading vectors
Getting word embeddings
Converting labels
Fitting SVM model
(40000, 300) (40000,)


## Testing

In [6]:
from sklearn import metrics
# Evaluate the fitted SVM on the flattened test split.
X_test = [word for sublist in testing_set["tokens"] for word in sublist]
X_embeddings_test = text_to_embedding(X_test)
y_pred = clf.predict(X_embeddings_test)

# Gold tags for the test split, under an explicit name.
y_test = [label for sublist in testing_set["ner_tags"] for label in sublist]
# The original cell rebinds `y` (previously the training tags) to the test
# tags; that binding is preserved so cells reading `y` afterwards see the
# same value as before.
y = y_test

accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
# A single accuracy figure hides how each tag performs, so also report
# per-tag precision / recall / F1. `labels=label_list` keeps every tag in
# the table even if it is never predicted; `zero_division=0` scores such
# tags as 0 instead of emitting warnings.
print(metrics.classification_report(y_test, y_pred, labels=label_list, zero_division=0))

Accuracy: 0.8792


# RNN

## Training

  from .autonotebook import tqdm as notebook_tqdm
