# Set Seed and CUDA

In [1]:
import torch
import torchdata
import torchtext
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

SEED = 1234
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

print("PyTorch Version: ", torch.__version__)
print("torchtext Version: ", torchtext.__version__)
print(f"Using {'GPU' if str(DEVICE) == 'cuda' else 'CPU'}.")

  from .autonotebook import tqdm as notebook_tqdm


PyTorch Version:  1.11.0+cu113
torchtext Version:  0.12.0
Using GPU.


# Dataset load and prep

In [2]:
from datasets import load_dataset

dataset = load_dataset("surrey-nlp/PLOD-CW")

training_set = dataset["train"]
print(len(training_set))
validation_set = dataset["validation"]
print(len(validation_set))
testing_set = dataset["test"]
print(len(testing_set))

1072
126
153


# Labels prep

In [3]:
label_list = ["B-O", "B-AC", "B-LF", "I-LF"]
labels_vocab = {
    "B-O": 0,
    "B-AC": 1,
    "B-LF": 2,
    "I-LF": 3,
}

# Bag of Words

In [105]:
from gensim.models import Word2Vec
from sklearn.svm import SVC
import numpy as np


X_raw = training_set["tokens"]
y_raw = training_set["ner_tags"]

X = [word for sublist in X_raw for word in sublist]
y = [label for sublist in y_raw for label in sublist]

# Train the model on your data (X)
print("Loading model")
# model = Word2Vec(sentences=X, vector_size=100, window=5, min_count=1, workers=4)
model = Word2Vec(sentences=X_raw, min_count=1, vector_size=100)
# model.save("word2vec.model")
# model = Word2Vec.load("word2vec.model")
print(model.wv.most_similar("This"))

print("Training model")
model.train(X, total_examples=model.corpus_count, epochs=10)

print("Getting word embeddings")
X_embeddings = np.array([model.wv[word] for word in X])

print("Converting labels")
integer_labels_2d = np.array([labels_vocab[label] for label in y])

print("Fitting SVM model")
print(np.shape(X_embeddings), np.shape(y))
clf = SVC(kernel='linear', decision_function_shape="ovr")
clf.fit(X_embeddings, y)




Loading model
[('which', 0.9983901381492615), ('from', 0.998383641242981), ('to', 0.9983475208282471), ('that', 0.998332142829895), ('by', 0.9983232617378235), ('in', 0.9983189702033997), ('an', 0.9983177781105042), ('and', 0.9983092546463013), ('on', 0.9983035922050476), ('was', 0.9982937574386597)]
Training model
Getting word embeddings
Converting labels
Fitting SVM model
(40000, 100) (40000,)


In [108]:
from sklearn import metrics
X_test = [word for sublist in testing_set["tokens"] for word in sublist]
X_embeddings_test = np.array([model.wv[word] for word in X_test])
y_pred = clf.predict(X_embeddings_test)

y = [label for sublist in testing_set["ner_tags"] for label in sublist]
accuracy = metrics.accuracy_score(y, y_pred)
print(f"Accuracy: {accuracy}")

1


NameError: name 'y_pred' is not defined

In [104]:
from gensim.models import Word2Vec
from sklearn.svm import SVC
import numpy as np


X_raw = [["This", "is", "a", "test"], ["This", "is", "record", "two"], ["Record", "three", "goes", "here"], ["Over", "here", "is", "record", "four"]]
y_raw = [["B-O", "B-AC", "B-AC", "B-O"], ["B-O", "B-O", "B-O", "B-O"], ["B-AC", "B-LF", "I-LF", "B-O"], ["B-O", "B-O", "B-O", "B-AC", "B-O"]]


X = [word for sublist in X_raw for word in sublist]
y = [label for sublist in y_raw for label in sublist]

# Train the model on your data (X)
print("Loading model")
# model = Word2Vec(sentences=X, vector_size=100, window=5, min_count=1, workers=4)
model = Word2Vec(sentences=X_raw, min_count=1, vector_size=100)
# model.save("word2vec.model")
# model = Word2Vec.load("word2vec.model")
print(model.wv.most_similar("This"))

print("Training model")
model.train(X, total_examples=model.corpus_count, epochs=10)

print("Getting word embeddings")
X_embeddings = np.array([model.wv[word] for word in X])

print("Converting labels")
integer_labels_2d = np.array([labels_vocab[label] for label in y])

print("Fitting SVM model")
print(np.shape(X_embeddings), np.shape(y))
clf = SVC(kernel='linear', decision_function_shape="ovr")
clf.fit(X_embeddings, y)

Loading model
[('Over', 0.13149002194404602), ('Record', 0.07497557997703552), ('here', 0.06797593086957932), ('goes', 0.04157782346010208), ('a', 0.04130810499191284), ('test', 0.012979976832866669), ('record', -0.013514931313693523), ('three', -0.013679763302206993), ('four', -0.044617101550102234), ('is', -0.1116705983877182)]
Training model
Getting word embeddings
Converting labels
Fitting SVM model
(17, 100) (17,)
