# Parsing the Dataset

### Read Dataset from Hugging Face

In [6]:
from datasets import load_dataset # Import dataset import function for hugging face
dataset = load_dataset("surrey-nlp/PLOD-CW") # import the coursework dataset from

#### Dataset Manipulation Instructions

**Dataset** splits into 3 distinct parts:
- train
- validation
- test

To access each call `dataset[set_name]`

For instance `dataset["train"]`

Each set has 3 **features**:
- tokens (words that form a sentance, split into a list)
- pos_tags (part of speech tags)
- ner_tags (name entity recognition tags)

To access each features, call `dataset[set_name][feature_name]`

For instance `dataset["train"]["tokens"]`

First element `dataset["train"]["tokens"][0]`

### Import Libraries

In [7]:
import nltk
import sklearn
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn_crfsuite as crfsuite
from sklearn_crfsuite import metrics

### Parse data into objects

In [8]:
train_tokens = dataset["train"]["tokens"]
train_pos_tags = dataset["train"]["pos_tags"]
train_ner_tags = dataset["train"]["ner_tags"]

validation_tokens = dataset["validation"]["tokens"]
validation_pos_tags = dataset["validation"]["pos_tags"]
validation_ner_tags = dataset["validation"]["ner_tags"]

test_tokens = dataset["test"]["tokens"]
test_pos_tags = dataset["test"]["pos_tags"]
test_ner_tags = dataset["test"]["ner_tags"]

class DataItem:
    def __init__(self, tokens, pos, ner):
        self.tokens:list[str] = tokens
        self.pos:list[str] = pos
        self.ner:list[str] = ner

    def get_as_tuple(self) -> tuple:
        return (self.tokens, self.pos, self.ner)
    
    def get_as_tuple_list(self) -> list[tuple]:
        tuple_list = []
        for idx in range(len(self.tokens)-1):
            tuple_list.append((self.tokens[idx], self.pos[idx], self.ner[idx]))
        return tuple_list
    
train_data:list[DataItem] = []
for idx in range(len(train_tokens)):
    train_data.append(DataItem(train_tokens[idx], train_pos_tags[idx], train_ner_tags[idx]))

validation_data:list[DataItem] = []
for idx in range(len(validation_tokens)):
    train_data.append(DataItem(validation_tokens[idx], validation_pos_tags[idx], validation_ner_tags[idx]))

test_data:list[DataItem] = []
for idx in range(len(test_tokens)):
    train_data.append(DataItem(test_tokens[idx], test_pos_tags[idx], test_ner_tags[idx]))

### Create tuples of token, pos and ner

In [9]:
def create_data_sents(data:list[DataItem]) -> list[tuple]:
    sent_list:list[tuple] = []
    for data_obj in data:
        sent_list.append(data_obj.get_as_tuple_list())
    return sent_list

train_sents:list[tuple] = create_data_sents(train_data)
validation_sents:list[tuple] = create_data_sents(validation_data)
test_sents:list[tuple] = create_data_sents(test_data)


## Training

In [10]:
def read_clusters(cluster_file):
    word2cluster = {}
    with open(cluster_file) as i:
        for line in i:
            word, cluster = line.strip().split('\t')
            word2cluster[word] = cluster
    return word2cluster


def word2features(sent, i, word2cluster):
    word = sent[i][0]
    postag = sent[i][1]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'word.cluster=%s' % word2cluster[word.lower()] if word.lower() in word2cluster else "0",
        'postag=' + postag
    ]
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:postag=' + postag1
        ])
    else:
        features.append('BOS')

    if i > 1: 
        word2 = sent[i-2][0]
        postag2 = sent[i-2][1]
        features.extend([
            '-2:word.lower=' + word2.lower(),
            '-2:word.istitle=%s' % word2.istitle(),
            '-2:word.isupper=%s' % word2.isupper(),
            '-2:postag=' + postag2
        ])        

        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:postag=' + postag1
        ])
    else:
        features.append('EOS')

    if i < len(sent)-2:
        word2 = sent[i+2][0]
        postag2 = sent[i+2][1]
        features.extend([
            '+2:word.lower=' + word2.lower(),
            '+2:word.istitle=%s' % word2.istitle(),
            '+2:word.isupper=%s' % word2.isupper(),
            '+2:postag=' + postag2
        ])

        
    return features


def sent2features(sent, word2cluster):
    return [word2features(sent, i, word2cluster) for i in range(len(sent))]

def sent2labels(sent):
    return [label for _, _, label in sent]

def sent2tokens(sent):
    return [token for token, _, _ in sent]

# word cluster is a set of embeddings where words are clustered togeter by word to embedding
# to similar embedding. In the example, the word row is unique, but the embedding row is not.
# In the example, this is trained on dutch wikipedia. However, we would need an english corpus
# for this to work, as the words we are training on are english.
# For now, the cluster will always be set to "0"
word2cluster = [] # Do not have word cluster right now, something to be done in the future

sent2features(train_sents[0], word2cluster)[0]

['bias',
 'word.lower=for',
 'word[-3:]=For',
 'word[-2:]=or',
 'word.isupper=False',
 'word.istitle=True',
 'word.isdigit=False',
 '0',
 'postag=ADP',
 'BOS',
 '+1:word.lower=this',
 '+1:word.istitle=False',
 '+1:word.isupper=False',
 '+1:postag=DET',
 '+2:word.lower=purpose',
 '+2:word.istitle=False',
 '+2:word.isupper=False',
 '+2:postag=NOUN']

In [11]:
x_train = [sent2features(s, word2cluster) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

x_validation = [sent2features(s, word2cluster) for s in validation_sents]
y_validation = [sent2labels(s) for s in validation_sents]

x_test = [sent2features(s, word2cluster) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

crf = crfsuite.CRF(
    verbose='true',
    algorithm='lbfgs',
    max_iterations=100
)

crf.fit(x_train, y_train, X_dev=x_validation, y_dev=y_validation)

loading training data to CRFsuite: 100%|██████████| 1351/1351 [00:00<00:00, 2837.07it/s]





loading dev data to CRFsuite: 0it [00:00, ?it/s]



Holdout group: 2

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 64887
Seconds required: 0.126

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20



ZeroDivisionError: Fraction(0, 0)

## Create Embeddings

In [46]:
def sent2tokens(sent):
    return [token for token, _, _ in sent]

tokens = []
for sent in train_sents:
    temp_s = []
    for s in sent:
        temp_s.append(s[0])
    tokens.append(temp_s)
print(tokens)

# tokens = [y[0] for y in [x[0] for x in train_sents]]

import gensim
from gensim.test.utils import common_texts
# print(common_texts)
model = gensim.models.Word2Vec(tokens, min_count=1, vector_size=100, window=5)

for index, word in enumerate(model.wv.index_to_key):
    if index == 10:
        break
    print(f"word #{index}/{len(model.wv.index_to_key)} is {word}")

[['For', 'this', 'purpose', 'the', 'Gothenburg', 'Young', 'Persons', 'Empowerment', 'Scale', '(', 'GYPES', ')', 'was', 'developed'], ['The', 'following', 'physiological', 'traits', 'were', 'measured', ':', 'stomatal', 'conductance', '(', 'gs', ',', 'mol', 'H2O', 'm-2', 's-1', ')', ',', 'transpiration', 'rate', '(', 'E', ',', 'mmol', 'H2O', 'm-2', 's-1', ')', ',', 'net', 'photosynthetic', 'rate', '(', 'PN', ',', 'μmol', 'm-2', 's-1', ')', 'and', 'intercellular', 'CO2', 'concentration', 'CO2', '(', 'Ci', ',', 'μmol', 'm-2', 's-1', ')'], ['Minor', 'H', 'antigen', 'alloimmune', 'responses', 'readily', 'occur', 'in', 'the', 'setting', 'of', 'human', 'leukocyte', 'antigen', '(', 'HLA)–matched', 'allogeneic', 'solid', 'organ', 'and', 'stem', 'cell', 'transplantation', '(', 'SCT', ')', '[', '3,4', ']'], ['EPI', '=', 'Echo', 'planar', 'imaging'], ['Furthermore', ',', 'eNOS', '-', 'derived', 'NO', 'S', '-', 'nitrosylated', 'β', '-', 'actin', 'on', 'Cys374', 'and', 'impaired', 'actin', 'binding',