## Download and prepare dataset

In [1]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 kB[0m [31m1.4

In [2]:
from datasets import load_dataset
dataset = load_dataset("conll2003")

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98...


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Dataset conll2003 downloaded and prepared to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
import pandas as pd

def prepare_dataset(dataset):
  sentence_index = 1

  rows = []
  for sentence in dataset:
    if len(sentence["tokens"]) == len (sentence["ner_tags"]):
      for index,token in enumerate(sentence["tokens"]):
        rows.append([sentence_index, token, sentence["ner_tags"][index]])
      sentence_index += 1
    else:
      print(f'Error in sentence id {sentence["id"]}')
  
  df = pd.DataFrame(rows, columns = ["Sentence #", "Word", "Tag"])
  return df

train_df = prepare_dataset(dataset["train"])
dev_df = prepare_dataset(dataset["validation"])
test_df = prepare_dataset(dataset["test"])

In [4]:
from future.utils import iteritems

# Keras (and most other ML packages) expect all the ids to be numeric, 
# this is an optimisation to save memory. 
# We will create the following dictionaries:
# word2idx: assign a numeric index to each word in the dataset
# idx2word: inverted version of word2idx
# tag2idx: assign a numeric index to each tag in the dataset
# idx2tag: inverted version of tag2idx

# Group training, dev and test data in order to create word-index dicts and to
# convert data to numeric indeces later
data = pd.concat([train_df, dev_df, test_df])

# words <= list of all words in the input dataset
words = list(set(data["Word"].values))
n_unique_words = len(words)

# tags <= list of all tags in the input dataset
tags = list(set(data["Tag"].values))
n_tags = len(tags)

# Dictionaries
word2idx = {w: i + 1 for i, w in enumerate(words)}
word2idx["PAD"] = 0 # Padding

idx2word = {i: w for w, i in iteritems(word2idx)}

# Retrieved from dataset webpage
tag2idx = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}

idx2tag = {v: k for k, v in iteritems(tag2idx)}

# Index number for the word 'business'
print(word2idx['business'])
# Word of index 100
print(idx2word[100])
# Index number for the tag 'B-ORG'
print(tag2idx['B-ORG'])
# Tag of index 2
print(idx2tag[2])

21501
Thuet
3
I-PER


In [5]:
# Explore the input dataset
print("Number of training sentences: ", len(train_df.groupby(['Sentence #'])))
print("Number of dev sentences: ", len(dev_df.groupby(['Sentence #'])))
print("Number of test sentences: ", len(test_df.groupby(['Sentence #'])))

training_words = train_df["Word"].values
n_training_words = len(training_words)
unique_training_words = list(set(training_words))
n_unique_training_words = len(unique_training_words)
print("Number of unique words in the training dataset: ", n_unique_training_words)
print("Number of words in the training dataset: ", n_training_words)
dev_words = dev_df["Word"].values
n_dev_words = len(dev_words)
unique_dev_words = list(set(dev_words))
n_unique_dev_words = len(unique_dev_words)
print("Number of unique words in the dev dataset: ", n_unique_dev_words)
print("Number of words in the dev dataset: ", n_dev_words)
test_words = test_df["Word"].values
n_test_words = len(test_words)
unique_test_words = list(set(test_words))
n_unique_test_words = len(unique_test_words)
print("Number of unique words in the test dataset: ", n_unique_test_words)
print("Number of words in the test dataset: ", n_test_words)

training_tags = list(set(train_df["Tag"].values))
print("Tags in the training dataset:", [idx2tag[i] for i in training_tags])
n_training_tags = len(training_tags)
print("Number of Labels in the training dataset: ", n_training_tags)
dev_tags = list(set(dev_df["Tag"].values))
print("Tags in the dev dataset:", [idx2tag[i] for i in dev_tags])
n_dev_tags = len(dev_tags)
print("Number of Labels in the dev dataset: ", n_dev_tags)
test_tags = list(set(test_df["Tag"].values))
print("Tags in the test dataset:", [idx2tag[i] for i in test_tags])
n_test_tags = len(test_tags)
print("Number of Labels in the test dataset: ", n_test_tags)

Number of training sentences:  14041
Number of dev sentences:  3250
Number of test sentences:  3453
Number of unique words in the training dataset:  23623
Number of words in the training dataset:  203621
Number of unique words in the dev dataset:  9966
Number of words in the dev dataset:  51362
Number of unique words in the test dataset:  9488
Number of words in the test dataset:  46435
Tags in the training dataset: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
Number of Labels in the training dataset:  9
Tags in the dev dataset: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
Number of Labels in the dev dataset:  9
Tags in the test dataset: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
Number of Labels in the test dataset:  9


In [6]:
print("What the training dataset looks like:")
# Show the first 15 rows
train_df.head(n=15)

What the training dataset looks like:


Unnamed: 0,Sentence #,Word,Tag
0,1,EU,3
1,1,rejects,0
2,1,German,7
3,1,call,0
4,1,to,0
5,1,boycott,0
6,1,British,7
7,1,lamb,0
8,1,.,0
9,2,Peter,1


In [7]:
# SentenceGetter re-organize "data" as an array of sentences
# Each sentence is a list of pairs <word,tag> 
class SentenceGetter(object):
    
    def __init__(self, dataset):
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False
        agg_func = lambda s: [(w, t) for w,t in zip(s["Word"].values.tolist(),
                                                        s["Tag"].values.tolist())]
        self.grouped = self.dataset.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

training_getter = SentenceGetter(train_df)
training_sentences = training_getter.sentences
dev_getter = SentenceGetter(dev_df)
dev_sentences = dev_getter.sentences
test_getter = SentenceGetter(test_df)
test_sentences = test_getter.sentences

# Example: training sentence #0
training_sentences[0]

[('EU', 3),
 ('rejects', 0),
 ('German', 7),
 ('call', 0),
 ('to', 0),
 ('boycott', 0),
 ('British', 7),
 ('lamb', 0),
 ('.', 0)]

In [8]:
# Convert train, dev and test data to numeric values
X_train = [[word2idx[w[0]] for w in s] for s in training_sentences]
y_train = [[w[1] for w in s] for s in training_sentences]

X_dev = [[word2idx[w[0]] for w in s] for s in dev_sentences]
y_dev = [[w[1] for w in s] for s in dev_sentences]

X_test = [[word2idx[w[0]] for w in s] for s in test_sentences]
y_test = [[w[1] for w in s] for s in test_sentences]

In [9]:
maxlen_X_train = max([len(s) for s in X_train])
maxlen_y_train = max([len(s) for s in y_train])
maxlen_X_dev = max([len(s) for s in X_dev])
maxlen_y_dev = max([len(s) for s in y_dev])
maxlen_X_test = max([len(s) for s in X_test])
maxlen_y_test = max([len(s) for s in y_test])

maxlen = max([maxlen_X_train, maxlen_y_train, maxlen_X_dev, maxlen_y_dev, maxlen_X_test, maxlen_y_test])

print(f"Max sentence length: {maxlen}")

Max sentence length: 124


In [10]:
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.utils import to_categorical

# Pad all sentences
padded_X_train = pad_sequences(maxlen=maxlen, sequences=X_train, padding="post",value=word2idx["PAD"])
padded_X_dev = pad_sequences(maxlen=maxlen, sequences=X_dev, padding="post",value=word2idx["PAD"])
padded_X_test = pad_sequences(maxlen=maxlen, sequences=X_test, padding="post",value=word2idx["PAD"])

padded_y_train = pad_sequences(maxlen=maxlen, sequences=y_train, padding="post", value=tag2idx["O"])
padded_y_dev = pad_sequences(maxlen=maxlen, sequences=y_dev, padding="post", value=tag2idx["O"])
padded_y_test = pad_sequences(maxlen=maxlen, sequences=y_test, padding="post", value=tag2idx["O"])

# Converts vector y (integers) to binary matrix.
# Each y is a one-hot vector where only the position of the tag gests "1"
categ_y_train = [to_categorical(i, num_classes=n_tags) for i in padded_y_train]
categ_y_dev = [to_categorical(i, num_classes=n_tags) for i in padded_y_dev]
categ_y_test = [to_categorical(i, num_classes=n_tags) for i in padded_y_test]

In [11]:
padded_X_train[0]

array([15812,  2790, 28480, 17661, 20018, 15339, 21650,  3244, 12060,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0], dtype=int32)

In [12]:
categ_y_train[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [13]:
# Download glove embedding vectors

!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -oq glove.6B.zip

--2023-04-21 04:52:55--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-04-21 04:52:55--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-04-21 04:52:56--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [14]:
import numpy as np

path_to_glove_file = "./glove.6B.100d.txt"

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [15]:
num_tokens = n_unique_words + 1
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word2idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 14618 words (15672 misses)


In [33]:
from keras.models import Sequential, Model
# from keras.layers import Embedding, Bidirectional, LSTM, Dense, TimeDistributed
from keras.layers import LSTM, Embedding, Dense
from keras.layers import InputLayer, TimeDistributed, SpatialDropout1D, Bidirectional
from tensorflow.keras.optimizers import Adam

model = None
model = Sequential()
model.add(Embedding(input_dim=n_unique_words+1, output_dim=embedding_dim, 
                input_length=maxlen, weights=[embedding_matrix], mask_zero=True))
model.add(Bidirectional(LSTM(units=embedding_dim, return_sequences=True, 
                            dropout=0.2, recurrent_dropout=0.2)))
model.add(TimeDistributed(Dense(n_tags)))
adam = Adam(learning_rate=0.005)
model.compile(optimizer=adam, loss=['binary_crossentropy'], metrics=['accuracy'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 124, 100)          3029000   
                                                                 
 bidirectional_3 (Bidirectio  (None, 124, 200)         160800    
 nal)                                                            
                                                                 
 time_distributed_3 (TimeDis  (None, 124, 9)           1809      
 tributed)                                                       
                                                                 
Total params: 3,191,609
Trainable params: 3,191,609
Non-trainable params: 0
_________________________________________________________________


In [34]:
from keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)
callbacks_list = [early_stopping]

model.fit(padded_X_train, np.array(categ_y_train),
                    batch_size=16,
                    epochs=100,
                    validation_data=(padded_X_dev, np.array(categ_y_dev)),
                    validation_batch_size=512,
                    verbose=1,
                    callbacks=callbacks_list)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 10: early stopping


<keras.callbacks.History at 0x7fc6d8db4220>

In [35]:
!pip install seqeval
from seqeval.metrics import f1_score, classification_report

# This function converts predicted values to labels from idx2tag
# This is to improve the interpretability of the results
def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i])
        out.append(out_i)
    return out

test_pred = model.predict(padded_X_test, verbose=1)   

pred_labels = pred2label(test_pred)
test_labels = pred2label(categ_y_test)

print("F1-score: {:.1%}".format(f1_score(test_labels, pred_labels)))
report = classification_report(y_pred=pred_labels, y_true=test_labels, output_dict=True)
df = pd.DataFrame(report).transpose()
print("Classification Report:")
print(df)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
F1-score: 70.7%
Classification Report:
              precision    recall  f1-score  support
LOC            0.706042  0.777578  0.740086   1668.0
MISC           0.738806  0.705128  0.721574    702.0
ORG            0.637681  0.662252  0.649734   1661.0
PER            0.689927  0.758194  0.722451   1617.0
micro avg      0.685305  0.729108  0.706528   5648.0
macro avg      0.693114  0.725788  0.708461   5648.0
weighted avg   0.685397  0.729108  0.706165   5648.0


In [36]:
model.save(filepath="./h5_format", save_format="h5")

In [37]:
model.save(filepath="./saved_model_format")



In [38]:
!zip -r zip_saved_model saved_model_format/

  adding: saved_model_format/ (stored 0%)
  adding: saved_model_format/keras_metadata.pb (deflated 89%)
  adding: saved_model_format/variables/ (stored 0%)
  adding: saved_model_format/variables/variables.index (deflated 62%)
  adding: saved_model_format/variables/variables.data-00000-of-00001 (deflated 24%)
  adding: saved_model_format/saved_model.pb (deflated 91%)
  adding: saved_model_format/assets/ (stored 0%)
  adding: saved_model_format/fingerprint.pb (stored 0%)


In [32]:
!rm -r saved_model_format h5_format zip_saved_model.zip

In [39]:
# def save_backup_dict(dict, filename):
#   dict_file = open(filename, "wb")
#   pickle.dump(dict, dict_file)
#   dict_file.close()
#   gfile = drive.CreateFile({'parents': [{'id': "1j4HV2ycdxxvE_qZppOGy8mND_KB_OY7l"}]})
#   gfile.SetContentFile(filename)
#   gfile.Upload()

# word2idx_filename = 'jus_word2idx.pkl'
# tag2idx_filename = 'jus_tag2idx.pkl'


import pickle
dict_file = open('jus_word2idx.pkl', "wb")
pickle.dump(dict, dict_file)
dict_file.close()

dict_file = open('jus_tag2idx.pkl', "wb")
pickle.dump(dict, dict_file)
dict_file.close()

# save_backup_dict(word2idx, word2idx_filename)
# save_backup_dict(tag2idx, tag2idx_filename)