## Part 1 Sequence Tagging: NER

In [1]:
import pandas as pd
import os
import gensim.downloader
from gensim.models import Word2Vec, KeyedVectors
import tensorflow as tf
import numpy as np

from keras import Model, Input
from keras.layers import LSTM, Embedding, Dense
from keras.layers import InputLayer
from tensorflow import keras
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras import regularizers
from keras.layers import Dropout
from keras.callbacks import EarlyStopping

### 1.1 Word Embedding

In [2]:
# Download the pretrained word2vec embeddings and save the model
# Uncomment the lines below to download and save the pretrained model

# google_vectors = gensim.downloader.load('word2vec-google-news-300')
# google_vectors.save('./data/word2vec.model')

In [3]:
# Load the model
w2v = KeyedVectors.load('./data/word2vec.model')

In [5]:
#Cosine similarity
student = w2v.most_similar('student')[0]
Apple = w2v.most_similar('Apple')[0]
apple = w2v.most_similar('apple')[0]

print(f'The most similar word to student is {student[0]} with a cosine similarity of {student[1]}')
print(f'The most similar word to Apple is {Apple[0]} with a cosine similarity of {Apple[1]}')
print(f'The most similar word to apple is {apple[0]} with a cosine similarity of {apple[1]}')

The most similar word to student is students with a cosine similarity of 0.7294865846633911
The most similar word to Apple is Apple_AAPL with a cosine similarity of 0.7456987500190735
The most similar word to apple is apples with a cosine similarity of 0.720359742641449


### 1.2 Data

First step is to write functions that get the sentences and the tags from the train, development, and test data

In [6]:
def getSentences(path):
    file_path = path
    sentences = []
    current_sentence = []
    try:
        with open(file_path, 'r') as file:
            for line in file:
                if line.strip():
                    word = line.split()[0]
                    current_sentence.append(word)
                else:
                    if current_sentence:
                        sentences.append(current_sentence)
                    current_sentence = []
            if current_sentence:
                sentences.append(current_sentence)
            return sentences
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

def getTags(path):
    file_path = path
    tags = []
    current_sentence = []
    try:
        with open(file_path, 'r') as file:
            for line in file:
                if line.strip():
                    tag = line.split()[3]
                    current_sentence.append(tag)
                else:
                    if current_sentence:
                        tags.append(current_sentence)
                    current_sentence = []
            if current_sentence:
                tags.append(current_sentence)
            return tags
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

In [12]:
train_sentences = getSentences('./data/eng.train')
train_tags = getTags('./data/eng.train')
development_sentences = getSentences('./data/eng.testa')
development_tags = getTags('./data/eng.testa')
test_sentences = getSentences('./data/eng.testb')
test_tags = getTags('./data/eng.testb')

#### a) Describe the size of the datasets and the complete set of all possible word labels

In [13]:
print("Number of sentences (training):", len(train_sentences))
print("Number of sentences (dev):", len(development_sentences))
print("Number of sentences (test):", len(test_sentences))

Number of sentences (training): 14987
Number of sentences (dev): 3466
Number of sentences (test): 3684


In [16]:
all_tags = [tag for sentence in train_tags for tag in sentence]
unique_tags = set(all_tags)
print("Word Labels: ", unique_tags)

Word Labels:  {'B-MISC', 'I-PER', 'I-LOC', 'I-MISC', 'O', 'B-ORG', 'B-LOC', 'I-ORG'}


#### b) Choose an example sentence from the Training set containing at least two named entities with more than one word

In [20]:
def get_data(path):
    try:
        with open(path, 'r') as file:
            data = file.readlines()
        file.close()
    except Exception as e:
        data = None
        print(e)
    
    return data

In [21]:
train_data = get_data('./data/eng.train')

In [58]:
data_text = ''.join(train_data)

sentences = data_text.strip().split('\n\n')

for sentence in sentences:
    lines = sentence.split('\n')

    named_entities_count = sum(1 for line in lines if (len(line.split()) > 2) 
                                                   and line.split()[3].startswith('I'))

    if named_entities_count >= 2:
        words = [line.split()[0] for line in lines]
        print(sentence)
        print()
        print(' '.join(words))
        break

EU NNP I-NP I-ORG
rejects VBZ I-VP O
German JJ I-NP I-MISC
call NN I-NP O
to TO I-VP O
boycott VB I-VP O
British JJ I-NP I-MISC
lamb NN I-NP O
. . O O

EU rejects German call to boycott British lamb .


From the labels, you can form complete named entities as follows: 

EU (I-ORG): "EU" is the complete named entity, and the label "I-ORG" indicates that it is an organization. 

German (I-MISC): "German" is an incomplete named entity, and the label "I-MISC" indicates that it is a miscellaneous entity. 

British (I-MISC): "British" is another incomplete named entity labeled as "I-MISC." 

So, in this sentence, "EU" is a complete named entity, and "German" and "British" are incomplete named entities. The specific nature of the incomplete entities is not specified in this sentence. 

### 1.3 Model

#### Preprocess the data

In [27]:
train_path = './data/eng.train'
development_path = './data/eng.testa'
test_path = './data/eng.testb'

In [28]:
train_data = get_data(train_path)
development_data = get_data(development_path)
test_data = get_data(test_path)

In [29]:
def extract_sentences(data):
    split_data = [line.split(' ') for line in data] if data != None else []
    sentences = []
    current_sentence = []
    for line in split_data:
        if line == ['\n']:
            sentences.append(current_sentence)
            current_sentence = []
        else:
            word = line[0]
            tag = line[-1].replace('\n', '')
            current_sentence.append([word, tag])
    sentences.append(current_sentence)
    return sentences

def extract_words_and_tags(data):
    words = []
    tags = []
    for line in data:
        parts = line.split()
        if parts:
            word = parts[0]
            tag = parts[-1]
            words.append(word)
            tags.append(tag)
    return words, tags

In [30]:
train_sentences = extract_sentences(train_data)
train_words, train_tags = extract_words_and_tags(train_data)
development_sentences = extract_sentences(development_data)
development_words, development_tags = extract_words_and_tags(development_data)
test_sentences = extract_sentences(test_data)
test_words, test_tags = extract_words_and_tags(test_data)

In [31]:
train_voc = np.unique(np.array(train_words))
dev_voc = np.unique(np.array(development_words))
tag_set = np.unique(np.array(train_tags))

In [33]:
train_pretrained_weights = w2v.vectors
train_num_tokens, train_embedding_dim = train_pretrained_weights.shape

word2idx = w2v.key_to_index
word2idx['<UNK>'] = word2idx[list(word2idx.keys())[-1]]+1
word2idx['<PAD>'] = word2idx[list(word2idx.keys())[-1]]+1
voc = word2idx.keys()
# voc = np.append(voc,'<UNK>')
# voc = np.append(voc,'<PAD>')

tag2idx = {k: v for v, k in enumerate(tag_set)}
tag2idx['<PAD>'] = 8
index_to_label={v:k for k,v in tag2idx.items()}

In [34]:
num_tokens = len(voc)
embedding_dim = 50

In [35]:
def get_x_embeddings(sentences):
  sequence = []
  sent_seq = []
  for s in sentences:
    for w in s:
      if w[0] in word2idx.keys():
        sent_seq.append(word2idx[w[0]])
      else:
        sent_seq.append(word2idx['<UNK>'])
    sequence.append(sent_seq)
    sent_seq = []
  return sequence

In [46]:
import imblearn
from imblearn.under_sampling import CondensedNearestNeighbour
undersample = CondensedNearestNeighbour(n_neighbors=1)
from sklearn.utils.class_weight import compute_class_weight

x_train = get_x_embeddings(train_sentences)
y_train = [[tag2idx[w[1]] for w in s] for s in train_sentences]
x_dev = get_x_embeddings(development_sentences)
y_dev = [[tag2idx[w[1]] for w in s] for s in development_sentences]
x_test = get_x_embeddings(test_sentences)
y_test = [[tag2idx[w[1]] for w in s] for s in test_sentences]

x_train = pad_sequences(maxlen=embedding_dim, sequences=x_train, padding="post")
y_train = pad_sequences(maxlen=embedding_dim, sequences=y_train, padding="post")
x_dev = pad_sequences(maxlen=embedding_dim, sequences=x_dev, padding="post")
y_dev = pad_sequences(maxlen=embedding_dim, sequences=y_dev, padding="post")
x_test = pad_sequences(maxlen=embedding_dim, sequences=x_test, padding="post")
y_test = pad_sequences(maxlen=embedding_dim, sequences=y_test, padding="post")

y_train_1D=y_train.reshape(-1)
class_weights = compute_class_weight(
                                        class_weight = "balanced",
                                        classes = np.unique(y_train_1D),
                                        y =y_train_1D                                           
                                    )
class_weights = dict(zip(np.unique(y_train_1D), class_weights))

y_train = to_categorical(y_train)
y_dev = to_categorical(y_dev)
y_test = to_categorical(y_test)

In [63]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Dropout,BatchNormalization,Bidirectional
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.optimizers import Adam
from sklearn.metrics import f1_score
from seqeval.metrics import f1_score as seq_f1_score
from keras.callbacks import Callback
from seqeval.scheme import IOB1

num_classes=8
sequence_length=50
output_shape=(sequence_length,num_classes)

class F1ScoreCallback(Callback):
    def __init__(self, validation_data,index_to_label):
        super().__init__()
        self.validation_data = validation_data
        self.index_to_label=index_to_label
        self.best_f1 = 0
        self.wait = 0
        self.patience = 10

    def on_epoch_end(self, epoch, logs=None):

        X_val, y_val = self.validation_data
        y_pred = self.model.predict(X_val)
        
        y_true = self.convert_to_labels(y_val)
        y_pred = self.convert_to_labels(y_pred)

        f1 = seq_f1_score(y_true, y_pred, scheme=IOB1)
        print(f' - F1 Score: {f1}')
        
        if f1 > self.best_f1:
            self.best_f1 = f1
            self.wait = 0
        else:
            self.wait += 1

        if self.wait >= self.patience:
            self.model.stop_training = True

    def convert_to_labels(self, data):
        labels = []
        for seq in data:
            label_seq = [self.index_to_label[np.argmax(token)] for token in seq]
            labels.append(label_seq)
        return labels

   
# Define and compile the model
model = Sequential()
model.add(Embedding(input_dim=num_tokens, output_dim=embedding_dim, input_length=sequence_length, trainable=False))
model.add(Bidirectional(LSTM(units=128, return_sequences=False)))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(Dense(units=512, activation='relu', kernel_regularizer=regularizers.L1L2(l1=0.25, l2=0.25)))
model.add(Dense(num_classes*sequence_length, activation='softmax'))
model.add(tf.keras.layers.Reshape(output_shape))

optimizer = Adam(learning_rate=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
f1_callback = F1ScoreCallback(validation_data=(x_dev, y_dev),index_to_label=index_to_label)
early_stopping = EarlyStopping(patience=10, monitor='val_loss', restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3)

model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 50, 50)            150000100 
                                                                 
 bidirectional_11 (Bidirect  (None, 256)               183296    
 ional)                                                          
                                                                 
 batch_normalization_11 (Ba  (None, 256)               1024      
 tchNormalization)                                               
                                                                 
 dropout_11 (Dropout)        (None, 256)               0         
                                                                 
 dense_22 (Dense)            (None, 512)               131584    
                                                                 
 dense_23 (Dense)            (None, 400)             

In [64]:
num_epochs = 20
batch_size = 1024

model.fit(x_train, y_train, batch_size=batch_size, shuffle=True, epochs=num_epochs, validation_data=(x_dev, y_dev), callbacks=[early_stopping,f1_callback, lr_scheduler], class_weight=class_weights)

Epoch 1/20
 - F1 Score: 0.8194334725200315
Epoch 2/20
 - F1 Score: 0.832748189552729
Epoch 3/20
 - F1 Score: 0.8439918561764818
Epoch 4/20
 - F1 Score: 0.8460465932915852
Epoch 5/20
 - F1 Score: 0.8505142344010888
Epoch 6/20
 - F1 Score: 0.8485974544720708
Epoch 7/20
 - F1 Score: 0.8599688064069537
Epoch 8/20
 - F1 Score: 0.8194334725200315
Epoch 9/20
 - F1 Score: 0.8267953322872531
Epoch 10/20
 - F1 Score: 0.8457983402077224
Epoch 11/20
 - F1 Score: 0.8443338169466351
Epoch 12/20
 - F1 Score: 0.8535653709752029
Epoch 13/20
 - F1 Score: 0.850461580936276
Epoch 14/20
 - F1 Score: 0.8307982874157317
Epoch 15/20
 - F1 Score: 0.8457047825015741
Epoch 16/20
 - F1 Score: 0.8433904979587341
Epoch 17/20
 - F1 Score: 0.8500047066370555


<keras.src.callbacks.History at 0x1de5ab35850>

In [54]:
from seqeval.metrics import f1_score

# print(f' - F1 Score: {f1}')
predictions = model.predict(x_test)
print(predictions.shape)
print(y_test.shape)
predicted_label_sequences = []  # List to store predicted label sequences
true_label_sequences = []  # List to store true label sequences
for sequence in predictions:
    # Decode the predicted labels for each sequence
    decoded_sequence = [index_to_label[np.argmax(label,axis=-1)] for label in sequence]
    predicted_label_sequences.append(decoded_sequence)
for sequence in y_test:
    # Decode the true labels for each sequence
    decoded_sequence = [index_to_label[np.argmax(label,axis=-1)] for label in sequence]
    true_label_sequences.append(decoded_sequence)
f1 = f1_score(true_label_sequences, predicted_label_sequences, scheme=IOB1)
print("F1 Score:", f1)

(3684, 50, 8)
(3684, 50, 8)
F1 Score: 0.8734398715520376
