In [1]:
import pandas as pd
import os
import gensim.downloader
from gensim.models import Word2Vec, KeyedVectors
import tensorflow as tf
import numpy as np

from keras import Model, Input
from keras.layers import LSTM, Embedding, Dense
from keras.layers import InputLayer
from tensorflow import keras
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras import regularizers
from keras.layers import Dropout
from keras.callbacks import EarlyStopping

#### Train word2vec model

In [2]:
def getSentences(path):
    file_path = path
    sentences = []
    current_sentence = []
    try:
        with open(file_path, 'r') as file:
            for line in file:
                if line.strip():
                    word = line.split()[0]
                    current_sentence.append(word)
                else:
                    if current_sentence:
                        sentences.append(current_sentence)
                    current_sentence = []
            if current_sentence:
                sentences.append(current_sentence)
            return sentences
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

def getTags(path):
    file_path = path
    tags = []
    current_sentence = []
    try:
        with open(file_path, 'r') as file:
            for line in file:
                if line.strip():
                    tag = line.split()[3]
                    current_sentence.append(tag)
                else:
                    if current_sentence:
                        tags.append(current_sentence)
                    current_sentence = []
            if current_sentence:
                tags.append(current_sentence)
            return tags
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

In [3]:
train_sentences = getSentences('./data/eng.train')
train_tags = getTags('./data/eng.train')

In [4]:
# train_w2v = Word2Vec(train_sentences, vector_size=100, window=5, min_count=1, workers=4)

#### Load w2v Model

In [5]:
w2v = gensim.downloader.load("word2vec-google-news-300")

#### Preprocess Train, Development and Test Data

In [6]:
train_path = './data/eng.train'
development_path = './data/eng.testa'
test_path = './data/eng.testb'

In [7]:
def get_data(path):
    try:
        with open(path, 'r') as file:
            data = file.readlines()
        file.close()
    except Exception as e:
        data = None
        print(e)
    
    return data

In [8]:
train_data = get_data(train_path)
development_data = get_data(development_path)
test_data = get_data(test_path)

In [9]:
def extract_sentences(data):
    split_data = [line.split(' ') for line in data] if data != None else []
    sentences = []
    current_sentence = []
    for line in split_data:
        if line == ['\n']:
            sentences.append(current_sentence)
            current_sentence = []
        else:
            word = line[0]
            tag = line[-1].replace('\n', '')
            current_sentence.append([word, tag])
    sentences.append(current_sentence)
    return sentences

def extract_words_and_tags(data):
    words = []
    tags = []
    for line in data:
        parts = line.split()
        if parts:
            word = parts[0]
            tag = parts[-1]
            words.append(word)
            tags.append(tag)
    return words, tags

In [10]:
train_sentences = extract_sentences(train_data)
train_words, train_tags = extract_words_and_tags(train_data)
development_sentences = extract_sentences(development_data)
development_words, development_tags = extract_words_and_tags(development_data)
test_sentences = extract_sentences(test_data)
test_words, test_tags = extract_words_and_tags(test_data)

In [11]:
train_voc = np.unique(np.array(train_words))
dev_voc = np.unique(np.array(development_words))
tag_set = np.unique(np.array(train_tags))

In [12]:
tag_set

array(['B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER',
       'O'], dtype='<U6')

In [13]:
print("Number of sentences (training):", len(train_sentences))
print("Number of sentences (dev):", len(development_sentences))
print("Number of sentences (test):", len(test_sentences))

Number of sentences (training): 14987
Number of sentences (dev): 3466
Number of sentences (test): 3684


In [14]:
print("Tag set (BIO):", tag_set)

Tag set (BIO): ['B-LOC' 'B-MISC' 'B-ORG' 'I-LOC' 'I-MISC' 'I-ORG' 'I-PER' 'O']


In [15]:
# train_pretrained_weights = train_w2v.wv.vectors
# train_num_tokens, train_embedding_dim = train_pretrained_weights.shape

# word2idx = train_w2v.wv.key_to_index
# word2idx['<UNK>'] = word2idx[list(word2idx.keys())[-1]]+1
# word2idx['<PAD>'] = word2idx[list(word2idx.keys())[-1]]+1

# voc = train_voc
# voc = np.append(voc,'<UNK>')
# voc = np.append(voc,'<PAD>')

# tag2idx = {k: v for v, k in enumerate(tag_set)}

In [16]:
train_pretrained_weights = w2v.vectors
train_num_tokens, train_embedding_dim = train_pretrained_weights.shape

word2idx = w2v.key_to_index
word2idx['<UNK>'] = word2idx[list(word2idx.keys())[-1]]+1
word2idx['<PAD>'] = word2idx[list(word2idx.keys())[-1]]+1
voc = word2idx.keys()
# voc = np.append(voc,'<UNK>')
# voc = np.append(voc,'<PAD>')

tag2idx = {k: v for v, k in enumerate(tag_set)}
tag2idx['<PAD>'] = 8
index_to_label={v:k for k,v in tag2idx.items()}

In [17]:
index_to_label

{0: 'B-LOC',
 1: 'B-MISC',
 2: 'B-ORG',
 3: 'I-LOC',
 4: 'I-MISC',
 5: 'I-ORG',
 6: 'I-PER',
 7: 'O',
 8: '<PAD>'}

In [18]:
num_tokens = len(voc)
embedding_dim = 50


In [19]:
def get_x_embeddings(sentences):
  sequence = []
  sent_seq = []
  for s in sentences:
    for w in s:
      if w[0] in word2idx.keys():
        sent_seq.append(word2idx[w[0]])
      else:
        sent_seq.append(word2idx['<UNK>'])
    sequence.append(sent_seq)
    sent_seq = []
  return sequence

In [20]:
import imblearn
from imblearn.under_sampling import CondensedNearestNeighbour
undersample = CondensedNearestNeighbour(n_neighbors=1)

x_train = get_x_embeddings(train_sentences)
y_train = [[tag2idx[w[1]] for w in s] for s in train_sentences]
x_dev = get_x_embeddings(development_sentences)
y_dev = [[tag2idx[w[1]] for w in s] for s in development_sentences]
x_test = get_x_embeddings(test_sentences)
y_test = [[tag2idx[w[1]] for w in s] for s in test_sentences]

#x_train,y_train=undersample.fit_resample(x_train, y_train)

x_train = pad_sequences(maxlen=embedding_dim, sequences=x_train, padding="post")
y_train = pad_sequences(maxlen=embedding_dim, sequences=y_train, padding="post", value=tag2idx['<PAD>'])
x_dev = pad_sequences(maxlen=embedding_dim, sequences=x_dev, padding="post")
y_dev = pad_sequences(maxlen=embedding_dim, sequences=y_dev, padding="post", value=tag2idx['<PAD>'])
x_test = pad_sequences(maxlen=embedding_dim, sequences=x_test, padding="post")
y_test = pad_sequences(maxlen=embedding_dim, sequences=y_test, padding="post", value=tag2idx['<PAD>'])

y_train = to_categorical(y_train)
y_dev = to_categorical(y_dev)
y_test = to_categorical(y_test)

In [21]:
y_train.shape

(14987, 50, 9)

In [22]:
# import tensorflow as tf
# from tensorflow.keras import layers, Sequential
# import numpy as np
# from tensorflow.keras.optimizers import Adam,SGD,Adagrad

# num_epochs = 200
# batch_size = 50
# num_classes = len(tag_set)
# sequence_length = 50
# output_shape=(sequence_length,9)

# model = Sequential([
#     layers.Embedding(input_dim=num_tokens, output_dim=embedding_dim, input_length=sequence_length,trainable=False),
#     layers.LSTM(sequence_length),
#     layers.Dense(sequence_length*9, activation='softmax'),
#     layers.Reshape(output_shape)
    
# ])
# desired_learning_rate = 0.001

# # Create an Adam optimizer instance with the desired learning rate
# optimizer = Adagrad(learning_rate=desired_learning_rate)

# # Compile the model with the customized optimizer
# model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=["accuracy"])
# early_stopping = EarlyStopping(patience=10,monitor='accuracy')
# # Training (replace with your data)
# model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, shuffle=True, validation_data=(x_dev, y_dev),
#          callbacks = early_stopping, workers = 4)


In [23]:
# from scikeras.wrappers import KerasRegressor,KerasClassifier
# from sklearn.model_selection import GridSearchCV
# from keras.models import Sequential
# from keras.layers import LSTM, Dense, Embedding, Dropout
# from keras.optimizers import Adam

# num_classes=9
# sequence_length=50
# output_shape=(sequence_length,num_classes)

# # Function to create the model
# def create_model(units=128, dropout_rate=0.2, learning_rate=0.001):
#     model = Sequential()
#     model.add(Embedding(input_dim=num_tokens, output_dim=embedding_dim, input_length=sequence_length, trainable=False))
#     model.add(LSTM(units=128, return_sequences=False))
#     model.add(Dense(units=256, activation='relu', kernel_regularizer=regularizers.L1L2(l1=0.25, l2=0.25)))
#     model.add(Dropout(0.2))
#     model.add(Dense(num_classes*sequence_length, activation='softmax'))
#     #model.add(tf.keras.layers.Reshape(output_shape))

#     optimizer = Adam(learning_rate=learning_rate)
#     model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
#     return model

# # Create KerasClassifier
# model = KerasRegressor(build_fn=create_model, epochs=5, batch_size=32, verbose=0)

# # Define the grid search parameters
# param_grid = {
#     'model__units': [64, 128, 256,512],
#     'model__dropout_rate': [0.1,0.2, 0.3, 0.4,0.5],
#     'model__learning_rate': [0.001, 0.01, 0.1]
# }

# # Perform GridSearchCV
# y_train_2d = y_train.reshape(y_train.shape[0], -1)
# grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=2)

In [24]:
# grid_result = grid.fit(x_train,y_train_2d)

# # Print the best parameters and the best accuracy
# print("Best Parameters: ", grid_result.best_params_)
# print("Best Accuracy: ", grid_result.best_score_)

In [37]:
from tensorflow.keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Dropout,BatchNormalization,Bidirectional
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.optimizers import Adam
from sklearn.metrics import f1_score
from seqeval.metrics import f1_score as seq_f1_score
from tensorflow.keras.callbacks import Callback
from seqeval.scheme import IOB1

num_classes=9
sequence_length=50
output_shape=(sequence_length,num_classes)

class F1ScoreCallback(Callback):
    def __init__(self, validation_data,index_to_label):
        super().__init__()
        self.validation_data = validation_data
        self.index_to_label=index_to_label

    def on_epoch_end(self, epoch, logs=None):
        # X_val, y_val = self.validation_data[0], self.validation_data[1]
        # y_pred = self.model.predict(X_val)
        
        # y_true = tf.argmax(y_val, axis=-1)
        # y_pred = tf.argmax(y_pred, axis=-1)

        # y_true = y_true.numpy().reshape((-1, 1)).tolist()
        # y_pred = y_pred.numpy().reshape((-1, 1)).tolist()

        # f1 = f1_score(y_true, y_pred, average='weighted')
        # print(f' - F1 Score: {f1}')
        X_val, y_val = self.validation_data
        y_pred = self.model.predict(X_val)
        
        y_true = self.convert_to_labels(y_val)
        y_pred = self.convert_to_labels(y_pred)

        f1 = seq_f1_score(y_true, y_pred, scheme=IOB1, average='weighted')
        print(f' - F1 Score: {f1}')

    def convert_to_labels(self, data):
        labels = []
        for seq in data:
            label_seq = [self.index_to_label[np.argmax(token)] for token in seq]
            labels.append(label_seq)
        return labels

   
# Define and compile the model
model = Sequential()
model.add(Embedding(input_dim=num_tokens, output_dim=embedding_dim, input_length=sequence_length, trainable=False))
model.add(Bidirectional(LSTM(units=128, return_sequences=False)))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(Dense(units=512, activation='relu', kernel_regularizer=regularizers.L1L2(l1=0.25, l2=0.25)))
model.add(Dense(num_classes*sequence_length, activation='softmax'))
model.add(tf.keras.layers.Reshape(output_shape))

optimizer = Adam(learning_rate=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
class_weights={
    0:15502.18,      #'B-LOC',
    1:4872.11,     #'B-MISC',
    2:7105.16,      #'B-ORG',
    3:20.57,      #'I-LOC',
    4:37.41,      #'I-MISC',
    5:17.05,      #'I-ORG',
    6:15.32,      #'I-PER',
    7:1,      #'O',
    8:1      #'<PAD>'
}
# Implement callbacks
f1_callback = F1ScoreCallback(validation_data=(x_dev, y_dev),index_to_label=index_to_label)
early_stopping = EarlyStopping(patience=10, monitor='val_loss', restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3)

model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 50, 50)            150000100 
                                                                 
 bidirectional_5 (Bidirecti  (None, 256)               183296    
 onal)                                                           
                                                                 
 batch_normalization_5 (Bat  (None, 256)               1024      
 chNormalization)                                                
                                                                 
 dropout_5 (Dropout)         (None, 256)               0         
                                                                 
 dense_10 (Dense)            (None, 512)               131584    
                                                                 
 dense_11 (Dense)            (None, 450)              

In [38]:


num_epochs = 20
batch_size = 1024

model.fit(x_train, y_train, batch_size=batch_size, shuffle=True, epochs=num_epochs, validation_data=(x_dev, y_dev), callbacks=[early_stopping,f1_callback, lr_scheduler], class_weight=class_weights)


# model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, shuffle=True, validation_data=(x_dev, y_dev),
#          callbacks = early_stopping, workers = 4)

Epoch 1/20




 - F1 Score: 0.03913393060085177
Epoch 2/20
 - F1 Score: 0.029651788772593002
Epoch 3/20
 - F1 Score: 0.02772534976144367
Epoch 4/20
 - F1 Score: 0.029782490725653678
Epoch 5/20
 - F1 Score: 0.024628884701075885
Epoch 6/20
 - F1 Score: 0.0270235644865823
Epoch 7/20
 - F1 Score: 0.018997384188154245
Epoch 8/20
 - F1 Score: 0.03413366998167083
Epoch 9/20
 2/15 [===>..........................] - ETA: 17s - loss: 44.0215 - accuracy: 0.6877

KeyboardInterrupt: 

In [39]:
from sklearn.metrics import f1_score

# y_pred = model.predict(x_test)        
# y_true = tf.argmax(y_test, axis=-1)
# y_pred = tf.argmax(y_pred, axis=-1)

# y_true = y_true.numpy().reshape((-1, 1)).tolist()
# y_pred = y_pred.numpy().reshape((-1, 1)).tolist()

# f1 = f1_score(y_true, y_pred, average='weighted')
# print(f' - F1 Score: {f1}')
predictions = model.predict(x_test)
print(predictions.shape)
print(y_test.shape)
# predictions = predictions.reshape(-1, sequence_length, num_classes)  # Reshape predictions to match the sequence_length
# true_labels = y_test.reshape(-1, sequence_length, num_classes)  # Reshape true labels similarly
predicted_label_sequences = []  # List to store predicted label sequences
true_label_sequences = []  # List to store true label sequences
for sequence in predictions:
    # Decode the predicted labels for each sequence
    decoded_sequence = [index_to_label[np.argmax(label,axis=-1)] for label in sequence]
    predicted_label_sequences.append(decoded_sequence)
for sequence in y_test:
    # Decode the true labels for each sequence
    decoded_sequence = [index_to_label[np.argmax(label,axis=-1)] for label in sequence]
    true_label_sequences.append(decoded_sequence)
print(predicted_label_sequences)
print(true_label_sequences)
f1 = seq_f1_score(true_label_sequences, predicted_label_sequences, scheme=IOB1, average='weighted')
print("F1 Score:", f1)

(3684, 50, 9)
(3684, 50, 9)


TypeError: unhashable type: 'numpy.ndarray'

In [28]:
loss, accuracy = model.evaluate(x_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')


Test Loss: 1533.6507568359375, Test Accuracy: 0.12062975019216537
