In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support, classification_report

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

import tensorflow as tf
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout, Bidirectional
from tensorflow.keras.utils import to_categorical

import more_itertools as mit
import joblib

In [2]:
# setting the seed value, so that the resutls can be reproduced
np.random.seed(42)
tf.random.set_seed(42)

# Setting max length of a sequence
MAX_LEN = 64

# Training batch size
TRAIN_BATCH_SIZE = 64

# Number of epochs for training the model
EPOCHS = 50

# Path where we want to save the model
MODEL_PATH = "model_LSTM_ENG.bin"

# Train and Test files
files = {
        "train": "./data/engtrain.bio", 
        "test": "./data/engtest.bio"
        }

# Dimension of word embeddings
EMBEDDING_DIM = 256

In [3]:
# We need to prepare the vocabulary using both train and test data set
def prepare_vocab(files):
    train_data = pd.read_csv(files['train'], sep="\t", names=['TAG', 'WORD'], skip_blank_lines=True, dtype="string", skipfooter=1)
    test_data = pd.read_csv(files['test'], sep="\t", names=['TAG', 'WORD'], skip_blank_lines=True, dtype="string", skipfooter=1)
    
    train_vocab = set(train_data['WORD'])
    test_vocab = set(test_data['WORD'])
    
    vocab = list(train_vocab.union(test_vocab))
    return vocab

In [4]:
# Function to read the dataset, pre process and prepare it for the model to consume for training

def read_process_and_prepare_data(files):
    
    # Getting the vocabulary
    vocab = prepare_vocab(files)
    
    # Constructing word to id and id to word dictionaries
    word2id_dict = dict((word, idx) for idx, word in enumerate(vocab))
    id2word_dict = {idx: word for word, idx in word2id_dict.items()}
    
    # Read the train dataset including blank lines (separator for each sentence)
    df = pd.read_csv(files['train'], sep="\t", names=['TAG', 'WORD'], skip_blank_lines=False, dtype="string", skipfooter=1)
    
    # Filling the blank lines which are read as null values in dataframe with a value "split_at"
    df.fillna("split_at", inplace=True)
    
    filt = df['TAG']!="split_at"
    tags_enc = LabelEncoder()
    
    # train the LabelEncoder on the 'TAG' column excluding the value "split_at"
    tags_enc.fit_transform(df.loc[filt, 'TAG'])
    
    # Construct the tags2id dictionary: ({"tag": <id>})
    tags2id_dict = dict(zip(tags_enc.classes_, tags_enc.transform(tags_enc.classes_)))
    
    # Replace the 'TAG' values with encoded values using dictionary constructed in above step
    df['TAG'] = df['TAG'].map(tags2id_dict)
    
    # "split_at" will be ignored while replacing the 'TAG' values, as we did not want the labelEncoder to encode this special token
    df['TAG'].fillna("split_at", inplace=True)
    
    # Replace each word in 'WORD' column using the word2id dictionary constructed previously ({"word": <id>}) using vocabulary
    df['WORD'] = df['WORD'].map(word2id_dict)
    
    # Since we did not include "split_at" in the vocabulary, while replacing words with id values, this will be skipped and will have NaN values
    # Replacing NaN values with "split_at"
    df['WORD'].fillna("split_at", inplace=True)
    
    # Constructing sentences from 'WORD' column splitting at "spli_at" token (because sentences are delimited with "split_at" token)
    sentences = np.array(list(mit.split_at(df['WORD'].tolist(), pred=lambda x: x=='split_at')))
    
    # For each sentence, construct list of tags for each word in that sentence
    tags = list(list(mit.split_at(df['TAG'], pred=lambda x: x=='split_at')))
    
    # Pad the tags to MAX_LEN with value -1
    padded_tags = pad_sequences(tags, maxlen=MAX_LEN, padding='post', value=-1)
    
    # Convert the tag for each word into onehot encoded vecotrs
    padded_tags = to_categorical(padded_tags)
    
    
    padding_value = len(vocab) + 1
    
    # Pad the sentences to MAX_LEN with the padding value as len(vocab)+1
    padded_sentences = pad_sequences(sentences, maxlen=MAX_LEN, padding='post', truncating='post', value=padding_value)
    
    # Construct a dictionary and return for later use
    dataset = {
                "sentences": sentences,
                "padded_sentences": padded_sentences,
                "tags": padded_tags,
                "tags_enc": tags_enc,
                "word2id": word2id_dict,
                "id2word": id2word_dict,
                "tags2id": tags2id_dict,
                "vocab_len": len(vocab)
              }
    
    return dataset

In [5]:
# Function to create the model

def create_model(input_dim, output_dim, seq_len, num_tags):
    model = Sequential()
    
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=seq_len))
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True)))
    model.add(LSTM(units=output_dim, return_sequences=True))
    model.add(Dense(num_tags, activation="softmax"))
    
    optimizer = Adam(learning_rate = 0.001)
    
    # Using "categorical_crossentropy" loss as we are using one hot encoded representation of each tag
    model.compile(loss = 'categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    # Print the summar of the model constructed above and return the compiled model
    print(model.summary())
    
    return model

In [6]:
# Function to train the network

def train_network(model, X_train, y_train, X_val, y_val):
    
    # Using EarlyStopping as callbacks while calling fit method on the model as we monitor the loss on validation set.
    # This will prevent us from training for large number of epochs if there is no improvement in the validation loss
    early_stopping = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
    
    # Training the model and saving the history of the model training information
    history = model.fit(x=X_train, y=y_train, batch_size = TRAIN_BATCH_SIZE, validation_data=(X_val, y_val), epochs = EPOCHS, callbacks=early_stopping)
    
    # Return the trained model and the history of it
    return history, model

In [7]:
# Function to read and prepare the test dataset

def prepare_test_data(file, tag_enc, tags2id_dict, word2id_dict, vocab_len):
    
    # Read the dataset including the blan line separators and fill the null values with token "split_at"
    df = pd.read_csv(file, sep="\t", names=['TAG', 'WORD'], skip_blank_lines=False, dtype="string", skipfooter=1)
    df.fillna("split_at", inplace=True)
    
    filt = df['TAG']!="split_at"
    
    # Using the encoder that is fit on train data to transform the tag values in test data
    tag_enc.transform(df.loc[filt, 'TAG'])
#     tags2id_dict = dict(zip(tags_enc.classes_, tags_enc.transform(tags_enc.classes_)))
    
    df['TAG'] = df['TAG'].map(tags2id_dict)
    
    # "split_at" token will be left out during encoding the tag values and will be replaced as NaN, so replacing NaN with "split_at"
    df['TAG'].fillna("split_at", inplace=True)
    
    # Using the vocab mapping dictionary of word to id, convert the words into their respective ids
    df['WORD'] = df['WORD'].map(word2id_dict)
    
    # Fill the NaN VALUES FOR WORD colum with "split_at"
    df['WORD'].fillna("split_at", inplace=True)
        
    # Construct the sentences with the help of "split_at" token using more_itertools
    sentences = np.array(list(mit.split_at(df['WORD'].tolist(), pred=lambda x: x=='split_at')))
    
    # For each sentence, get the list of tags for each word in that sentence
    tags = list(list(mit.split_at(df['TAG'], pred=lambda x: x=='split_at')))
    
    # Pad the tags for each sentence to have MAX_LEN
    padded_tags = pad_sequences(tags, maxlen=MAX_LEN, padding='post', value=-1)
    
    # Convert the tags to one hot encoded representation
    padded_tags = to_categorical(padded_tags)
    
    padding_value = vocab_len
    
    # Pad each sentences to have MAX_LEN
    padded_sentences = pad_sequences(sentences, maxlen=MAX_LEN, padding='post', truncating='post', value=padding_value)
    
    # Return the dictionary for later usage
    test_dataset = {
                    "sentences": sentences,
                    "padded_sentences": padded_sentences,
                    "tags": tags,
                    "padded_tags": padded_tags
                    }
    
    return test_dataset

In [8]:
if __name__ == '__main__':
    
    # Read the data
    dataset = read_process_and_prepare_data(files)
    
    # Split the training data into train and val data with 10% of train data set aside as validation data
    X_train, X_val, y_train, y_val = train_test_split(dataset['padded_sentences'], np.array([np.array(x) for x in dataset['tags']]), random_state=42, test_size=0.2)
    
    # Creating the model
    model = create_model(input_dim=dataset['vocab_len'] + 2, output_dim=EMBEDDING_DIM, seq_len=MAX_LEN, num_tags=len(dataset['tags_enc'].classes_))
    
    # Calling the function to train the network (model)
    history, model = train_network(model, np.array(X_train), np.array(y_train), np.array(X_val), np.array(y_val))
    

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  del sys.path[0]


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 64, 256)           1915648   
_________________________________________________________________
bidirectional (Bidirectional (None, 64, 512)           1050624   
_________________________________________________________________
lstm_1 (LSTM)                (None, 64, 256)           787456    
_________________________________________________________________
dense (Dense)                (None, 64, 25)            6425      
Total params: 3,760,153
Trainable params: 3,760,153
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50


## Model performance on Test dataset

In [10]:
# Check the trained model performance on the test dataset

# Read and prepare the test dataset
test_dataset = prepare_test_data(file=files['test'], tag_enc=dataset['tags_enc'], tags2id_dict=dataset['tags2id'], word2id_dict=dataset['word2id'], vocab_len=dataset['vocab_len'])

# Predict the tags for the teset dataset using the trained model
test_predictions = model.predict(test_dataset['padded_sentences'])

# Get the predictions of tag for each word in the sentence for each sentence
predicted_tags = []
for i in range(len(test_dataset['tags'])):
    predicted_tags.append( np.argmax(test_predictions[i], axis=1)[:len(test_dataset['sentences'][i])] )

# Get the classification report to check the model performance
print(classification_report(np.concatenate(test_dataset['tags']), np.concatenate(predicted_tags)))

  


              precision    recall  f1-score   support

         0.0       0.84      0.92      0.88       812
         1.0       0.28      0.10      0.15        90
         2.0       0.88      0.78      0.83       456
         3.0       0.89      0.93      0.91      1117
         4.0       0.55      0.60      0.57       491
         5.0       0.98      0.97      0.97       500
         6.0       0.91      0.74      0.82       451
         7.0       0.00      0.00      0.00        56
         8.0       0.84      0.39      0.53        54
         9.0       0.77      0.33      0.46       562
        10.0       0.00      0.00      0.00        30
        11.0       0.96      0.93      0.94       720
        12.0       0.82      0.93      0.87       862
        13.0       0.43      0.08      0.13        75
        14.0       0.92      0.77      0.84       496
        15.0       0.87      0.68      0.76       222
        16.0       0.63      0.36      0.46       496
        17.0       0.93    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Model performance on Train dataset

In [12]:
# Check the trained model performance on the train dataset

# Read and prepare the test dataset
dataset = prepare_test_data(file=files['train'], tag_enc=dataset['tags_enc'], tags2id_dict=dataset['tags2id'], word2id_dict=dataset['word2id'], vocab_len=dataset['vocab_len'])

# Predict the tags for the teset dataset using the trained model
train_predictions = model.predict(dataset['padded_sentences'])

# Get the predictions of tag for each word in the sentence for each sentence
predicted_tags = []
for i in range(len(dataset['tags'])):
    predicted_tags.append(np.argmax(train_predictions[i], axis=1)[:len(dataset['sentences'][i])])

# Get the classification report to check the model performance
print(classification_report(np.concatenate(dataset['tags']), np.concatenate(predicted_tags)))

  


              precision    recall  f1-score   support

         0.0       0.86      0.96      0.91      3220
         1.0       0.45      0.21      0.29       385
         2.0       0.93      0.91      0.92      1720
         3.0       0.93      0.95      0.94      4354
         4.0       0.74      0.75      0.75      1927
         5.0       0.98      0.97      0.97      2007
         6.0       0.90      0.76      0.83      1869
         7.0       0.00      0.00      0.00       221
         8.0       0.88      0.40      0.55       245
         9.0       0.92      0.44      0.60      2376
        10.0       0.00      0.00      0.00       113
        11.0       0.97      0.94      0.95      2858
        12.0       0.87      0.96      0.91      3474
        13.0       0.55      0.11      0.18       342
        14.0       0.96      0.93      0.95      1850
        15.0       0.94      0.80      0.86       786
        16.0       0.85      0.57      0.68      1687
        17.0       0.98    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
