In [None]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=01688d83b103240a09b57e48d7ca2e7b60a29f5d0524717c22a0026d6afd9a72
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


1. Load the data in google Drive

In [1]:
import os
from google.colab import drive
drive.mount('/content/drive')
path ="/content/drive/My Drive/"
os.chdir(path)
# os.listdir(path)

Mounted at /content/drive


2. Process data

In [None]:
import tensorflow as tf
import numpy as np
import os
conll2003_path = "/content/drive/MyDrive/nlp/A2"
datasetpath= "/content/drive/MyDrive/nlp/A2"
def load_file(path = "/train.txt"):
    # Load the dataset
    train_sentences = []
    train_labels = []
    with open(conll2003_path + path) as f:
        sentence = []
        labels = []
        for line in f:
            line = line.strip()
            if line: # Split each line into four parts: word, part of speech, block, and label
                word, pos, chunk, label = line.split()
                sentence.append(word)
                labels.append(label)
            else:
                train_sentences.append(sentence)
                train_labels.append(labels)
                sentence = []
                labels = []
    return train_sentences, train_labels

In [None]:
max_len=64
def preproces(word2idx, tag2idx, num_tags, train_sentences,  train_labels):
    # Convert sentences and labels to numerical sequences
    x = [[word2idx[word.lower()] for word in sentence] for sentence in train_sentences]
    x = tf.keras.preprocessing.sequence.pad_sequences(maxlen=max_len, sequences=x, padding="post", value=0)
    y = [[tag2idx[tag] for tag in labels] for labels in train_labels]
    y = tf.keras.preprocessing.sequence.pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])
    y = tf.keras.utils.to_categorical(y, num_tags)
    return x, y

In [None]:
def get_dataset():
    # Load the dataset
    train_sentences, train_labels = load_file("/train.txt")
    valid_sentences, valid_labels = load_file("/valid.txt")
    test_sentences, test_labels = load_file("/test.txt")
    # Create vocabulary and tag dictionaries
    all_sentencses = np.concatenate([train_sentences, valid_sentences,test_sentences])
    all_labels = np.concatenate([train_labels, valid_labels, test_labels])
    vocab = set()
    tags = set()
    for sentence in all_sentencses:
        for word in sentence:
            vocab.add(word.lower())
    word2idx = {}
    if len(word2idx) == 0:
        word2idx["PADDING_TOKEN"] = len(word2idx)
        word2idx["UNKNOWN_TOKEN"] = len(word2idx)
    for word in vocab:
        word2idx[word] = len(word2idx)

    for labels in all_labels:
        for label in labels:
            tags.add(label)
    tag2idx = {t: i for i, t in enumerate(tags)}
    save_dict(word2idx, os.path.join(conll2003_path, 'word2idx.json'),)
    save_dict(tag2idx, os.path.join(conll2003_path, 'idx2Label.json'),)
    num_words = len(word2idx) + 1
    num_tags = len(tag2idx)
    train_X, train_y = preproces(word2idx, tag2idx, num_tags, train_sentences, train_labels);
    valid_X, valid_y = preproces(word2idx, tag2idx, num_tags, valid_sentences, valid_labels);
    test_X, test_y = preproces(word2idx, tag2idx, num_tags, test_sentences, test_labels);
    np.savez( os.path.join(conll2003_path ,'dataset.npz'), train_X = train_X, train_y = train_y, valid_X = valid_X, valid_y =valid_y , test_X =test_X, test_y= test_y)
    return train_X, train_y, valid_X, valid_y , test_X, test_y

In [None]:
def save_dict(dict, file_path):
    import json
    # Saving the dictionary to a file
    with open(file_path, 'w') as f:
        json.dump(dict, f)
def load_dict(path_file):
    import json
    # Loading the dictionary from the file
    with open(path_file, 'r') as f:
        loaded_dict = json.load(f)
        return loaded_dict;
    print(loaded_dict)

In [None]:
train_X, train_y, valid_X, valid_y , test_X, test_y=get_dataset()

In [None]:
import tensorflow as tf
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed
import keras as keras
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
import numpy as np

3. Use GPU to accelerate

In [None]:
if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    print("Please install GPU version of TensorFlow")

config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)
# physical_devices = tf.config.list_physical_devices('GPU')
# tf.config.experimental.set_memory_growth(physical_devices[0], True)

Default GPU Device: /device:GPU:0


4. Define LSTM Model

In [None]:
def create_model():
    word2idx = load_dict('/content/drive/MyDrive/nlp/A2/word2idx.json')
    tag2idx = load_dict('/content/drive/MyDrive/nlp/A2/idx2Label.json')
    num_words = len(word2idx) + 1
    num_tags = len(tag2idx)

    # Define the model
    input_layer = Input(shape=(None,))
    embedding_layer = Embedding(input_dim=num_words, output_dim=60, input_length=max_len)(input_layer)
    lstm_layer = LSTM(units=50, return_sequences=True, dropout=0.5)(embedding_layer)
    #lstm_layer2 = LSTM(units=50, return_sequences=True, dropout=0.5)(lstm_layer)
    #lstm_layer3 = LSTM(units=50, return_sequences=True, dropout=0.5)(lstm_layer2)
    output_layer = TimeDistributed(Dense(num_tags, activation="softmax"))(lstm_layer)

    model = Model(input_layer, output_layer)

    return model

In [None]:
def train( model,  train_X, train_y, valid_X, valid_y):
    # Define the path and file name for saving the model
    model_path = '/content/drive/MyDrive/nlp/A2/lstm.h5'
    # Define early stop callback function
    early_stop = EarlyStopping(monitor='val_accuracy', patience=3, mode='max', verbose=1)
    # Define the ModelCheckpoint callback function
    checkpoint = ModelCheckpoint(model_path, monitor='val_accuracy', save_best_only=True, mode='max', verbose=1)
    # Compile and train the model
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    model.fit(train_X, train_y, batch_size=32, epochs=20, validation_data=(valid_X, valid_y), callbacks=[early_stop, checkpoint])

In [None]:
def test(test_X, test_y ):
    model = keras.models.load_model('/content/drive/MyDrive/nlp/A2/lstm.h5')
    # Evaluation Model
    scores = model.evaluate(test_X, test_y, verbose=0)
    print("Test Accuracy:", scores[1])
    #print(scores)

5. Train & Eval the Model

In [None]:
model= create_model()
train(model, np.concatenate([train_X, valid_X]), np.concatenate([train_y, valid_y]),test_X, test_y)
test(test_X, test_y)

Epoch 1/20
Epoch 1: val_accuracy improved from -inf to 0.96858, saving model to /content/drive/MyDrive/nlp/A2/lstm.h5
Epoch 2/20
Epoch 2: val_accuracy improved from 0.96858 to 0.97257, saving model to /content/drive/MyDrive/nlp/A2/lstm.h5
Epoch 3/20
Epoch 3: val_accuracy improved from 0.97257 to 0.97618, saving model to /content/drive/MyDrive/nlp/A2/lstm.h5
Epoch 4/20
Epoch 4: val_accuracy improved from 0.97618 to 0.97848, saving model to /content/drive/MyDrive/nlp/A2/lstm.h5
Epoch 5/20
Epoch 5: val_accuracy improved from 0.97848 to 0.98097, saving model to /content/drive/MyDrive/nlp/A2/lstm.h5
Epoch 6/20
Epoch 6: val_accuracy did not improve from 0.98097
Epoch 7/20
Epoch 7: val_accuracy improved from 0.98097 to 0.98257, saving model to /content/drive/MyDrive/nlp/A2/lstm.h5
Epoch 8/20
Epoch 8: val_accuracy improved from 0.98257 to 0.98273, saving model to /content/drive/MyDrive/nlp/A2/lstm.h5
Epoch 9/20
Epoch 9: val_accuracy did not improve from 0.98273
Epoch 10/20
Epoch 10: val_accura

6. Predict the Labels

In [None]:
predictions = model.predict(test_X)
# Convert predicted tags back to labels
predicted_labels = []
predicted_tags = tf.argmax(predictions, axis=-1)
word2idx = load_dict('/content/drive/MyDrive/nlp/A2/word2idx.json')
tag2idx = load_dict('/content/drive/MyDrive/nlp/A2/idx2Label.json')
for tags in predicted_tags:
    labels = [list(tag2idx.keys())[tag] for tag in tags if tag != 0]
    predicted_labels.append(labels)



In [None]:
test_sentences, test_labels = load_file("/test.txt")
predicted_labels2=predicted_labels
# Unify the dimension
for i in range(len(test_labels)):
  test_labels[i]=test_labels[i][:64]
  predicted_labels2[i]=predicted_labels2[i]+['O']*(64-len(predicted_labels2[i]))
  predicted_labels2[i]=predicted_labels2[i][:len(test_labels[i])]

7. Use Segeval to Evaluate the Tagger on the Test Set

In [None]:
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score
from seqeval.metrics import accuracy_score,precision_score,recall_score
print("Accuracy Score : ",accuracy_score(test_labels, predicted_labels2))
print("Precision Score : ",precision_score(test_labels, predicted_labels2))
print("Recall Score : ",recall_score(test_labels, predicted_labels2))
print("F1 Score : ",f1_score(test_labels, predicted_labels2))
print("-"*30)
print("Classification_Report")
print(classification_report(test_labels, predicted_labels2))

Accuracy Score :  0.9013639780904307
Precision Score :  0.56312625250501
Recall Score :  0.5993601137575542
F1 Score :  0.580678491475805
------------------------------
Classification_Report
              precision    recall  f1-score   support

         LOC       0.66      0.70      0.68      1661
        MISC       0.56      0.55      0.56       702
         ORG       0.42      0.59      0.49      1661
         PER       0.71      0.52      0.60      1602

   micro avg       0.56      0.60      0.58      5626
   macro avg       0.59      0.59      0.58      5626
weighted avg       0.59      0.60      0.59      5626



8. Save the Prediction

In [None]:
ts, test_labels = load_file("/test.txt")
text=''
for i in range(len(ts)):
  for j in range(len(ts[i])):
    a=''
    if(j>=len(predicted_labels[i])):
      a=ts[i][j]+' O\n'
    else:
      a=ts[i][j]+' '+predicted_labels[i][j]+'\n'
    text=text+a
  text=text+'\n'
#print(text)

In [None]:
with open("/content/drive/MyDrive/nlp/A2/lstm.test.txt", "w") as file:
  file.write(text)