In [24]:

import numpy as np
import nltk
from nltk.corpus import conll2002
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical



nltk.download('conll2002')

[nltk_data] Downloading package conll2002 to
[nltk_data]     C:\Users\salmank\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


True

In [25]:
# Load the NER dataset
sentences = conll2002.iob_sents()

print(sentences[0])
# Prepare the data
X = [[word[0] for word in sent] for sent in sentences]
y = [[label[2] for label in sent] for sent in sentences]


[('Sao', 'NC', 'B-LOC'), ('Paulo', 'VMI', 'I-LOC'), ('(', 'Fpa', 'O'), ('Brasil', 'NC', 'B-LOC'), (')', 'Fpt', 'O'), (',', 'Fc', 'O'), ('23', 'Z', 'O'), ('may', 'NC', 'O'), ('(', 'Fpa', 'O'), ('EFECOM', 'NP', 'B-ORG'), (')', 'Fpt', 'O'), ('.', 'Fp', 'O')]


In [26]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
# Create a mapping from words to integers
words = list(set(word.lower() for sentence in X_train for word in sentence))
words.append("ENDPAD")
n_words = len(words)

word2idx = {w: i for i, w in enumerate(words)}

# Create a mapping from labels to integers
labels = list(set(label for sentence in y_train for label in sentence))
n_labels = len(labels)

label2idx = {l: i for i, l in enumerate(labels)}
label2idx



{'I-LOC': 0,
 'I-ORG': 1,
 'B-PER': 2,
 'O': 3,
 'B-LOC': 4,
 'B-ORG': 5,
 'B-MISC': 6,
 'I-MISC': 7,
 'I-PER': 8}

In [28]:
# Set a maximum sentence length
max_len = 100

# Convert words and labels to integers and pad sequences
X_train = [[word2idx.get(w.lower(), 0) for w in s] for s in X_train]
X_test = [[word2idx.get(w.lower(), 0) for w in s] for s in X_test]
y_train = [[label2idx[l] for l in s] for s in y_train]
y_test = [[label2idx[l] for l in s] for s in y_test]


In [34]:
X_train = pad_sequences(X_train, maxlen=max_len, padding="post")
X_test = pad_sequences(X_test, maxlen=max_len, padding="post")
y_train = pad_sequences(y_train, maxlen=max_len, padding="post")
y_test = pad_sequences(y_test, maxlen=max_len, padding="post")


# One-hot encode the labels
y_train = [to_categorical(i, num_classes=n_labels) for i in y_train]
y_test = [to_categorical(i, num_classes=n_labels) for i in y_test]


In [35]:

print(X_train[5])
print(y_train[5])

[29108 12561  5725 23548 32931 15478 39268  7687 21367 12561 35037 27651
 21745 38883 21367 52538 21367  6377 24343 37353 46937 23548 13229 31832
 12054 16511 14128  6661 21367 12561 52341 10209  6377 24343  8608 46937
 24030 12561 25609 24030 39940 46687  4340     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
[[0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]


In [36]:
# Build the LSTM model
model = Sequential([Embedding(input_dim=n_words, output_dim=50, input_length=max_len),
                    Bidirectional(LSTM(100, return_sequences=True)),
                    TimeDistributed(Dense(n_labels, activation='softmax'))])
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, np.array(y_train), batch_size=32, epochs=5, validation_split=0.1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, np.array(y_test))
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 0.0810, Test Accuracy: 0.9828
