Named Entity Recognition (NER)

In [10]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# --------------------------------------------
# Step 1: Synthetic Dataset
# --------------------------------------------
sentences = [
    ["John", "works", "at", "Google", "in", "New", "York", "."],
    ["Microsoft", "is", "based", "in", "Seattle", ",", "Washington", "."],
    ["Elon", "Musk", "founded", "SpaceX", "and", "Tesla", "."],
    ["Berlin", "is", "the", "capital", "of", "Germany", "."]
]

original_tags = [
    ["B-PER", "O", "O", "B-ORG", "O", "B-LOC", "I-LOC", "O"],
    ["B-ORG", "O", "O", "O", "B-LOC", "O", "B-LOC", "O"],
    ["B-PER", "I-PER", "O", "B-ORG", "O", "B-ORG", "O"],
    ["B-LOC", "O", "O", "O", "O", "B-LOC", "O"]
]

# --------------------------------------------
# Step 2: Preprocessing
# --------------------------------------------
# Create word-to-index mapping
words = list(set(word for sentence in sentences for word in sentence))
words.append("<PAD>")  # Padding token
words.append("<UNK>")  # Unknown token
word_to_idx = {word: idx for idx, word in enumerate(words)}

# Create tag-to-index mapping
unique_tags = list(set(tag for tags in original_tags for tag in tags))
unique_tags.append("<PAD>")  # Padding tag
tag_to_idx = {tag: idx for idx, tag in enumerate(unique_tags)}

# Convert sentences and tags to indices
X = [[word_to_idx.get(word, word_to_idx["<UNK>"]) for word in sentence] for sentence in sentences]
y = [[tag_to_idx[tag] for tag in tags] for tags in original_tags]

# Pad sequences
max_seq_length = 10
X_padded = pad_sequences(X, maxlen=max_seq_length, padding="post", value=word_to_idx["<PAD>"])
y_padded = pad_sequences(y, maxlen=max_seq_length, padding="post", value=tag_to_idx["<PAD>"])

# Reshape y for training
y_padded = y_padded.reshape(*y_padded.shape, 1)

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_padded, test_size=0.2)

# --------------------------------------------
# Step 3: Build LSTM Model
# --------------------------------------------
model = models.Sequential([
    layers.Embedding(
        input_dim=len(word_to_idx), 
        output_dim=64, 
        input_length=max_seq_length
    ),
    layers.Bidirectional(layers.LSTM(128, return_sequences=True)),
    layers.Dropout(0.3),
    layers.TimeDistributed(layers.Dense(len(unique_tags), activation="softmax"))
])

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

# --------------------------------------------
# Step 4: Train the Model
# --------------------------------------------
model.fit(
    X_train, y_train,
    epochs=30,
    batch_size=2,
    validation_data=(X_test, y_test)
)

# --------------------------------------------
# Step 5: Predict on New Sentences
# --------------------------------------------
def predict_entities(sentence):
    # Convert sentence to indices
    sentence_indices = [word_to_idx.get(word, word_to_idx["<UNK>"]) for word in sentence]
    padded_sentence = pad_sequences(
        [sentence_indices], 
        maxlen=max_seq_length, 
        padding="post", 
        value=word_to_idx["<PAD>"]
    )
    # Predict tags
    predictions = model.predict(padded_sentence)
    predicted_indices = np.argmax(predictions, axis=-1)[0]
    # Map indices to tags
    idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}
    predicted_tags = [idx_to_tag[idx] for idx in predicted_indices if idx != tag_to_idx["<PAD>"]]
    return predicted_tags[:len(sentence)]  # Remove padding


Epoch 1/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 459ms/step - accuracy: 0.2111 - loss: 1.9431 - val_accuracy: 0.3000 - val_loss: 1.9352
Epoch 2/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.4833 - loss: 1.9165 - val_accuracy: 0.4000 - val_loss: 1.9191
Epoch 3/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - accuracy: 0.5444 - loss: 1.8901 - val_accuracy: 0.4000 - val_loss: 1.8998
Epoch 4/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - accuracy: 0.6000 - loss: 1.8543 - val_accuracy: 0.4000 - val_loss: 1.8756
Epoch 5/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.5611 - loss: 1.8102 - val_accuracy: 0.5000 - val_loss: 1.8448
Epoch 6/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.6611 - loss: 1.7481 - val_accuracy: 0.5000 - val_loss: 1.8067
Epoch 7/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━

In [11]:

# Test Example
test_sentence = ["Tim", "Cook", "is", "CEO", "of", "Apple", "in", "California", "."]
predicted_tags = predict_entities(test_sentence)
print("Predicted Tags:", predicted_tags)
# Output Example: ["B-PER", "I-PER", "O", "O", "O", "B-ORG", "O", "B-LOC", "O"]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 277ms/step
Predicted Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O']


In [1]:
sentences = [
    ["John", "works", "at", "Google", "in", "New", "York"],
    ["Apple", "is", "based", "in", "Cupertino", ",", "California"],
    ["Elon", "Musk", "leads", "Tesla", "and", "SpaceX"]
]

tags = [
    ["B-PER", "O", "O", "B-ORG", "O", "B-LOC", "I-LOC"],
    ["B-ORG", "O", "O", "O", "B-LOC", "O", "B-LOC"],
    ["B-PER", "I-PER", "O", "B-ORG", "O", "B-ORG"]
]

In [2]:
# Create word-to-index and tag-to-index mappings
words = list(set(word for sentence in sentences for word in sentence))
tags = list(set(tag for sentence_tags in tags for tag in sentence_tags))

word_to_idx = {word: idx+1 for idx, word in enumerate(words)}  # +1 for padding
tag_to_idx = {tag: idx for idx, tag in enumerate(tags)}

# Add padding token
word_to_idx["<PAD>"] = 0
tag_to_idx["<PAD>"] = 0

In [5]:
sentences = [
    ["John", "works", "at", "Google", "in", "New", "York"],
    ["Apple", "is", "based", "in", "Cupertino", ",", "California"],
    ["Elon", "Musk", "leads", "Tesla", "and", "SpaceX"]
]

original_tag_sequences = [
    ["B-PER", "O", "O", "B-ORG", "O", "B-LOC", "I-LOC"],
    ["B-ORG", "O", "O", "O", "B-LOC", "O", "B-LOC"],
    ["B-PER", "I-PER", "O", "B-ORG", "O", "B-ORG"]
]

# 1. Create word-to-index mapping
words = list(set(word for sentence in sentences for word in sentence))
word_to_idx = {word: idx+1 for idx, word in enumerate(words)}
word_to_idx["<PAD>"] = 0

# 2. Create tag-to-index mapping (use a new variable `unique_tags`)
unique_tags = list(set(tag for sentence_tags in original_tag_sequences for tag in sentence_tags))
tag_to_idx = {tag: idx for idx, tag in enumerate(unique_tags)}
tag_to_idx["<PAD>"] = 0

# 3. Convert sentences and tags to indices
X = [[word_to_idx[word] for word in sentence] for sentence in sentences]
y = [[tag_to_idx[tag] for tag in sentence_tags] for sentence_tags in original_tag_sequences]

# 4. Pad sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_padded = pad_sequences(X, maxlen=10, padding="post", value=word_to_idx["<PAD>"])
y_padded = pad_sequences(y, maxlen=10, padding="post", value=tag_to_idx["<PAD>"])

In [7]:
print(y_padded[1])  # Output: [1, 5, 5, 2, 5, 3, 4, 0, 0, 0]
# Corresponding to: [B-PER, O, O, B-ORG, O, B-LOC, I-LOC, <PAD>, <PAD>, <PAD>]

[2 5 5 5 4 5 4 0 0 0]
