In [32]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split


In [33]:
# --- 1. Create a Simple Synthetic Dataset ---
# Input sentences and corresponding tags (O for other, ACTION for action word)
sentences = [
    "I am walking home",
    "She is reading a book",
    "They are playing soccer",
    "We like to eat pizza",
    "He loves to sing",
    "Birds are flying high",
    "Fish can swim fast",
    "Dogs love to run",
    "Cats like to sleep",
    "The sun is shining"
]

In [34]:
# Corresponding tag sequences (one tag per word)
tags = [
    ["O", "O", "ACTION", "O"],
    ["O", "O", "ACTION", "O", "O"],
    ["O", "O", "ACTION", "O"],
    ["O", "O", "O", "ACTION", "O"],
    ["O", "ACTION", "O", "ACTION"],
    ["O", "O", "ACTION", "O"],
    ["O", "O", "ACTION", "O", "O"],
    ["O", "ACTION", "O", "ACTION"],
    ["O", "ACTION", "O", "ACTION"],
    ["O", "O", "O", "ACTION"]
]

In [35]:
# --- 2. Preprocess Data: Tokenization and Padding ---

# Tokenize words in sentences
word_tokenizer = Tokenizer(oov_token="<OOV>") # Handle out-of-vocabulary words
word_tokenizer.fit_on_texts(sentences)
word_sequences = word_tokenizer.texts_to_sequences(sentences)


In [36]:
word_tokenizer, word_sequences

(<keras.src.legacy.preprocessing.text.Tokenizer at 0x7c337e5a5490>,
 [[6, 7, 8, 9],
  [10, 3, 11, 12, 13],
  [14, 4, 15, 16],
  [17, 5, 2, 18, 19],
  [20, 21, 2, 22],
  [23, 4, 24, 25],
  [26, 27, 28, 29],
  [30, 31, 2, 32],
  [33, 5, 2, 34],
  [35, 36, 3, 37]])

In [37]:
# Tokenize tags
tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(tags)
tag_sequences = tag_tokenizer.texts_to_sequences(tags)

In [38]:
tag_sequences

[[1, 1, 2, 1],
 [1, 1, 2, 1, 1],
 [1, 1, 2, 1],
 [1, 1, 1, 2, 1],
 [1, 2, 1, 2],
 [1, 1, 2, 1],
 [1, 1, 2, 1, 1],
 [1, 2, 1, 2],
 [1, 2, 1, 2],
 [1, 1, 1, 2]]

In [39]:
word_tokenizer.word_index

{'<OOV>': 1,
 'to': 2,
 'is': 3,
 'are': 4,
 'like': 5,
 'i': 6,
 'am': 7,
 'walking': 8,
 'home': 9,
 'she': 10,
 'reading': 11,
 'a': 12,
 'book': 13,
 'they': 14,
 'playing': 15,
 'soccer': 16,
 'we': 17,
 'eat': 18,
 'pizza': 19,
 'he': 20,
 'loves': 21,
 'sing': 22,
 'birds': 23,
 'flying': 24,
 'high': 25,
 'fish': 26,
 'can': 27,
 'swim': 28,
 'fast': 29,
 'dogs': 30,
 'love': 31,
 'run': 32,
 'cats': 33,
 'sleep': 34,
 'the': 35,
 'sun': 36,
 'shining': 37}

In [40]:
# Get vocabulary sizes and index mappings
word_vocab_size = len(word_tokenizer.word_index) + 1 # +1 for padding token 0
tag_vocab_size = len(tag_tokenizer.word_index) + 1   # +1 for padding token 0


In [41]:
word_index = word_tokenizer.word_index
tag_index = tag_tokenizer.word_index
index_to_tag = {v: k for k, v in tag_index.items()} # Mapping back from index to tag name


In [42]:
word_index

{'<OOV>': 1,
 'to': 2,
 'is': 3,
 'are': 4,
 'like': 5,
 'i': 6,
 'am': 7,
 'walking': 8,
 'home': 9,
 'she': 10,
 'reading': 11,
 'a': 12,
 'book': 13,
 'they': 14,
 'playing': 15,
 'soccer': 16,
 'we': 17,
 'eat': 18,
 'pizza': 19,
 'he': 20,
 'loves': 21,
 'sing': 22,
 'birds': 23,
 'flying': 24,
 'high': 25,
 'fish': 26,
 'can': 27,
 'swim': 28,
 'fast': 29,
 'dogs': 30,
 'love': 31,
 'run': 32,
 'cats': 33,
 'sleep': 34,
 'the': 35,
 'sun': 36,
 'shining': 37}

In [43]:
tag_index

{'o': 1, 'action': 2}

In [44]:
print(f"Word vocabulary size: {word_vocab_size}")
print(f"Tag vocabulary size: {tag_vocab_size}")
print(f"Word index: {word_index}")
print(f"Tag index: {tag_index}")


Word vocabulary size: 38
Tag vocabulary size: 3
Word index: {'<OOV>': 1, 'to': 2, 'is': 3, 'are': 4, 'like': 5, 'i': 6, 'am': 7, 'walking': 8, 'home': 9, 'she': 10, 'reading': 11, 'a': 12, 'book': 13, 'they': 14, 'playing': 15, 'soccer': 16, 'we': 17, 'eat': 18, 'pizza': 19, 'he': 20, 'loves': 21, 'sing': 22, 'birds': 23, 'flying': 24, 'high': 25, 'fish': 26, 'can': 27, 'swim': 28, 'fast': 29, 'dogs': 30, 'love': 31, 'run': 32, 'cats': 33, 'sleep': 34, 'the': 35, 'sun': 36, 'shining': 37}
Tag index: {'o': 1, 'action': 2}


In [45]:
# Determine max sequence length for padding
max_len = max(len(seq) for seq in word_sequences)
print(f"Max sequence length: {max_len}")

Max sequence length: 5


In [46]:
# Pad sequences (both input words and output tags)
X = pad_sequences(word_sequences, maxlen=max_len, padding='post') # Pad with 0 after the sequence
y = pad_sequences(tag_sequences, maxlen=max_len, padding='post') # Pad target sequences too


In [47]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(f"Padded X_train shape: {X_train.shape}")
print(f"Padded y_train shape: {y_train.shape}")

Padded X_train shape: (7, 5)
Padded y_train shape: (7, 5)


In [48]:
# --- 3. Build the LSTM Model for Sequence Tagging ---

embedding_dim = 16 # Size of the word embedding vectors
lstm_units = 32    # Number of units in the LSTM layer

model = Sequential()
# Embedding layer: Maps word indices to dense vectors
model.add(Embedding(input_dim=word_vocab_size, output_dim=embedding_dim, input_length=max_len))

# LSTM layer: Processes the sequence, returning a sequence of hidden states
# return_sequences=True is essential for sequence tagging tasks
model.add(LSTM(units=lstm_units, return_sequences=True))

# TimeDistributed Dense layer: Applies a Dense layer independently to each time step
# This allows the model to predict a tag for each word in the sequence
model.add(TimeDistributed(Dense(units=tag_vocab_size, activation='softmax'))) # Output probability distribution over tags for each step

# Print the model summary
model.summary()




In [49]:
# --- 4. Compile the Model ---

# Use sparse_categorical_crossentropy because our targets (y) are integer indices
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


In [50]:
# --- 5. Train the Model ---

print("\nTraining model...")
batch_size = 4 # Small batch size for this tiny dataset
epochs = 50  # Train for more epochs due to small data

history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_split=0.2, # Use 20% of training data for validation
                    verbose=0) # Suppress verbose output for tiny data

print("Training finished.")


Training model...
Training finished.


In [51]:
# --- 6. Evaluate the Model ---

print("\nEvaluating model...")
# Note: Evaluation on padded sequences might not be perfectly representative
# if accuracy includes predictions on padding tokens. A masked loss/metric is better
# for real tasks, but sparse_categorical_crossentropy often handles padding=0 implicitly.
loss, accuracy = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=0)

print(f"\nTest Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}") # This is per-token accuracy



Evaluating model...

Test Loss: 0.8136
Test Accuracy: 0.6667


In [52]:
# --- 7. Make and Display a Prediction ---

print("\nExample Prediction:")
test_sentence_raw = "Fish are swimming fast"
test_sequence = word_tokenizer.texts_to_sequences([test_sentence_raw])
test_sequence_padded = pad_sequences(test_sequence, maxlen=max_len, padding='post')



Example Prediction:


In [53]:

# Predict probabilities for each tag at each time step
predictions = model.predict(test_sequence_padded) # Shape: (1, max_len, tag_vocab_size)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 376ms/step


In [54]:
# Get the index of the tag with the highest probability for each word
predicted_tags_indices = np.argmax(predictions, axis=-1) # Shape: (1, max_len)

In [55]:
# Convert indices back to tag names, ignoring padding
predicted_tags = [index_to_tag[idx] for idx in predicted_tags_indices[0] if idx != 0] # Exclude padding tag index 0

In [56]:
print(f"Original sentence: {test_sentence_raw}")
# Split the raw sentence into words to pair with tags (handle potential tokenizer differences)
original_words = test_sentence_raw.split()
# Only print tags for the actual words, up to the length of the original sentence
print(f"Predicted tags:    {predicted_tags[:len(original_words)]}")

Original sentence: Fish are swimming fast
Predicted tags:    ['o', 'o', 'o', 'o']
