In [1]:
import nltk
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Bidirectional, Dense, Embedding, Input, LSTM, TimeDistributed
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical


In [2]:
# Download the treebank corpus through NLTK, then expose its reader.
nltk.download('treebank')
from nltk.corpus import treebank

# POS-tagged sentences; each sentence is a sequence of (word, tag) pairs.
# The slice keeps at most the first 5000 sentences.
# NOTE(review): the training log (98 batches of 32 on an 80% split) suggests
# the corpus holds fewer than 5000 sentences, so this likely keeps all of it.
sentences = treebank.tagged_sents()[:5000]

# Split every tagged sentence into two parallel lists: its words and its tags.
X_sentences = [[token for token, _pos in tagged] for tagged in sentences]
y_tags = [[pos for _token, pos in tagged] for tagged in sentences]


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


In [3]:
# Word vocabulary: lower-cased, with a dedicated 'OOV' entry so that words not
# seen during fitting still map to an id.
word_tokenizer = Tokenizer(oov_token='OOV', lower=True)
word_tokenizer.fit_on_texts(X_sentences)
# Tokenizer ids start at 1; the extra slot accounts for id 0, used for padding.
vocab_size = 1 + len(word_tokenizer.word_index)
X_seq = word_tokenizer.texts_to_sequences(X_sentences)

# Tag vocabulary: case is preserved so tag names come back exactly as given.
tag_tokenizer = Tokenizer(lower=False)
tag_tokenizer.fit_on_texts(y_tags)
# Same +1 convention as above: class 0 is the padding label.
num_tags = 1 + len(tag_tokenizer.word_index)
y_seq = tag_tokenizer.texts_to_sequences(y_tags)


In [4]:
# Pad every sequence (on the right) up to the length of the longest sentence.
# NOTE: this cell rebinds X_seq / y_seq, so it is not safe to run twice in a
# row without re-running the tokenizer cell first.
max_len = max(map(len, X_seq))
X_seq = pad_sequences(X_seq, padding='post', maxlen=max_len)
y_seq = pad_sequences(y_seq, padding='post', maxlen=max_len)

# One-hot encode the tag ids row by row, then stack the rows into one array.
one_hot_rows = [to_categorical(tag_row, num_classes=num_tags) for tag_row in y_seq]
y_seq = np.array(one_hot_rows)


In [5]:
# Hold out 20% of the sentences; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_seq,
    y_seq,
    random_state=42,
    test_size=0.2,
)


In [6]:
# Sequence tagger: token ids -> 128-d embeddings -> bidirectional LSTM ->
# per-timestep softmax over the tag classes.
model = Sequential([
    # Declare the input shape with an explicit Input layer. Passing
    # `input_shape=` to Embedding is what produced the Keras warning emitted
    # from `super().__init__(**kwargs)` when this cell ran.
    Input(shape=(max_len,)),
    # NOTE(review): padded positions (id 0) are not masked here, so they appear
    # to count toward the loss and the reported accuracy, which likely inflates
    # it. Consider `mask_zero=True` after verifying the effect on metrics.
    Embedding(input_dim=vocab_size, output_dim=128),
    # return_sequences=True keeps one output vector per token.
    Bidirectional(LSTM(128, return_sequences=True)),
    TimeDistributed(Dense(num_tags, activation='softmax')),
])

# Targets are one-hot vectors per token, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

  super().__init__(**kwargs)


In [7]:
# Train on the training split only; verbose=2 prints one summary line per epoch.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    verbose=2,
)


Epoch 1/10
98/98 - 25s - 254ms/step - accuracy: 0.9117 - loss: 0.4053
Epoch 2/10
98/98 - 10s - 98ms/step - accuracy: 0.9516 - loss: 0.1901
Epoch 3/10
98/98 - 8s - 77ms/step - accuracy: 0.9750 - loss: 0.1001
Epoch 4/10
98/98 - 7s - 70ms/step - accuracy: 0.9888 - loss: 0.0512
Epoch 5/10
98/98 - 11s - 108ms/step - accuracy: 0.9939 - loss: 0.0294
Epoch 6/10
98/98 - 7s - 75ms/step - accuracy: 0.9961 - loss: 0.0189
Epoch 7/10
98/98 - 10s - 100ms/step - accuracy: 0.9972 - loss: 0.0134
Epoch 8/10
98/98 - 7s - 72ms/step - accuracy: 0.9978 - loss: 0.0102
Epoch 9/10
98/98 - 7s - 72ms/step - accuracy: 0.9981 - loss: 0.0082
Epoch 10/10
98/98 - 7s - 69ms/step - accuracy: 0.9985 - loss: 0.0068


<keras.src.callbacks.history.History at 0x7edc39e40380>

In [9]:
def predict_pos(sentence):
    """Tag every token of ``sentence`` with its most likely POS tag.

    Relies on the notebook-level ``word_tokenizer``, ``tag_tokenizer``,
    ``model`` and ``max_len`` defined in earlier cells.

    Args:
        sentence: Sequence of word tokens, e.g. ``["The", "market"]``. A plain
            string is also accepted and is split on whitespace.

    Returns:
        A list of ``(token, tag)`` tuples, one per input token, in input
        order. An empty input yields an empty list.
    """
    tokens = sentence.split() if isinstance(sentence, str) else list(sentence)
    if not tokens:
        return []

    # pad_sequences truncates from the front by default, which would pair the
    # predicted tags with the wrong tokens (and drop the tail) for inputs longer
    # than max_len. Tag such inputs in consecutive windows of max_len tokens.
    windows = [tokens[start:start + max_len] for start in range(0, len(tokens), max_len)]
    seqs = word_tokenizer.texts_to_sequences(windows)
    seqs = pad_sequences(seqs, maxlen=max_len, padding='post')
    pred = model.predict(seqs)

    pred_tags = []
    for window, window_pred in zip(windows, pred):
        # Only the first len(window) timesteps correspond to real tokens.
        for p in window_pred[:len(window)]:
            # Class 0 is the padding label and has no entry in index_word, so
            # choose the best real tag (ids start at 1) instead of risking a
            # KeyError when the model happens to prefer class 0.
            tag_id = int(np.argmax(p[1:])) + 1
            pred_tags.append(tag_tokenizer.index_word[tag_id])
    return list(zip(tokens, pred_tags))


In [10]:
# Smoke-test the tagger on a short, already tokenized sentence.
test_sentence = ["The", "market", "is", "growing"]
result = predict_pos(test_sentence)
# A single print with a newline separator emits the same two output lines.
print("Predicted POS tags:", result, sep="\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Predicted POS tags:
[('The', 'DT'), ('market', 'NN'), ('is', 'VBZ'), ('growing', 'VBG')]
