In [24]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, LSTM, Lambda
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.utils import to_categorical


In [25]:
# Toy corpus: three short sentences, one string per sentence.
corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs are great pets",
]



In [31]:
# Fit the tokenizer on the corpus and build both lookup directions
# (word -> id and id -> word).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
word2id = tokenizer.word_index
id2word = {v: k for k, v in word2id.items()}

# Tokenizer ids start at 1; id 0 is the value pad_sequences fills with, so the
# embedding table and the softmax layer need one slot more than the number of
# distinct words.
vocab_size = len(word2id) + 1
embed_size = 50    # dimensionality of the learned word vectors
context_size = 2   # number of context words taken on each side of the target


In [32]:
def generate_training_data(corpus, context_size):
    """Build (context ids, target id) pairs for every word of every sentence.

    For each position in a sentence the context holds the ids of up to
    ``context_size`` words on either side (fewer near the sentence edges), in
    left-to-right order; the target is the id of the word at that position.
    Ids are looked up in the module-level ``word2id`` mapping.

    Args:
        corpus: iterable of sentence strings.
        context_size: number of words taken on each side of the target.

    Returns:
        A list of ``(context_word_ids, target_word_id)`` tuples.
    """
    pairs = []
    for sentence in corpus:
        tokens = text_to_word_sequence(sentence)
        n_tokens = len(tokens)
        for position, target_word in enumerate(tokens):
            # Clamp the window to the sentence so edge words get shorter contexts.
            first = max(0, position - context_size)
            stop = min(n_tokens, position + context_size + 1)
            context_ids = [
                word2id[tokens[neighbour]]
                for neighbour in range(first, stop)
                if neighbour != position
            ]
            pairs.append((context_ids, word2id[target_word]))
    return pairs



In [33]:
training_data = generate_training_data(corpus, context_size)

# Split the (context_ids, target_id) pairs into two parallel lists. Separate
# names are used so X / y are only ever bound to the final arrays.
contexts = [context for context, _ in training_data]
targets = [target for _, target in training_data]

# Contexts near a sentence edge are shorter than 2 * context_size; left-pad
# them so every row has the same length.
X = pad_sequences(contexts, maxlen=context_size * 2, padding='pre')
# One-hot encode the target ids; vocab_size sets the width of each row.
y = to_categorical(np.array(targets), num_classes=vocab_size)



In [34]:
# Step 3: Train Model
# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling repeat across Restart & Run All.
tf.keras.utils.set_random_seed(42)

# CBOW-style network: embed each context id, average the embeddings over the
# context axis, then score every vocabulary entry with a softmax layer.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=context_size * 2))
model.add(Lambda(lambda x: tf.reduce_mean(x, axis=1)))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Keep the History object (loss/accuracy per epoch) instead of echoing its repr.
history = model.fit(X, y, epochs=100, verbose=1)



Epoch 1/100




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.0556 - loss: 2.6419
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.0556 - loss: 2.6363
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.2778 - loss: 2.6308
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.3889 - loss: 2.6253
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.4444 - loss: 2.6198
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.4444 - loss: 2.6143
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.4444 - loss: 2.6088
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.4444 - loss: 2.6033
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

<keras.src.callbacks.history.History at 0x7fd50e37dcc0>

In [35]:
# Step 4: Output
def predict_next_word(context):
    """Return the vocabulary word the trained model scores highest for ``context``.

    Words missing from ``word2id`` are dropped; the remaining ids are
    left-padded (or truncated by ``pad_sequences``) to ``context_size * 2``
    entries before being passed to the module-level ``model``.

    Note: the training pairs produced by ``generate_training_data`` use the
    words on both sides of a target, so the result is the word the model
    associates with this context window rather than strictly the following
    word.

    Args:
        context: iterable of word strings.

    Returns:
        The predicted word as a string.
    """
    context_ids = [word2id[word] for word in context if word in word2id]
    context_ids = pad_sequences([context_ids], maxlen=context_size * 2, padding='pre')
    prediction = model.predict(context_ids)
    # Class 0 is the padding slot and has no entry in id2word, so an argmax
    # over all classes could raise KeyError. Only rank ids 1..len(id2word).
    word_scores = prediction[0, 1:len(id2word) + 1]
    predicted_word_id = int(np.argmax(word_scores)) + 1
    return id2word[predicted_word_id]

In [36]:
# Demo: query the trained model with a two-word context and print its top-scoring word.
context_example = ["the", "cat"]
predicted_word = predict_next_word(context_example)
print(f'Given context: {context_example}, Predicted next word: {predicted_word}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
Given context: ['the', 'cat'], Predicted next word: the
