In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import re

In [None]:
# Step 1: Prepare Data (Toy Corpus for Next-Word Prediction)
# Four three-word training sentences; two of them start with the same word ("She").
corpus = [
    "Wang loves Noodle", "I love food",
    "She eats apple", "She likes music",
]
corpus  # bare last expression so the notebook renders the list

['Wang loves Noodle', 'I love food', 'She eats apple', 'She likes music']

In [None]:
# Learn the word -> integer-id vocabulary from the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Word ids start at 1, so one extra slot is added for id 0 (the value that
# pad_sequences fills in later). For this corpus: 11 words + 1 = 12.
vocab_size = 1 + len(tokenizer.word_index)

print(f"Vocabulary Size: {vocab_size}")
print("Word Index:", tokenizer.word_index)

Vocabulary Size: 12
Word Index: {'she': 1, 'wang': 2, 'loves': 3, 'noodle': 4, 'i': 5, 'love': 6, 'food': 7, 'eats': 8, 'apple': 9, 'likes': 10, 'music': 11}


The code below creates **n-gram** sequences (growing prefixes) from each sentence in the corpus, using the word indices assigned by the tokenizer:

[[2, 3],       # "Wang loves"

 [2, 3, 4],    # "Wang loves Noodle"

 [5, 6],       # "I love"

 [5, 6, 7],    # "I love food"

 [1, 8],       # "She eats"

 [1, 8, 9],    # "She eats apple"

 [1, 10],      # "She likes"

 [1, 10, 11]]  # "She likes music"

In [None]:
# Expand every sentence into its growing prefixes (2 tokens up to the full sentence).
# Within each prefix, the final id is the word to predict and the ids before it
# are the context the model will see.
sequences = [
    token_ids[:end]
    for token_ids in tokenizer.texts_to_sequences(corpus)
    for end in range(2, len(token_ids) + 1)
]
sequences

[[2, 3], [2, 3, 4], [5, 6], [5, 6, 7], [1, 8], [1, 8, 9], [1, 10], [1, 10, 11]]

Let’s break down the code below step by step, explain why each operation is used, and clear up the confusion about slicing `y` at the end.

---

## **1. Pad Sequences to Fixed Length**

```python
max_length = max([len(seq) for seq in sequences])
X = pad_sequences(sequences, maxlen=max_length, padding='pre')
```
- **Why pad?**  
  Neural networks require all inputs to have the **same length** for batch processing. Sentences (or n-gram sequences) are often different lengths.
- **What does `padding='pre'` do?**  
  Adds zeros **at the start** of sequences shorter than `max_length`.
- **Result:**  
  `X` is now a 2D array, each row is a sequence of length `max_length`.

---

## **2. Create One-Hot Encoded Targets (y)**

```python
y = to_categorical([seq[-1] for seq in sequences], num_classes=vocab_size)
```
- **What is this doing?**  
  - `seq[-1]` gets the last item in each n-gram sequence: **the next word to predict (the label).**
  - `to_categorical` converts these word indices to **one-hot encoded vectors** of length `vocab_size`.
    - Example: if `seq[-1] == 4` and `vocab_size == 12`, the vector will be `[0, 0, 0, 0, 1, 0, ..., 0]` (with the `1` at index 4, counting from 0).

---

## **3. Remove Last Word from Input for Prediction**

```python
X = X[:, :-1]  # Remove last word from input for prediction
```
- **Why?**  
  - Each n-gram sequence looks like `[w1, w2, ..., wn]`.
  - The task is: **given [w1, w2, ..., wn-1], predict wn**.
  - After padding, the last column is the current target (word to be predicted). So you trim it from the features.
- **Result:**  
  `X` is now only the input words (not including the prediction/last word in each sequence).

---

## **4. About the Targets (y)**

> y = y[:, :-1]  # Adjust target accordingly? Wait, no: target is the last word of each sequence

- **Your Instinct Is Correct:**  
  **DO NOT SLICE your target y** like you did with X.
  - Targets should stay as the one-hot vectors **representing only the next word**.
  - The slicing of X is to ensure that features do not include the word you’re supposed to predict.

---

## **Putting It All Together**

### **Final Shapes and Usage:**

- **X shape:** `(num_sequences, max_length - 1)`  
  Input: all words in a sequence except the last one (padded in front if needed).
- **y shape:** `(num_sequences, vocab_size)`  
  Output: one-hot vector representing the correct next word.

---

## **Diagram**

| Sequence         | After Padding | X (input)  | y (target)      |
|:-----------------|:-------------|:-----------|:----------------|
| [2, 3]           | [0, 2, 3]    | [0, 2]     | one-hot[3]      |
| [2, 3, 4]        | [2, 3, 4]    | [2, 3]     | one-hot[4]      |
| [5, 6]           | [0, 5, 6]    | [0, 5]     | one-hot[6]      |
| ...              | ...          | ...        | ...             |

---

### **Summary**

- **Pad all sequences** to the same length for neural network input.
- **X:** all but last word (padded)  
- **y:** last word as one-hot vector
- **Never slice y** like you do X. Targets should represent only the word you’re trying to predict.

---

In [None]:
# Pad sequences to fixed length (max length here is 3)
max_length = max(len(seq) for seq in sequences)

# Left-pad with zeros so that the word to predict always sits in the last column.
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')

# Input: every column except the last one (the last column is the target word).
X = padded_sequences[:, :-1]

# Target: the last word of each sequence as a one-hot vector over the vocabulary.
# y must NOT be sliced like X: each of its columns is a vocabulary class, so
# dropping one would leave it with vocab_size - 1 columns and break the match
# with the model's Dense(vocab_size) output.
y = to_categorical([seq[-1] for seq in sequences], num_classes=vocab_size)

# Sanity checks on the shapes the model expects.
assert X.shape == (len(sequences), max_length - 1)
assert y.shape == (len(sequences), vocab_size)

In [None]:
# Corrected: X is all but last, y is last word
# Features: left-padded sequences with the final column (the word to predict) dropped.
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
X = padded_sequences[:, :-1]

# Labels: the final word id of every sequence, one-hot encoded over the vocabulary.
next_word_ids = [seq[-1] for seq in sequences]
y = to_categorical(next_word_ids, num_classes=vocab_size)

In [None]:
# Quick look at the array shapes and at the first training example.
first_input, first_target = X[0], y[0]
print(f"X Shape: {X.shape}, y Shape: {y.shape}")
print("Sample X (sequence):", first_input)
print("Sample y (one-hot):", first_target)

X Shape: (8, 2), y Shape: (8, 12)
Sample X (sequence): [0 2]
Sample y (one-hot): [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]


In [None]:
# Step 2: Build Simple LSTM Network
# Fix the random seeds (Python, NumPy and TensorFlow) before any layer is created,
# so that weight initialisation and batch shuffling are repeatable across
# Restart-and-Run-All. NOTE(review): some GPU kernels may still be non-deterministic.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

embedding_dim = 300  # Size of each word vector (same order as common Word2Vec models)
lstm_units = 128     # Hidden/cell state size

In [None]:
# Embedding -> single LSTM -> softmax over the vocabulary.
# `input_length` is intentionally not passed to Embedding: the argument is
# deprecated in Keras 3 and only triggers a warning. The input shape is supplied
# by the explicit `model.build(...)` call that follows.
model = Sequential([
    # Maps each word id (0 .. vocab_size - 1) to a dense vector of size embedding_dim.
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    # return_sequences=False: only the final hidden state is passed on.
    LSTM(units=lstm_units, return_sequences=False),
    # Output: probability over vocabulary
    Dense(vocab_size, activation='softmax'),
])



In [None]:
# Configure training: cross-entropy against the one-hot targets, Adam optimiser.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Build with an explicit input shape (batch dimension left open) so that
# summary() can report output shapes and parameter counts.
context_length = max_length - 1
model.build(input_shape=(None, context_length))
model.summary()

In [None]:
# Step 3: Train the Model
# Small data, so many epochs. The returned History object is kept in `history`
# (per-epoch loss/accuracy are available in `history.history`) instead of being
# echoed as an opaque object repr at the end of the cell output.
history = model.fit(X, y, epochs=100, batch_size=1, verbose=1)

Epoch 1/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.0000e+00 - loss: 2.4903
Epoch 2/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.8664 - loss: 2.3961
Epoch 3/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.9722 - loss: 2.3248
Epoch 4/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9722 - loss: 2.2389
Epoch 5/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.9722 - loss: 2.1249
Epoch 6/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9722 - loss: 1.9674
Epoch 7/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9722 - loss: 1.7487
Epoch 8/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9722 - loss: 1.4558 
Epoch 9/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0

<keras.src.callbacks.history.History at 0x7fac93ddd2e0>

In [None]:
# Step 4: Autocomplete Function (Generate Next Word)
def autocomplete(seed_text, tokenizer, model, max_length, num_words=1):
    """Extend ``seed_text`` with up to ``num_words`` greedily predicted words.

    Args:
        seed_text: Text to continue.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index`` (word -> integer id).
        model: Trained model whose ``predict`` returns one row of
            per-word-id scores for each input sequence.
        max_length: Length of the longest training sequence *including* the
            target word; the model input is therefore ``max_length - 1`` long.
        num_words: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by the predicted words, separated by single
        spaces. Generation stops early when the predicted id has no word in
        the vocabulary (for example the padding id 0), so no empty words or
        trailing spaces are appended.
    """
    # Build the id -> word lookup once instead of scanning word_index per word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    context_length = max_length - 1

    for _ in range(num_words):
        # Tokenize the text generated so far.
        token_ids = tokenizer.texts_to_sequences([seed_text])[0]
        # Left-pad (or truncate) to the fixed input length used in training.
        model_input = pad_sequences([token_ids], maxlen=context_length, padding='pre')
        # Single-sequence batch: take row 0 and pick the highest-scoring id.
        scores = model.predict(model_input, verbose=0)
        predicted_index = int(np.argmax(scores[0]))

        word = index_to_word.get(predicted_index)
        if not word:
            # No vocabulary word for this id; nothing sensible to append.
            break
        # Append and update seed
        seed_text += " " + word
    return seed_text

In [None]:
# Example Usage
# Ask the trained model for the single most likely word after the seed text.
seed = "Wang loves"
predicted = autocomplete(
    seed_text=seed,
    tokenizer=tokenizer,
    model=model,
    max_length=max_length,
)
print(f"Autocomplete for '{seed}': {predicted}")

Autocomplete for 'Wang loves': Wang loves noodle
