In [None]:
# Step 1: Install and Import Libraries
!pip install tensorflow datasets numpy

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.metrics import categorical_accuracy
import pickle
from datasets import load_dataset

# Step 2: Load the WikiText Dataset
dataset = load_dataset('wikitext', 'wikitext-103-raw-v1')

# Step 3: Limit the dataset size
# Only the first 1000 lines of the training split are used. The rows are
# sliced *before* the 'text' column is selected so that only those 1000 rows
# are materialised as Python strings instead of the entire training column.
data = ' '.join(dataset['train'][:1000]['text'])

# Optional: cap the corpus by character count instead, e.g. data = data[:5000]

# Step 4: Preprocess the Data
# Replace newlines, carriage returns and the byte-order mark with spaces so
# the corpus becomes one continuous stream of words.
data = data.replace('\n', ' ').replace('\r', ' ').replace('\ufeff', ' ')

# Step 5: Tokenize the Text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Save the tokenizer for later use in predictions. The context manager makes
# sure the file is flushed and closed as soon as the dump finishes.
with open('token.pkl', 'wb') as token_file:
    pickle.dump(tokenizer, token_file)

# Print the size of the vocabulary (number of unique words)
vocab_size = len(tokenizer.word_index) + 1  # +1 to account for padding
print(f"Vocabulary size: {vocab_size}")

# Step 6: Prepare Sequences of Words
# Encode the whole corpus as a single flat list of integer token ids.
sequence_data = tokenizer.texts_to_sequences([data])[0]

# Slide a 4-token window across the corpus, one position at a time:
# the first 3 tokens of each window are the context, the 4th is the target.
sequences = np.array(
    [sequence_data[end - 3:end + 1] for end in range(3, len(sequence_data))]
)

# Columns 0-2 are the model input (X); column 3 is the next word (y),
# one-hot encoded over the vocabulary.
X = sequences[:, :3]
y = to_categorical(sequences[:, 3], num_classes=vocab_size)

# Step 7: Build the LSTM Model
# Layer stack: 50-dimensional embedding over the 3 input tokens -> 100-unit
# LSTM returning only its final output -> 100-unit ReLU layer -> softmax over
# the vocabulary.
model = Sequential([
    Embedding(vocab_size, 50, input_length=3),
    LSTM(100, return_sequences=False),
    Dense(100, activation='relu'),
    Dense(vocab_size, activation='softmax'),
])

# Step 8: Compile the Model
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=[categorical_accuracy],
)

# Step 9: Set Up Model Checkpoint (Save Best Model)
# The checkpoint watches the training loss and only overwrites the file when
# that loss improves.
checkpoint = ModelCheckpoint(
    "next_word.h5",
    monitor='loss',
    verbose=1,
    save_best_only=True,
)

# Step 10: Train the Model
# Train the model for 10 epochs with a batch size of 64.
history = model.fit(
    X,
    y,
    epochs=10,
    batch_size=64,
    callbacks=[checkpoint],
)

# Step 11: Evaluate the Model
# The evaluation reuses the training arrays, so the reported accuracy is
# training accuracy, not a held-out score.
loss, accuracy = model.evaluate(X, y, verbose=1)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Step 12: Predict the Next Word
def predict_next_word(model, tokenizer, text):
    """Return the word the model considers most likely to follow ``text``.

    Args:
        model: Trained model. ``model.predict`` is called on an integer array
            of shape ``(1, n_tokens)`` and its output is arg-maxed over the
            last axis.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and the
            ``index_word`` reverse mapping.
        text: The context to continue: either a string or a list of words
            (the interactive loop in this file passes a list). It is handed
            to ``tokenizer.texts_to_sequences`` unchanged.

    Returns:
        The predicted word, or ``None`` when the predicted index has no entry
        in the tokenizer vocabulary (for example index 0, which is reserved
        and not assigned to any word).

    Raises:
        ValueError: If no word in ``text`` maps to a token id, so there is
            nothing to feed to the model.
    """
    # texts_to_sequences takes a batch; we send one text and unwrap the result.
    token_ids = tokenizer.texts_to_sequences([text])[0]
    if not token_ids:
        raise ValueError(
            "none of the input words are in the tokenizer vocabulary"
        )

    batch = np.array([token_ids])  # shape (1, n_tokens)
    probabilities = model.predict(batch)

    # Reduce the (1,)-shaped argmax result to a plain int so the dictionary
    # lookup below does not depend on NumPy array truthiness.
    pred_index = int(np.argmax(probabilities, axis=-1)[0])

    # index_word is the tokenizer's index -> word map; a direct lookup
    # replaces a linear scan over word_index.
    return tokenizer.index_word.get(pred_index)

# Example: Predict the next word after the input "the cat sat"
text = 'the cat sat'
predicted_word = predict_next_word(model, tokenizer, text)
print(f"The predicted next word is: {predicted_word}")

# Step 13: Interactive Prediction
# Keep prompting until the user enters "0". Each line is reduced to its last
# 3 words, matching the 3-token context the model was trained on.
while True:
    text = input("Enter your line (or type '0' to exit): ")

    if text.strip() == "0":
        print("Execution completed....")
        break

    # split() with no argument collapses runs of whitespace, so repeated or
    # trailing spaces cannot produce empty "words" that would push real
    # context words out of the last-3 window.
    words = text.split()[-3:]
    if not words:
        print("Please enter at least one word.")
        continue

    try:
        predicted_word = predict_next_word(model, tokenizer, words)
    except Exception as e:
        # Top-level boundary of the interactive session: report the problem
        # and keep prompting instead of ending the loop.
        print("Error occurred:", e)
    else:
        print(f"The predicted next word is: {predicted_word}")


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Downloading aiohappyeyeballs-2.4.4-py3-none-any.whl.metadata (6.1 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets)
  Downloading aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Vocabulary size: 7641
Epoch 1/10
Epoch 1: loss improved from inf to 7.36040, saving model to next_word.h5
Epoch 2/10
  3/722 [..............................] - ETA: 34s - loss: 6.7815 - categorical_accuracy: 0.0625

  saving_api.save_model(


Epoch 2: loss improved from 7.36040 to 6.91552, saving model to next_word.h5
Epoch 3/10
Epoch 3: loss improved from 6.91552 to 6.69008, saving model to next_word.h5
Epoch 4/10
Epoch 4: loss improved from 6.69008 to 6.37472, saving model to next_word.h5
Epoch 5/10
Epoch 5: loss improved from 6.37472 to 6.11167, saving model to next_word.h5
Epoch 6/10
Epoch 6: loss improved from 6.11167 to 5.89449, saving model to next_word.h5
Epoch 7/10
Epoch 7: loss improved from 5.89449 to 5.68574, saving model to next_word.h5
Epoch 8/10
Epoch 8: loss improved from 5.68574 to 5.47865, saving model to next_word.h5
Epoch 9/10
Epoch 9: loss improved from 5.47865 to 5.25309, saving model to next_word.h5
Epoch 10/10
Epoch 10: loss improved from 5.25309 to 5.00804, saving model to next_word.h5
Model Accuracy: 20.59%
The predicted next word is: of
Enter your line (or type '0' to exit): King was
The predicted next word is: not
Enter your line (or type '0' to exit): you were
The predicted next word is: a
