In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint
import gc

# Colab-specific memory management
def reduce_memory_usage():
    gc.collect()
    tf.keras.backend.clear_session()

# Download and preprocess dataset
def download_dataset():
    print("Downloading dataset...")
    !wget -O dataset.txt https://raw.githubusercontent.com/dscape/spell/master/test/resources/big.txt
    return 'dataset.txt'

# Read and preprocess text
def preprocess_text(file_path, max_words=10000):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read().lower()
        text = ''.join([char for char in text if char.isalpha() or char.isspace()])

    # Limit text to prevent memory issues
    words = text.split()[:max_words]
    return ' '.join(words)

# Memory-efficient sequence generation
def generate_sequences(tokenizer, text, max_sequence_len=20, step=3):
    token_list = tokenizer.texts_to_sequences([text])[0]
    sequences = []
    next_words = []

    for i in range(0, len(token_list) - max_sequence_len, step):
        sequence = token_list[i:i + max_sequence_len]
        sequences.append(sequence[:-1])
        next_words.append(sequence[-1])

    return sequences, next_words

# Colab-friendly model with reduced complexity
def create_model(vocab_size, max_sequence_len):
    model = Sequential([
        Embedding(vocab_size, 64, input_length=max_sequence_len-1),
        LSTM(128, return_sequences=False),
        Dropout(0.2),
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(vocab_size, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

# Main training function
def train_next_word_predictor():
    # Reduce memory usage at start
    reduce_memory_usage()

    # Download and preprocess dataset
    dataset_path = download_dataset()
    text = preprocess_text(dataset_path)

    # Tokenization
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([text])
    vocab_size = len(tokenizer.word_index) + 1

    # Sequence generation
    max_sequence_len = 20
    X_seq, y_seq = generate_sequences(tokenizer, text, max_sequence_len)

    # Convert to numpy arrays
    X = np.array(X_seq)
    y = np.array(y_seq)

    # Create model
    model = create_model(vocab_size, max_sequence_len)

    # Checkpoint to save best model
    checkpoint_path = "/content/best_model.keras"
    checkpoint = ModelCheckpoint(
        checkpoint_path,
        monitor='loss',
        save_best_only=True,
        verbose=1
    )

    # Training with memory-efficient approach
    try:
        history = model.fit(
            X, y,
            epochs=10,
            batch_size=64,
            validation_split=0.2,
            callbacks=[checkpoint],
            verbose=1
        )

        # Save final model
        model.save("/content/final_next_word_model.keras")

        return model, tokenizer, vocab_size

    except Exception as e:
        print(f"Training error: {e}")
        return None, None, None
    finally:
        reduce_memory_usage()

# Prediction function
def predict_next_word(seed_text, model, tokenizer, vocab_size, max_sequence_len=20):
    # Tokenize and pad input sequence
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

    # Predict next word
    predicted = model.predict(token_list)
    predicted_word_index = np.argmax(predicted, axis=-1)[0]

    # Convert index back to word
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word

    return "Unable to predict"

# Main execution
def main():
    # Train the model
    print("Starting model training...")
    model, tokenizer, vocab_size = train_next_word_predictor()

    if model and tokenizer:
        # Test prediction
        test_seeds = [
            "the quick brown",
            "machine learning is",
            "artificial intelligence will"
        ]

        print("\nPrediction Examples:")
        for seed in test_seeds:
            predicted_word = predict_next_word(seed, model, tokenizer, vocab_size)
            print(f"Seed: {seed} | Predicted Next Word: {predicted_word}")

# Run the main function
if __name__ == "__main__":
    main()

Starting model training...
Downloading dataset...
--2024-12-15 14:10:06--  https://raw.githubusercontent.com/dscape/spell/master/test/resources/big.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6488666 (6.2M) [text/plain]
Saving to: ‘dataset.txt’


2024-12-15 14:10:06 (227 MB/s) - ‘dataset.txt’ saved [6488666/6488666]

Epoch 1/10
[1m41/42[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 40ms/step - accuracy: 0.0379 - loss: 7.4988
Epoch 1: loss improved from inf to 7.08158, saving model to /content/best_model.keras
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 52ms/step - accuracy: 0.0381 - loss: 7.4794 - val_accuracy: 0.0240 - val_loss: 6.3911
Epoch 2/10
[1m41/42[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 40ms/step -