In [None]:
import numpy as np
import tensorflow as tf
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.layers import LSTM, Embedding, Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [None]:
# Read the text file
with open('input.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# One training document per line. Blank lines (including the empty string that
# split('\n') yields after a trailing newline) carry no tokens, so they are
# dropped instead of being passed on as empty documents.
corpus = [line for line in text.split('\n') if line.strip()]

In [None]:
# Data Generator class
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that turns texts into next-token training pairs.

    Each text is encoded with ``vectorizer``. Every proper prefix of the
    encoded sequence becomes one input row and the element immediately after
    that prefix becomes its target. Input rows are pre-padded / truncated to
    ``seq_length`` with ``pad_sequences``.

    NOTE(review): if ``vectorizer`` pads its output to a fixed length (e.g.
    ``TextVectorization`` with ``output_sequence_length``), the padding ids are
    also used as prefixes and targets here - confirm whether they should be
    stripped before building the pairs.

    Args:
        texts: Indexable collection of strings, one document each.
        vectorizer: Callable mapping one string to a 1-D sequence of integers.
        seq_length: Width every input row is padded or truncated to.
        batch_size: Number of texts (not samples) consumed per batch.
        shuffle: Whether to reshuffle the text order at the end of each epoch.
        **kwargs: Forwarded unchanged to the Keras base-class constructor.
    """

    def __init__(self, texts, vectorizer, seq_length=5, batch_size=32, shuffle=True, **kwargs):
        # Let the Keras base class initialise its own state; newer Keras
        # versions warn when a subclass skips this call.
        super().__init__(**kwargs)
        self.texts = texts
        self.vectorizer = vectorizer
        self.seq_length = seq_length
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.on_epoch_end()  # builds the initial (optionally shuffled) order

    def __len__(self):
        """Return the number of batches per epoch, including a final partial batch."""
        return int(np.ceil(len(self.texts) / self.batch_size))

    def __getitem__(self, index):
        """Return the ``(X, y)`` arrays built from the texts of batch ``index``."""
        start = index * self.batch_size
        # Select through self.indices so the order set by on_epoch_end (and
        # therefore the shuffle flag) actually takes effect.
        batch_indices = self.indices[start:start + self.batch_size]
        batch_texts = [self.texts[i] for i in batch_indices]
        return self.__data_generation(batch_texts)

    def on_epoch_end(self):
        """Reset the text order and reshuffle it when ``shuffle`` is enabled."""
        self.indices = np.arange(len(self.texts))
        if self.shuffle:
            np.random.shuffle(self.indices)

    def __data_generation(self, batch_texts):
        """Expand each text into (prefix, next-token) pairs and pad the prefixes."""
        X, y = [], []
        for text in batch_texts:
            # Materialise once as a NumPy array; the vectorizer may return a
            # framework tensor rather than an ndarray.
            encoded = np.asarray(self.vectorizer(text))
            for i in range(1, len(encoded)):
                X.append(encoded[:i])
                y.append(encoded[i])

        # pad_sequences already returns an ndarray of shape (n_samples, seq_length).
        X = pad_sequences(X, maxlen=self.seq_length, padding='pre')
        return X, np.array(y)

# Function to encode text using TensorFlow TextVectorization
def encode_text_tf(text, vectorizer):
    """Encode one string with ``vectorizer`` and return its row of token ids.

    ``text`` is wrapped in a single-element batch, the result is converted
    with ``.numpy()`` and the first (only) row is returned.
    """
    batch_ids = vectorizer([text])
    first_row = batch_ids.numpy()[0]
    return first_row

# Function to decode text using TensorFlow TextVectorization
def decode_text_tf(indices, index_word):
    """Join the words for ``indices`` with spaces, skipping every index equal to 0.

    Each remaining index is looked up directly in ``index_word``; an index
    missing from the mapping raises ``KeyError``.
    """
    words = []
    for token_id in indices:
        if token_id == 0:
            continue
        words.append(index_word[token_id])
    return ' '.join(words)

# Function to encode text using CountVectorizer
def encode_text_cv(text, vectorizer):
    """Return the dense first row of ``vectorizer.transform([text])``.

    With a scikit-learn ``CountVectorizer`` this row holds one count per
    vocabulary term (a bag of words), not the tokens of ``text`` in order.
    """
    document_term_matrix = vectorizer.transform([text])
    dense_rows = document_term_matrix.toarray()
    return dense_rows[0]

# Function to decode text using CountVectorizer
def decode_text_cv(indices, index_word):
    """Join the words for ``indices`` with spaces, ignoring indices not in ``index_word``."""
    words = []
    for token_id in indices:
        if token_id in index_word:
            words.append(index_word[token_id])
    return ' '.join(words)



In [None]:
# Build CountVectorizer
# fit() returns the fitted estimator, so construction and fitting can be chained.
count_vectorizer = CountVectorizer().fit(corpus)
vocab_size_cv = len(count_vectorizer.vocabulary_)
# Inverse lookup of vocabulary_: feature index -> term.
word_index_cv = dict(
    (index, term) for term, index in count_vectorizer.vocabulary_.items()
)



In [None]:
# Build TensorFlow TextVectorization
seq_length = 5
vectorizer_tf = TextVectorization(
    max_tokens=20000,
    output_sequence_length=seq_length,
)
vectorizer_tf.adapt(corpus)
# Position in the learned vocabulary -> token.
index_word_tf = dict(enumerate(vectorizer_tf.get_vocabulary()))
vocab_size_tf = len(index_word_tf)

In [None]:
# Create data generators for both vectorizers
# CountVectorizer.transform() yields one count per vocabulary term (a bag of
# words), not the words of a line in order, so its output cannot serve as a
# token sequence for next-word training. Reuse the fitted vectorizer's own
# analyzer and vocabulary to obtain ordered token ids instead.
_cv_analyzer = count_vectorizer.build_analyzer()


def _encode_cv_sequence(text):
    """Encode ``text`` as the ordered vocabulary indices of its known tokens.

    Tokens absent from ``count_vectorizer.vocabulary_`` are skipped.
    NOTE(review): vocabulary indices start at 0, so index 0 is a real term
    that coincides with the value pad_sequences pads with - confirm whether
    the ids should be shifted to reserve 0 for padding.
    """
    vocabulary = count_vectorizer.vocabulary_
    return np.array(
        [vocabulary[token] for token in _cv_analyzer(text) if token in vocabulary],
        dtype=np.int64,
    )


# NOTE(review): lines with fewer than two known tokens yield no training pairs,
# so a batch made up only of such lines would be empty - confirm the corpus
# does not contain long runs of them.
train_gen_cv = DataGenerator(corpus, _encode_cv_sequence, seq_length=seq_length)
# Pass seq_length explicitly so both generators pad to the same width.
train_gen_tf = DataGenerator(corpus, vectorizer_tf, seq_length=seq_length)

# Define model
def build_lstm_model(vocab_size, embedding_dim=128, seq_length=5):
    """Build an uncompiled Embedding -> LSTM(128) -> softmax next-token model.

    Args:
        vocab_size: Size of the embedding table and of the softmax output.
        embedding_dim: Width of each embedding vector.
        seq_length: Number of token ids per input row.

    Returns:
        A Keras ``Model`` mapping ``(seq_length,)`` inputs to ``vocab_size`` outputs.
    """
    token_ids = Input(shape=(seq_length,))
    embedded = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=seq_length,
    )(token_ids)
    sequence_state = LSTM(128)(embedded)
    next_token_probs = Dense(vocab_size, activation='softmax')(sequence_state)
    return Model(token_ids, next_token_probs)

# Compile the models
# Build both models with the notebook's seq_length instead of relying on the
# function default, so the input width stays in step with the width used for
# vectorization and padding if seq_length is ever changed.
model_cv = build_lstm_model(vocab_size_cv, seq_length=seq_length)
model_cv.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model_tf = build_lstm_model(vocab_size_tf, seq_length=seq_length)
model_tf.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])



In [None]:
# Train model_cv for 5 epochs on the batches produced by train_gen_cv
# (the CountVectorizer-based data generator).
model_cv.fit(train_gen_cv, epochs=5)

In [None]:
# Train model_tf for 5 epochs on the batches produced by train_gen_tf
# (the TensorFlow TextVectorization-based data generator).
model_tf.fit(train_gen_tf, epochs=5)

In [None]:
# Predict next word function
def predict_next_word(model, input_text, vectorizer, decode_fn, seq_length, index_word=None):
    """Predict the most likely next word for ``input_text``.

    Args:
        model: Trained Keras model whose ``predict`` returns one score per
            vocabulary entry for each input row.
        input_text: Prompt string.
        vectorizer: Callable taking the prompt string and returning its
            token ids as a 1-D sequence.
        decode_fn: Callable ``(indices, index_word) -> str`` used to turn the
            predicted index into text.
        seq_length: Width the encoded prompt is pre-padded / truncated to.
        index_word: Mapping from index to word handed to ``decode_fn``.
            Defaults to the notebook-level ``index_word_tf``.

    Returns:
        The string produced by ``decode_fn`` for the arg-max prediction.
    """
    if index_word is None:
        index_word = index_word_tf
    token_ids = np.asarray(vectorizer(input_text))
    input_seq = pad_sequences([token_ids], maxlen=seq_length, padding='pre')
    prediction = model.predict(input_seq)
    predicted_word_idx = int(np.argmax(prediction, axis=-1)[0])
    return decode_fn([predicted_word_idx], index_word)


def _encode_prompt_tf(text):
    """Encode ``text`` with vectorizer_tf and drop its trailing zero padding.

    vectorizer_tf pads on the right up to its output_sequence_length, whereas
    predict_next_word pads on the left; trimming first keeps the prompt's
    tokens at the end of the input row.
    """
    return np.trim_zeros(encode_text_tf(text, vectorizer_tf), 'b')


# Example usage for next word prediction
test_text = "the quick brown"
# Encode the CountVectorizer prompt with the very callable its training
# generator holds, so the prompt is encoded exactly as the training data was,
# and pass each model its own index -> word mapping.
predicted_word_cv = predict_next_word(
    model_cv, test_text, train_gen_cv.vectorizer, decode_text_cv, seq_length,
    index_word=word_index_cv,
)
predicted_word_tf = predict_next_word(
    model_tf, test_text, _encode_prompt_tf, decode_text_tf, seq_length,
    index_word=index_word_tf,
)

print(f"Predicted next word (CountVectorizer): {predicted_word_cv}")
print(f"Predicted next word (TensorFlow TextVectorizer): {predicted_word_tf}")