In [1]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import spacy
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Punkt data for NLTK's sentence tokenizer (newer NLTK releases look for the
# 'punkt_tab' variant, so both packages are requested).
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the small English spaCy pipeline used for lemmatization. The model is
# downloaded only when it is missing: re-installing it on every run is slow
# and makes spaCy ask for a kernel restart even though nothing changed.
SPACY_MODEL = "en_core_web_sm"
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    spacy.cli.download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)
# Training text for the next-word model. Written as adjacent string literals
# (joined by the parser) so no backslash continuations are needed.
# Two text fixes compared to the earlier version:
#   * "samping" -> "sampling" (the typo created a spurious vocabulary entry);
#   * "desired.To" -> "desired. To" (without the space the sentence tokenizer
#     has no whitespace boundary to split on at that point).
corpus = (
    "One disadvantage of using 'Best Of' sampling is that it may lead to limited exploration of the model's "
    "knowledge and creativity. By focusing on the most probable next words, the model might generate responses that are "
    "safe and conventional, potentially missing out on more diverse and innovative outputs. The lack of exploration could "
    "result in repetitive or less imaginative responses, especially in situations where novel and unconventional ideas are "
    "desired. To address this limitation, other sampling strategies like temperature-based sampling or top-p (nucleus) sampling "
    "can be employed to introduce more randomness and encourage the model to explore a broader range of possibilities. "
    "However, it's essential to carefully balance exploration and exploitation based on the specific requirements of the task or "
    "application."
)
# --- Text preparation -------------------------------------------------------
# Split the corpus into sentences and build a lemmatized copy of each one.
sentences = sent_tokenize(corpus)
lemmatized_sentences = []
for sentence in sentences:
    lemmas = [token.lemma_ for token in nlp(sentence)]
    lemmatized_sentences.append(" ".join(lemmas))

# The model is trained on the original sentences followed by their
# lemmatized counterparts.
processed_corpus = sentences + lemmatized_sentences

# --- Vocabulary -------------------------------------------------------------
tokenizer = Tokenizer()
tokenizer.fit_on_texts(processed_corpus)
# Word indices start at 1, so one extra slot is needed for index 0, which is
# the value pad_sequences fills with.
total_words = len(tokenizer.word_index) + 1

# --- Training samples -------------------------------------------------------
# For every encoded line emit each prefix of length >= 2; the last id of a
# prefix is the prediction target and everything before it is the context.
input_sequences = [
    token_ids[:end]
    for token_ids in tokenizer.texts_to_sequences(processed_corpus)
    for end in range(2, len(token_ids) + 1)
]

# Left-pad every prefix to the length of the longest one.
max_sequence_length = max(map(len, input_sequences))
input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_length,
    padding='pre',
)

# Context ids (all columns but the last) and target id (last column).
X = input_sequences[:, :-1]
y = np.array(input_sequences[:, -1])
# Hyperparameters of the next-word model.
EMBEDDING_DIM = 100  # size of each word vector
LSTM_UNITS = 100     # width of the recurrent layer
EPOCHS = 10          # passes over the training samples

# Embedding -> LSTM -> softmax over the vocabulary: given the padded context
# ids in X, predict the id of the following word (y).
model = Sequential()
# `input_length` is deprecated in Keras 3 and no longer needed: the sequence
# length is inferred from X the first time the model is called.
model.add(Embedding(total_words, EMBEDDING_DIM))
model.add(LSTM(LSTM_UNITS))
model.add(Dense(total_words, activation='softmax'))
# Targets are integer word ids rather than one-hot vectors, hence the
# sparse variant of categorical cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X, y, epochs=EPOCHS, verbose=1)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.




Epoch 1/10
8/8 ━━━━━━━━━━━━━━━━━━━━ 4s 12ms/step - accuracy: 0.0305 - loss: 4.6973
Epoch 2/10
8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - accuracy: 0.0337 - loss: 4.6648
Epoch 3/10
8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step - accuracy: 0.0436 - loss: 4.5500
Epoch 4/10
8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step - accuracy: 0.0540 - loss: 4.3985
Epoch 5/10
8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step - accuracy: 0.0786 - loss: 4.3763
Epoch 6/10
8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step - accuracy: 0.0712 - loss: 4.2069
Epoch 7/10
8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 12ms/step - accuracy: 0.0954 - loss: 4.1551
Epoch 8/10
8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step - accuracy: 0.0929 - loss: 4.0540
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

<keras.src.callbacks.history.History at 0x7af42a06b790>