In [None]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
import spacy
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# ---------------------------------------------------------------------------
# Next-word prediction demo: tokenize a small NLP-themed corpus with NLTK and
# spaCy, turn every sentence into growing n-gram prefixes, and train a small
# Embedding -> LSTM -> softmax model to predict the last word of each prefix.
# ---------------------------------------------------------------------------

# The Punkt models back both word_tokenize and sent_tokenize.
nltk.download('punkt')
nltk.download('punkt_tab')

# Only fetch the spaCy model when it is actually missing: an unconditional
# download re-installs the package on every run and asks for a kernel restart.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

corpus = """Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence \
concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze large amounts of natural language data. \
Challenges in natural language processing frequently involve speech recognition, natural language understanding, and natural language generation.

Natural language processing has a history that goes back to the 1950s. Early NLP systems were based on complex sets of hand-written rules. \
Starting in the late 1980s, however, there was a revolution in NLP with the introduction of machine learning algorithms for language processing. \
This was due to both the steady increase in computational power (see Moore's law) and the gradual availability of electronic text corpora, \
e.g. the Brown Corpus and the Canadian Hansard corpus.

Some of the earliest successful NLP systems, such as SHRDLU, worked on restricted toy worlds with a limited vocabulary and domain of discourse. \
It was a major challenge to scale these systems to handle real-world text. A breakthrough came in the 1980s with the development of statistical methods \
for NLP, which were able to learn from large amounts of data and generalize to new text.

In recent years, deep learning methods have achieved state-of-the-art results in many NLP tasks, including machine translation, sentiment analysis, and text summarization. \
These methods are based on artificial neural networks with multiple layers, which are able to learn complex patterns in data.
"""

# --- Vocabulary --------------------------------------------------------------
# Run the spaCy pipeline once and reuse the result for lemmas and sentences.
doc = nlp(corpus)
tokens = word_tokenize(corpus)
lemmatized_tokens = [token.lemma_ for token in doc]
# The vocabulary covers both surface forms and lemmas.
all_tokens = tokens + lemmatized_tokens

tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_tokens)
# +1 because the Keras Tokenizer never assigns index 0 (it is used as padding).
total_words = len(tokenizer.word_index) + 1

# --- Training sequences ------------------------------------------------------
# N-gram prefixes must be built from whole sentences. Iterating over the
# individual tokens (as this cell previously did) yields one-word sequences,
# which produce no (context, next-word) pairs at all except for the few tokens
# the Keras Tokenizer happens to split on punctuation (e.g. "state-of-the-art").
# Tokens are re-joined with spaces so the Keras Tokenizer sees exactly the
# words it was fitted on.
training_lines = [
    " ".join(word_tokenize(sentence)) for sentence in nltk.sent_tokenize(corpus)
]
training_lines += [
    " ".join(token.lemma_ for token in sentence) for sentence in doc.sents
]

input_sequences = []
for line in training_lines:
    token_list = tokenizer.texts_to_sequences([line])[0]
    # Every prefix of length >= 2 is one sample: context words + target word.
    for end in range(2, len(token_list) + 1):
        input_sequences.append(token_list[:end])

max_sequence_length = max(len(seq) for seq in input_sequences)
# Pre-padding keeps the target word in the last column of every row.
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')
# Slicing already yields NumPy arrays, so no extra conversion is needed.
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# --- Model -------------------------------------------------------------------
model = Sequential()
# `input_length` is deprecated in Keras 3; the sequence length is inferred
# from the data when the model is first called.
model.add(Embedding(total_words, 100))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))
# Targets are integer word indices, hence the sparse categorical loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X, y, epochs=10, verbose=1)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Epoch 1/10




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.0000e+00 - loss: 5.1486
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 185ms/step - accuracy: 0.1429 - loss: 5.1381
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - accuracy: 0.2857 - loss: 5.1275
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step - accuracy: 0.5714 - loss: 5.1165
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step - accuracy: 0.8571 - loss: 5.1050
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step - accuracy: 0.7143 - loss: 5.0927
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 150ms/step - accuracy: 0.7143 - loss: 5.0795
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 119ms/step - accuracy: 0.7143 - loss: 5.0651
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

<keras.src.callbacks.history.History at 0x7bd68dcb4fd0>