In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [2]:
import pandas as pd
df = pd.read_csv("/content/Earth.txt",sep=',')
df

Unnamed: 0,Earth is the third planet from the Sun and the only astronomical object known to harbor life. This is enabled by Earth being a water world,the only one in the Solar System sustaining liquid surface water. Almost all of Earth's water is contained in its global ocean,covering 70.8% of Earth's crust. The remaining 29.2% of Earth's crust is land,most of which is located in the form of continental landmasses within one hemisphere,Earth's land hemisphere. Most of Earth's land is somewhat humid and covered by vegetation,while large sheets of ice at Earth's polar deserts retain more water than Earth's groundwater,lakes,rivers and atmospheric water combined. Earth's crust consists of slowly moving tectonic plates,which interact to produce mountain ranges,volcanoes,and earthquakes. Earth has a liquid outer core that generates a magnetosphere capable of deflecting most of the destructive solar winds and cosmic radiation.


In [None]:
#Generating some example sequential data
sentences = ['I love learning','I love python','I hate school','Recurrent Nueral Networks are powerful']

In [5]:
#Tokenizing the words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df)
total_words = len(tokenizer.word_index) + 1
print(total_words)

96


In [8]:
# Creating input sequences and their corresponding next words
input_sequences = []
for sentence in df:
    tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]
    for i in range(1, len(tokenized_sentence)):
        n_gram_sequence = tokenized_sentence[:i+1]
        input_sequences.append(n_gram_sequence)
input_sequences

[[7, 4],
 [7, 4, 2],
 [7, 4, 2, 21],
 [7, 4, 2, 21, 22],
 [7, 4, 2, 21, 22, 23],
 [7, 4, 2, 21, 22, 23, 2],
 [7, 4, 2, 21, 22, 23, 2, 24],
 [7, 4, 2, 21, 22, 23, 2, 24, 5],
 [7, 4, 2, 21, 22, 23, 2, 24, 5, 2],
 [7, 4, 2, 21, 22, 23, 2, 24, 5, 2, 13],
 [7, 4, 2, 21, 22, 23, 2, 24, 5, 2, 13, 25],
 [7, 4, 2, 21, 22, 23, 2, 24, 5, 2, 13, 25, 26],
 [7, 4, 2, 21, 22, 23, 2, 24, 5, 2, 13, 25, 26, 27],
 [7, 4, 2, 21, 22, 23, 2, 24, 5, 2, 13, 25, 26, 27, 14],
 [7, 4, 2, 21, 22, 23, 2, 24, 5, 2, 13, 25, 26, 27, 14, 28],
 [7, 4, 2, 21, 22, 23, 2, 24, 5, 2, 13, 25, 26, 27, 14, 28, 29],
 [7, 4, 2, 21, 22, 23, 2, 24, 5, 2, 13, 25, 26, 27, 14, 28, 29, 30],
 [7, 4, 2, 21, 22, 23, 2, 24, 5, 2, 13, 25, 26, 27, 14, 28, 29, 30, 4],
 [7, 4, 2, 21, 22, 23, 2, 24, 5, 2, 13, 25, 26, 27, 14, 28, 29, 30, 4, 31],
 [7, 4, 2, 21, 22, 23, 2, 24, 5, 2, 13, 25, 26, 27, 14, 28, 29, 30, 4, 31, 15],
 [7,
  4,
  2,
  21,
  22,
  23,
  2,
  24,
  5,
  2,
  13,
  25,
  26,
  27,
  14,
  28,
  29,
  30,
  4,
  31,
  15,
  7

In [9]:
# Padding sequences for consistent input size
max_sequence_length = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

In [10]:
input_sequences

array([[ 0,  0,  0, ...,  0,  7,  4],
       [ 0,  0,  0, ...,  7,  4,  2],
       [ 0,  0,  0, ...,  4,  2, 21],
       ...,
       [ 0,  0,  0, ..., 17, 93,  5],
       [ 0,  0,  0, ..., 93,  5, 94],
       [ 0,  0,  5, ...,  5, 94, 95]], dtype=int32)

In [11]:
# Creating input and output data
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = to_categorical(y, num_classes=total_words)

In [12]:
# Building a simple RNN model
model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=50, input_length=max_sequence_length-1))
model.add(SimpleRNN(100, return_sequences=True))
model.add(SimpleRNN(100))
model.add(Dense(total_words, activation='softmax'))

In [13]:
#compiling the model
model.compile(optimizer = 'adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [14]:
#Training the model
model.fit(X,y,epochs=50,verbose=2)

Epoch 1/50
5/5 - 2s - loss: 4.5795 - accuracy: 0.0000e+00 - 2s/epoch - 488ms/step
Epoch 2/50
5/5 - 0s - loss: 4.3890 - accuracy: 0.0929 - 85ms/epoch - 17ms/step
Epoch 3/50
5/5 - 0s - loss: 4.2105 - accuracy: 0.1071 - 96ms/epoch - 19ms/step
Epoch 4/50
5/5 - 0s - loss: 4.1017 - accuracy: 0.1357 - 81ms/epoch - 16ms/step
Epoch 5/50
5/5 - 0s - loss: 3.9529 - accuracy: 0.1714 - 84ms/epoch - 17ms/step
Epoch 6/50
5/5 - 0s - loss: 3.8561 - accuracy: 0.2214 - 82ms/epoch - 16ms/step
Epoch 7/50
5/5 - 0s - loss: 3.7083 - accuracy: 0.2714 - 88ms/epoch - 18ms/step
Epoch 8/50
5/5 - 0s - loss: 3.5830 - accuracy: 0.3071 - 87ms/epoch - 17ms/step
Epoch 9/50
5/5 - 0s - loss: 3.4486 - accuracy: 0.3214 - 87ms/epoch - 17ms/step
Epoch 10/50
5/5 - 0s - loss: 3.3200 - accuracy: 0.3929 - 88ms/epoch - 18ms/step
Epoch 11/50
5/5 - 0s - loss: 3.2094 - accuracy: 0.3786 - 82ms/epoch - 16ms/step
Epoch 12/50
5/5 - 0s - loss: 3.0303 - accuracy: 0.4357 - 86ms/epoch - 17ms/step
Epoch 13/50
5/5 - 0s - loss: 2.9192 - accuracy

<keras.src.callbacks.History at 0x7d01284471f0>

In [16]:
# Generating text using the trained model
seed_text = input("Enter the starting word: ")
next_words = int(input("Enter how many words to predict: "))

for _ in range(next_words):
    tokenized_seed = tokenizer.texts_to_sequences([seed_text])[0]
    tokenized_seed = pad_sequences([tokenized_seed], maxlen=max_sequence_length-1, padding='pre')
    predicted_word_index = np.argmax(model.predict(tokenized_seed), axis=-1)
    predicted_word = tokenizer.index_word[predicted_word_index[0]]
    seed_text += " " + predicted_word

print(seed_text)

Enter the starting word: lavs
Enter how many words to predict: 12
lavs is is in the the the the the the liquid the crust
