In [2]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
import numpy as np

Using TensorFlow backend.


In [3]:
# Training corpus: a single short paragraph about planetary science. It is the
# only text the tokenizer and the model below ever see.
data = "Planetary science is the study of the assemblage of planets, moons, dwarf planets, comets, asteroids, and other bodies orbiting the Sun, as well as extrasolar planets. The Solar System has been relatively well-studied, initially through telescopes and then later by spacecraft. This has provided a good overall understanding of the formation and evolution of the Sun's planetary system, although many new discoveries are still being made."

#### Tokenize the data

In [4]:
# Build the word -> integer-id vocabulary from the corpus.
tokenizer = Tokenizer()
# fit_on_texts takes a list of texts, hence the single-element list.
tokenizer.fit_on_texts([data])

#### Convert the text to a sequence of integer word ids

In [5]:
# Encode the corpus as a flat list of integer word ids; [0] selects the result
# for the one text that was passed in.
sequence = tokenizer.texts_to_sequences([data])[0]
# Show the first ten ids as a quick sanity check.
sequence[:10]

[5, 10, 11, 1, 12, 2, 1, 13, 2, 3]

#### Token index and number of tokens

In [6]:
# word_index is the fitted vocabulary: a dict mapping each word to its integer id.
tokens = tokenizer.word_index
# Display the full mapping.
tokens

{'a': 36,
 'although': 43,
 'and': 4,
 'are': 47,
 'as': 6,
 'assemblage': 13,
 'asteroids': 17,
 'been': 24,
 'being': 49,
 'bodies': 19,
 'by': 32,
 'comets': 16,
 'discoveries': 46,
 'dwarf': 15,
 'evolution': 41,
 'extrasolar': 22,
 'formation': 40,
 'good': 37,
 'has': 9,
 'initially': 27,
 'is': 11,
 'later': 31,
 'made': 50,
 'many': 44,
 'moons': 14,
 'new': 45,
 'of': 2,
 'orbiting': 20,
 'other': 18,
 'overall': 38,
 'planetary': 5,
 'planets': 3,
 'provided': 35,
 'relatively': 25,
 'science': 10,
 'solar': 23,
 'spacecraft': 33,
 'still': 48,
 'studied': 26,
 'study': 12,
 'sun': 21,
 "sun's": 42,
 'system': 8,
 'telescopes': 29,
 'the': 1,
 'then': 30,
 'this': 34,
 'through': 28,
 'understanding': 39,
 'well': 7}

In [7]:
# Build bigram training pairs: each row is (current word id, next word id),
# taken from consecutive positions in `sequence`.
bigrams = list(zip(sequence[:-1], sequence[1:]))
# reshape(-1, 2) keeps the array two-dimensional even when the corpus has fewer
# than two tokens, so column slicing on `sequences` still works.
sequences = np.array(bigrams, dtype=int).reshape(-1, 2)

In [8]:
# Column 0 is the input word id, column 1 the word id that follows it.
train_x, train_y = sequences[:, 0], sequences[:, 1]
# One-hot encode the targets. The class count is derived from the tokenizer
# itself (+1 for the unused index 0) so this cell can be re-run at any time,
# regardless of what the notebook-level name `tokens` currently holds.
train_y = to_categorical(train_y, num_classes=len(tokenizer.word_index) + 1)

#### Build the LSTM next-word model

In [9]:
# Vocabulary size: one slot per known word plus index 0, which the Tokenizer
# reserves and never assigns to a word. A dedicated name is used here so that
# `tokens` keeps referring to the word-index dict and earlier cells stay
# re-runnable.
vocab_size = len(tokenizer.word_index) + 1

model = Sequential()
# Map each word id to a 10-dimensional vector; every sample is a single word.
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(LSTM(32))
# One softmax output per vocabulary index: the probability of each next word.
model.add(Dense(vocab_size, activation="softmax"))

# categorical_crossentropy matches the one-hot encoded next-word targets.
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [20]:
# Train for 1000 epochs on the (word, next word) pairs; verbose=0 silences the
# per-epoch progress log.
model.fit(train_x, train_y, epochs=1000, verbose=0)

<keras.callbacks.History at 0x1bbceeec518>

#### Predict the next 6 words given a word

In [21]:
def generate_text(input_text, num_words=6):
    """Extend ``input_text`` with words predicted one at a time by ``model``.

    The model predicts the next word from a single word, so only the last
    in-vocabulary word of ``input_text`` is used as the seed; each predicted
    word then becomes the input for the following step.

    Args:
        input_text: Seed text. Must contain at least one word known to the
            notebook-level ``tokenizer``.
        num_words: Maximum number of words to append (default 6).

    Returns:
        ``input_text`` followed by the predicted words, separated by single
        spaces. Fewer than ``num_words`` words are appended if the model
        predicts an index that has no word attached.

    Raises:
        ValueError: If no word of ``input_text`` is in the vocabulary.
    """
    seed_ids = tokenizer.texts_to_sequences([input_text])[0]
    if not seed_ids:
        raise ValueError(
            "input_text contains no word from the tokenizer vocabulary: %r" % (input_text,)
        )

    # Build the id -> word lookup once instead of scanning word_index per step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    current_id = seed_ids[-1]
    words = [input_text]
    for _ in range(num_words):
        # Shape (1, 1): a batch of one sample holding one word id.
        probabilities = model.predict(np.array([[current_id]]), verbose=0)
        # argmax over the class axis replaces Sequential.predict_classes,
        # which is not available in newer Keras releases.
        current_id = int(np.argmax(probabilities, axis=-1)[0])

        next_word = index_to_word.get(current_id)
        if next_word is None:
            # The predicted index (e.g. the reserved 0) has no word; stop here
            # rather than emitting a stale or undefined word.
            break
        words.append(next_word)

    return " ".join(words)


In [24]:
# Seed the generator with a word that occurs in the training text.
generate_text("been")

'been relatively well studied initially through telescopes'

In [23]:
# Print the original corpus so the generated text can be compared against it.
print(data)

Planetary science is the study of the assemblage of planets, moons, dwarf planets, comets, asteroids, and other bodies orbiting the Sun, as well as extrasolar planets. The Solar System has been relatively well-studied, initially through telescopes and then later by spacecraft. This has provided a good overall understanding of the formation and evolution of the Sun's planetary system, although many new discoveries are still being made.
