In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import os

In [2]:
file = open("/home/vijay/train.txt", "r", encoding = "utf8")
lines = []
for i in file:
    lines.append(i)

In [3]:
data = ""
for i in lines:
  data = ' '. join(lines) 

In [4]:
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('“','').replace('”','')

In [5]:
data = data.split()
data = ' '.join(data)
data

"Technical writing is a product-focused job. A technical writer's main role is to deliver complex information, instructions, ideas and functions into comprehensible words for the audience. Usually, that involves a deep understanding of a certain product and being able to explain it easily for users to use it. On the other hand, technical blogging is more audience-focused. A technical blogger is a content writer that focuses on delivering high-quality articles that engages their audience and brings traffic to their business' website. They often write about topics that their target audience are interested about and provide value from their background technical knowledge or with a little research. Because technical writing is product-focused, it mostly involves a specific manual or instruction on how to use the product. The content they write must be clear, concise, objective and rigid. The style and tone is neutral, formal and professional. Any images you see in technical writings will b

In [6]:
len(data)

9169

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle

tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
# print(tokenizer)

pickle.dump(tokenizer, open('token.pkl', 'wb'))

sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]

[15, 46, 8, 3, 29, 79, 223, 3, 15, 224, 225, 226, 8, 2, 122]

In [8]:
len(sequence_data)

1525

In [9]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

491


In [10]:
import numpy as np

In [11]:
sequences = []

for i in range(3, len(sequence_data)):
    words = sequence_data[i-3:i+1]
    sequences.append(words)
    
print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  1522


array([[ 15,  46,   8,   3],
       [ 46,   8,   3,  29],
       [  8,   3,  29,  79],
       [  3,  29,  79, 223],
       [ 29,  79, 223,   3],
       [ 79, 223,   3,  15],
       [223,   3,  15, 224],
       [  3,  15, 224, 225],
       [ 15, 224, 225, 226],
       [224, 225, 226,   8]])

In [12]:
X = []
y = []

for i in sequences:
    X.append(i[0:3])
    y.append(i[3])
    
X = np.array(X)
y = np.array(y)

In [13]:
print("Data: ", X[:10])
print("Response: ", y[:10])

Data:  [[ 15  46   8]
 [ 46   8   3]
 [  8   3  29]
 [  3  29  79]
 [ 29  79 223]
 [ 79 223   3]
 [223   3  15]
 [  3  15 224]
 [ 15 224 225]
 [224 225 226]]
Response:  [  3  29  79 223   3  15 224 225 226   8]


In [14]:
y = to_categorical(y, num_classes=vocab_size)
# y[:5]

In [15]:
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [16]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=3))
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))

In [17]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 1, 10)             4910      
                                                                 
 lstm_2 (LSTM)               (None, 1, 1000)           4044000   
                                                                 
 lstm_3 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense_2 (Dense)             (None, 1000)              1001000   
                                                                 
 dense_3 (Dense)             (None, 491)               491491    
                                                                 
Total params: 13,545,401
Trainable params: 13,545,401
Non-trainable params: 0
_________________________________________________________________


In [18]:
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint("next_words.h5", monitor='loss', verbose=1, save_best_only=True)
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001))
model.fit(X, y, epochs=20, batch_size=64, callbacks=[checkpoint])

Epoch 1/20
Epoch 00001: loss improved from inf to 5.92417, saving model to next_words.h5
Epoch 2/20
Epoch 00002: loss improved from 5.92417 to 5.54302, saving model to next_words.h5
Epoch 3/20
Epoch 00003: loss improved from 5.54302 to 5.45795, saving model to next_words.h5
Epoch 4/20
Epoch 00004: loss improved from 5.45795 to 5.40076, saving model to next_words.h5
Epoch 5/20
Epoch 00005: loss did not improve from 5.40076
Epoch 6/20
Epoch 00006: loss improved from 5.40076 to 5.29560, saving model to next_words.h5
Epoch 7/20
Epoch 00007: loss improved from 5.29560 to 5.22479, saving model to next_words.h5
Epoch 8/20
Epoch 00008: loss improved from 5.22479 to 5.14931, saving model to next_words.h5
Epoch 9/20
Epoch 00009: loss improved from 5.14931 to 5.09144, saving model to next_words.h5
Epoch 10/20
Epoch 00010: loss improved from 5.09144 to 5.04316, saving model to next_words.h5
Epoch 11/20
Epoch 00011: loss improved from 5.04316 to 4.98845, saving model to next_words.h5
Epoch 12/20
Ep

<keras.callbacks.History at 0x7f124c05fdf0>

In [19]:
from tensorflow.keras.models import load_model
import numpy as np

model = load_model('next_words.h5')
tokenizer = pickle.load(open('token.pkl', 'rb'))

def Predict_Next_Words(model, tokenizer, text):

  sequence = tokenizer.texts_to_sequences([text])
  sequence = np.array(sequence)
  preds = np.argmax(model.predict(sequence))
  predicted_word = ""
  
  for key, value in tokenizer.word_index.items():
      if value == preds:
          predicted_word = key
          break
  
  print(predicted_word)
  return predicted_word

In [20]:
while(True):
  text = input("Enter your line: ")
  
  if text == "":
      print("Execution stops")
      break
  
  else:
      try:
          text = text.split(" ")
          text = text[-3:]
          print(text)
        
          Predict_Next_Words(model, tokenizer, text)
          
      except Exception as e:
        print("Error occurred: ",e)
        continue

Enter your line: 
Execution stops
