**Model Training**

In [None]:
# Delete the /content/sample_data directory; nothing else in this notebook reads from it.
!rm -rf '/content/sample_data'

In [None]:
from tensorflow.keras.layers import Embedding,LSTM,Dense,Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import clone_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
# Fetch NLTK's 'punkt' tokenizer data before word_tokenize is called later in the notebook.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Step<br>
Read "Metamorphosis" Dataset

In [None]:
# Open the cleaned text of the book for reading as UTF-8; the next cell
# iterates over this handle line by line.
file = open(
  "metamorphosis_clean.txt",
  mode='rt',
  encoding="utf8",
)

In [None]:
# Read every line of the book, then release the file handle (it was left open
# before). Re-running this cell now needs the `open(...)` cell to be run again
# first, instead of silently producing an empty text from an exhausted handle.
line = list(file)
file.close()

# Join the lines once; the join used to be recomputed on every loop iteration
# with the same result. An empty file still yields an empty string.
data = ' '.join(line)
# Strip line breaks and the UTF-8 byte-order mark left over from the raw file.
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '')

Step:<br>
1.Split dataset into list of words <br>
2.All words in lower case

In [None]:
# Tokenize the text with NLTK, keep only purely alphabetic tokens, lowercase them.
z = [token.lower() for token in word_tokenize(data) if token.isalpha()]
z[:5]

['one', 'morning', 'when', 'gregor', 'samsa']

**Dataset model_1**

Step:<br>
1. Divide dataset into sets of 5 words
2. 5 words consist of 4 training words + 1 target word

In [None]:
train_size = 5  # words per window: 4 context words followed by 1 target word
pred_size = 1   # number of trailing words of each window used as the target

# Slide a window of `train_size` words over the corpus, one word at a time.
# The upper bound is len(z) + 1 so that the final window, the one ending on the
# last word of the text, is included; range(train_size, len(z)) dropped it.
# A corpus shorter than `train_size` yields an empty list.
text_to_train = [z[end - train_size:end] for end in range(train_size, len(z) + 1)]
print(text_to_train[:3])

[['one', 'morning', 'when', 'gregor', 'samsa'], ['morning', 'when', 'gregor', 'samsa', 'woke'], ['when', 'gregor', 'samsa', 'woke', 'from']]


Step:<br>
1. Use tokenizer to convert text into sequences
2. Separate training words and target word
3. One-hot encode target word based on dictionary

In [None]:
# Build the word -> integer index vocabulary from the 5-word windows, then
# encode every window as a list of those indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_to_train)
text_sequence = tokenizer.texts_to_sequences(text_to_train)
print(text_sequence[:3])

[[63, 144, 54, 12, 93], [144, 54, 12, 93, 899], [54, 12, 93, 899, 29]]


In [None]:
# One slot more than the number of distinct words (Keras word indices start at 1).
vocab_size = len(tokenizer.word_index) + 1

text_sequence = np.array(text_sequence)
# Everything before the last `pred_size` columns is model input; the rest is the target.
data, target_data = text_sequence[:, :-pred_size], text_sequence[:, -pred_size:]
print(text_sequence[:3])
print(data[:3, :])
print(target_data[:3, :])

[[ 63 144  54  12  93]
 [144  54  12  93 899]
 [ 54  12  93 899  29]]
[[ 63 144  54  12]
 [144  54  12  93]
 [ 54  12  93 899]]
[[ 93]
 [899]
 [ 29]]


Dictionary (Total vocabulary)

In [None]:
# Reverse lookup table: integer index -> word.
my_vocab = {index: word for word, index in tokenizer.word_index.items()}
print(my_vocab)



In [None]:
# Import from tensorflow.keras like the rest of the notebook; the standalone
# keras.utils.np_utils module path is not available in newer Keras releases.
from tensorflow.keras.utils import to_categorical

# One-hot encode directly from the integer target column(s) of text_sequence
# rather than from target_data itself, so that re-running this cell does not
# try to one-hot encode a matrix that is already one-hot.
target_data = to_categorical(text_sequence[:, -pred_size:], vocab_size)
print(target_data[:3, :])
print(target_data.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(21927, 2541)


*Google's Word2Vec*

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# # Google's Word2Vec
# word2vec = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/GoogleNews-vectors-negative300.bin', binary=True)

*Stanford's Word2Vec: GloVe*

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip '/content/glove.6B.zip' 

--2021-06-11 08:11:53--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-06-11 08:11:53--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-06-11 08:11:54--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-0

In [None]:
# Convert the 50-dimensional GloVe text file into word2vec text format so it
# can be loaded with gensim's KeyedVectors.
from gensim.scripts.glove2word2vec import glove2word2vec

glove_input_file = 'glove.6B.50d.txt'
word2vec_output_file = 'glove.6B.50d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 50)

In [None]:
from gensim.models import KeyedVectors

filename = 'glove.6B.50d.txt.word2vec'
# binary=False: the converted file is plain text, not the binary word2vec format.
word2vec = KeyedVectors.load_word2vec_format(filename, binary=False)

In [None]:
emb_dim = 50  # must equal the dimensionality of the GloVe vectors in use
n_words = len(tokenizer.word_index) + 1
# One row per vocabulary index, initialised to zeros.
emb_matrix = np.zeros(shape=(n_words, emb_dim))
emb_matrix.shape

(2541, 50)

In [None]:
# Copy the GloVe vector of every vocabulary word into its row of emb_matrix;
# words that have no GloVe vector keep their all-zero row.
# Membership and lookup use KeyedVectors' own `in` / `[]` operators, which work
# on gensim 3.x and 4.x alike (`.vocab` and `.word_vec` are 3.x-only spellings).
for word, i in tokenizer.word_index.items():
  if word in word2vec:
    emb_matrix[i] = word2vec[word]
emb_matrix.shape

(2541, 50)

In [None]:
# create training and validation datasets
SEED = 42        # fixed seed so the shuffle, and therefore the split, is reproducible
val_split = 0.2  # fraction of samples held out for validation

# Shuffle sample indices once and cut the permutation at a single split point.
perm = np.random.default_rng(SEED).permutation(len(data))
split_at = int(len(data) * (1 - val_split))
idx_train = perm[:split_at]
idx_val = perm[split_at:]

train_words = data[idx_train]
train_target = target_data[idx_train]
val_set = data[idx_val]
val_target = target_data[idx_val]
print(train_words.shape, train_target.shape)
print(val_set.shape, val_target.shape)

(17541, 4) (17541, 2541)
(4386, 4) (4386, 2541)


Step:<br>
1. Create model architecture in TensorFlow using Keras
2. Compile the model
3. Make necessary callback functions
4. Train model_1


In [None]:
# model_1: trainable 35-dimensional embedding -> two stacked bidirectional
# LSTMs -> dense ReLU layer -> softmax over the whole vocabulary.
# The input length is the number of context words (train_size - pred_size).
# NOTE(review): the GloVe-based emb_matrix is not passed to this Embedding
# layer (no weights argument is given) — confirm that is intended.
model_1 = Sequential([
  Embedding(vocab_size, 35, input_length=train_size - pred_size),
  Bidirectional(LSTM(1000, return_sequences=True)),
  Bidirectional(LSTM(1000)),
  Dense(1000, activation="relu"),
  Dense(vocab_size, activation="softmax"),
])

model_1.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 35)             88935     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 4, 2000)           8288000   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 2000)              24008000  
_________________________________________________________________
dense_2 (Dense)              (None, 1000)              2001000   
_________________________________________________________________
dense_3 (Dense)              (None, 2541)              2543541   
Total params: 36,929,476
Trainable params: 36,929,476
Non-trainable params: 0
_________________________________________________________________


In [None]:
# checkpoint=ModelCheckpoint("nextword_model_1.h5",monitor="loss",verbose=1,save_best_only=True,mode="auto")
# reduce=ReduceLROnPlateau(monitor="loss",factor=0.2,patience=3,min_lr=0.0001,verbose=1)
# earlystop=EarlyStopping(monitor="loss",patience=3,mode="min",verbose=1)

In [None]:
# Train model_1 on (context, one-hot next word) pairs; no callbacks are used.
model_1.compile(
  loss="categorical_crossentropy",
  optimizer=Adam(learning_rate=0.001),
  metrics=['accuracy'],
)
history_1 = model_1.fit(train_words, train_target, epochs=25, batch_size=64)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


Step:<br>
1. Fetch the predictions of model_1 on train_words to create dataset for model_2

**BOOSTING**

In [None]:
# Class probabilities from model_1 for every training context, reduced to the
# index of the most likely word in each row.
pred_1 = model_1.predict(train_words)
fetch_pred_1 = [np.argmax(row) for row in pred_1]

**Dataset model_2**

In [None]:
# Swap the true targets for model_1's predicted words, one-hot encoded.
# Note: this overwrites train_target, so the original targets are gone after this cell.
train_target = to_categorical(fetch_pred_1, vocab_size)
print(train_target[1], train_target.shape, sep="\n")

[0. 0. 0. ... 0. 0. 0.]
(17541, 2541)


Step:<br>
1. Re-initialize the callbacks for model_2
2. Clone model_1 to get model_2 as they have same architecture
3. Compile and train model_2

In [None]:
# checkpoint=ModelCheckpoint("nextword_model_2.h5",monitor="loss",verbose=1,save_best_only=True,mode="auto")
# reduce=ReduceLROnPlateau(monitor="loss",factor=0.2,patience=3,min_lr=0.0001,verbose=1)
# earlystop=EarlyStopping(monitor="loss",patience=3,mode="min",verbose=1)

In [None]:
# clone_model copies model_1's architecture only; the clone gets newly
# initialised weights, so model_2 is trained from scratch on the relabelled targets.
model_2 = clone_model(model_1)
# Use `learning_rate`, as in the model_1 cell; the `lr` keyword is deprecated
# and triggers a warning.
model_2.compile(
  loss="categorical_crossentropy",
  optimizer=Adam(learning_rate=0.001),
  metrics=['accuracy'],
)
history_2 = model_2.fit(train_words, train_target, epochs=25, batch_size=64)

  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


**???** *The training accuracy is now lower than that of model_1. This is expected, because model_2 was not trained on the original targets but on a dataset created from the predictions of model_1.*

Step:<br>
1. Get predictions of model_2 on train_words to create model_3 dataset

In [None]:
# Class probabilities from model_2 for every training context, reduced to the
# index of the most likely word in each row.
pred_2 = model_2.predict(train_words)
fetch_pred_2 = [np.argmax(row) for row in pred_2]

**Dataset model_3**

In [None]:
# replacing target words with predictions of model_2 (fetch_pred_2), one-hot encoded
train_target = to_categorical(fetch_pred_2,vocab_size)
print(train_target[1])
print(train_target.shape)

[0. 0. 0. ... 0. 0. 0.]
(17541, 2541)


Step:<br>
1. Re-initialize call backs for model_3
2. Clone model_1 to get model_3
3. Compile and train model_3

In [None]:
# checkpoint=ModelCheckpoint("nextword_model_3.h5",monitor="loss",verbose=1,save_best_only=True,mode="auto")
# reduce=ReduceLROnPlateau(monitor="loss",factor=0.2,patience=3,min_lr=0.0001,verbose=1)
# earlystop=EarlyStopping(monitor="loss",patience=3,mode="min",verbose=1)

In [None]:
# clone_model copies model_1's architecture only; the clone gets newly
# initialised weights, so model_3 is trained from scratch on the relabelled targets.
model_3 = clone_model(model_1)
# Use `learning_rate`, as in the model_1 cell; the `lr` keyword is deprecated
# and triggers a warning.
model_3.compile(
  loss="categorical_crossentropy",
  optimizer=Adam(learning_rate=0.001),
  metrics=['accuracy'],
)
history_3 = model_3.fit(train_words, train_target, epochs=25, batch_size=64)

  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


**Prediction**

Step:<br>
1. The input words are padded before running the model because the input may contain a number of words other than 4

**Multiple Words Prediction**

In [None]:
# Flatten the validation contexts row by row into one long list of word indices.
# NOTE(review): val_set rows were selected through a random permutation, so
# neighbouring rows are not consecutive text — confirm that sliding windows
# over this flattened list are the intended evaluation data.
val_data = [token for row in val_set for token in row]
print(len(val_data))
print(val_data[:10])

17544
[39, 1295, 85, 52, 21, 41, 4, 28, 374, 686]


In [None]:
size = 4  # context words fed to the model
pred = 3  # words to be predicted after each context
val_words = []
target_val = []
# Stop early enough that every context is followed by a full `pred`-word
# target. Running up to len(val_data) left the last contexts with 2 and 1
# target words, which made np.asarray(target_val) ragged (a deprecation warning
# on older NumPy, an error on newer releases) and left fewer targets than
# predictions.
for i in range(size, len(val_data) - pred + 1):
  val_words.append(val_data[i - size:i])
  target_val.append(val_data[i:i + pred])
print(len(val_words), len(target_val))
print(val_words[:2])
print(target_val[:2])

17540 17540
[[39, 1295, 85, 52], [1295, 85, 52, 21]]
[[21, 41, 4], [41, 4, 28]]


In [None]:
# Array versions of the evaluation contexts (X) and their target words (y).
X, y = np.asarray(val_words), np.asarray(target_val)
print(X.shape, y.shape)

(17540, 4) (17540,)


  return array(a, dtype, copy=False, order=order)


In [None]:
# First predicted word: most likely vocabulary index for each context in X.
pred = model_1.predict(X)
pred_1 = [np.argmax(scores) for scores in pred]

In [None]:
# Extend every context in place with its first predicted word, then keep only
# the last 4 tokens (pad_sequences truncates from the front by default).
# Note: this mutates val_words, so the cell must run exactly once per pass.
for context, next_word in zip(val_words, pred_1):
  context.append(next_word)
X = pad_sequences(np.asarray(val_words), maxlen=4)
X.shape

(17540, 4)

In [None]:
# Second predicted word: model_2 applied to the contexts shifted by one word.
pred = model_2.predict(X)
pred_2 = [np.argmax(scores) for scores in pred]

In [None]:
# Extend every context in place with its second predicted word, then keep only
# the last 4 tokens (pad_sequences truncates from the front by default).
# Note: this mutates val_words, so the cell must run exactly once per pass.
for context, next_word in zip(val_words, pred_2):
  context.append(next_word)
X = pad_sequences(np.asarray(val_words), maxlen=4)
X.shape

(17540, 4)

In [None]:
# Third predicted word: model_3 applied to the contexts shifted by two words.
pred = model_3.predict(X)
pred_3 = [np.argmax(scores) for scores in pred]

**BAGGING**<br>
Trained model_1, model_2, model_3 on same dataset without replacement<br>
**Prediction**

In [None]:
# Flatten the validation contexts row by row into one long list of word indices.
# NOTE(review): val_set rows were selected through a random permutation, so
# neighbouring rows are not consecutive text — confirm that sliding windows
# over this flattened list are the intended evaluation data.
val_data = [token for row in val_set for token in row]
print(len(val_data))
print(val_data[:10])

17544
[39, 1295, 85, 52, 21, 41, 4, 28, 374, 686]


In [None]:
size = 4  # context words fed to the model
pred = 3  # words to be predicted after each context
val_words = []
target_val = []
# Stop early enough that every context is followed by a full `pred`-word
# target. Running up to len(val_data) left the last contexts with 2 and 1
# target words, which made np.asarray(target_val) ragged (a deprecation warning
# on older NumPy, an error on newer releases) and left fewer targets than
# predictions.
for i in range(size, len(val_data) - pred + 1):
  val_words.append(val_data[i - size:i])
  target_val.append(val_data[i:i + pred])
print(len(val_words), len(target_val))
print(val_words[:2])
print(target_val[:2])

17540 17540
[[39, 1295, 85, 52], [1295, 85, 52, 21]]
[[21, 41, 4], [41, 4, 28]]


In [None]:
# Array versions of the evaluation contexts (X) and their target words (y).
X, y = np.asarray(val_words), np.asarray(target_val)
print(X.shape, y.shape)

(17540, 4) (17540,)


  return array(a, dtype, copy=False, order=order)


In [None]:
# First predicted word: most likely vocabulary index for each context in X.
pred = model_1.predict(X)
pred_1 = [np.argmax(scores) for scores in pred]

In [None]:
# Extend every context in place with its first predicted word, then keep only
# the last 4 tokens (pad_sequences truncates from the front by default).
# Note: this mutates val_words, so the cell must run exactly once per pass.
for context, next_word in zip(val_words, pred_1):
  context.append(next_word)
X = pad_sequences(np.asarray(val_words), maxlen=4)
X.shape

(17540, 4)

In [None]:
# Second predicted word for the contexts shifted by one word.
# NOTE(review): this section calls model_1 again here, whereas the boosting
# section uses model_2 for this step — confirm that is intended.
pred = model_1.predict(X)
pred_2 = [np.argmax(scores) for scores in pred]

In [None]:
# Extend every context in place with its second predicted word, then keep only
# the last 4 tokens (pad_sequences truncates from the front by default).
# Note: this mutates val_words, so the cell must run exactly once per pass.
for context, next_word in zip(val_words, pred_2):
  context.append(next_word)
X = pad_sequences(np.asarray(val_words), maxlen=4)
X.shape

(17540, 4)

In [None]:
# Third predicted word for the contexts shifted by two words.
# NOTE(review): this section calls model_1 again here, whereas the boosting
# section uses model_3 for this step — confirm that is intended.
pred = model_1.predict(X)
pred_3 = [np.argmax(scores) for scores in pred]

*Accuracy prediction*

In [None]:
# Interleave the three prediction lists so that every context contributes its
# first, second and third predicted word, in that order.
val_pred = [word for triple in zip(pred_1, pred_2, pred_3) for word in triple]
print(len(val_pred))

52620


In [None]:
# Flatten targets and predictions side by side so that position k of
# val_target always belongs to the same context and step as position k of
# val_pred. When a context has fewer than 3 target words, only that many of
# its predictions are kept. Cutting the flat prediction list to the target
# length instead shifted the pairing once a context with a short target was
# followed by another context.
val_target = []
val_pred = []
for targets, preds in zip(target_val, zip(pred_1, pred_2, pred_3)):
  val_target.extend(targets)
  val_pred.extend(preds[:len(targets)])
print(len(val_target))
print(val_target[:10])

52617
[21, 41, 4, 41, 4, 28, 4, 28, 374, 28]


In [None]:
from sklearn.metrics import accuracy_score

# Share of positions where the predicted word index equals the target index.
accuracy = accuracy_score(y_true=val_target, y_pred=val_pred)
print("Validation Accuracy : %0.2f"%(accuracy*100))

Validation Accuracy : 6.05


**Cosine Similarity**

In [None]:
# Mean GloVe cosine similarity between each target word and the word predicted
# at the same position.
UNKNOWN_SIMILARITY = 0.5  # score used when a pair cannot be compared

# index -> word mapping kept by the tokenizer; a dictionary lookup replaces the
# full scan over word_index that was done for every single token.
index_to_word = tokenizer.index_word

cos = 0
for target_idx, pred_idx in zip(val_target, val_pred):
  # .get() returns None for an index that has no word (e.g. 0) instead of
  # silently reusing the word found in a previous iteration.
  w1 = index_to_word.get(target_idx)
  w2 = index_to_word.get(pred_idx)
  if w1 in word2vec and w2 in word2vec:
    cos += word2vec.similarity(w1, w2)
  else:
    # Missing word or no GloVe vector: fall back to the fixed score rather
    # than catching every possible exception.
    cos += UNKNOWN_SIMILARITY
# Guard against an empty evaluation set.
similar = cos / len(val_target) if len(val_target) else 0.0

In [None]:
# Summed similarity first, then the mean per compared word pair.
print(cos, similar, sep="\n")

33129.58321860818
0.6296364904614132


---


**Automated Word Predictor**

**Do:**<br>


1.   Give a sentence or phrase as the input. It is better to give at least 4 words for better predictions.
2.   Give the target words to obtain prediction accuracy



Step:<br>
1. The input line is split into words and stored as a list
2. Target words also the same
3. Input and target words converted to sequence of numbers using tokenizer
4. Input given to model_1 and first output is stored
5. First output is appended with input and given to model_2
6. Second output obtained from model_2 is stored and appended to input
7. This is given to model_3 and third output is obtained
8. All three outputs are converted to string format

In [None]:
def predict_next_words(input_line, models, seq_len, pad_value=1):
  """Predict one word per model, feeding each prediction back into the context.

  Args:
    input_line: raw text typed by the user; it is split on whitespace and lowercased.
    models: models applied in order; each one predicts the next word.
    seq_len: number of word indices every model receives.
    pad_value: index used to left-pad inputs shorter than `seq_len`.

  Returns:
    The predicted words, one per model, in prediction order.

  Raises:
    KeyError: if a model predicts an index that has no entry in my_vocab.
  """
  tokens = [tok.lower() for tok in input_line.split()]
  seq = np.array(tokenizer.texts_to_sequences([tokens]))
  # Left-pad short inputs and keep only the last `seq_len` tokens of long ones.
  seq = pad_sequences(seq, maxlen=seq_len, dtype="int32", padding='pre', truncating='pre', value=pad_value)
  words = []
  for model in models:
    idx = int(np.argmax(model.predict(seq), axis=-1)[0])
    words.append(my_vocab[idx])
    # Append the prediction and drop the oldest token for the next model.
    seq = np.concatenate((seq, [[idx]]), axis=1)
    seq = pad_sequences(seq, maxlen=seq_len, dtype="int32", padding='pre', truncating='pre', value=pad_value)
  return words


# Prompt until the user types "Stop". For each line, predict three words with
# model_1, model_2 and model_3 and report the error against the typed target.
while True:
  input_line = input("Enter a line:")
  if input_line == "Stop":
    print("The End")
    break
  target_words = input("Enter next words:")
  try:
    pred_words = predict_next_words(input_line, (model_1, model_2, model_3), train_size - pred_size)
    # Same format as before: every word followed by a single space.
    pred = " ".join(pred_words) + " "
    print("Predicted words:", pred)
    # NOTE(review): `error` is not defined in the visible part of this
    # notebook; if it is missing, the handler below reports it.
    p_error = error(target_words, pred)
    print("Percent error: %0.2f"%(p_error))
  except Exception as exc:
    # Report the failure instead of skipping it silently. Catching Exception
    # rather than everything lets a kernel interrupt stop the loop.
    print("Could not process this input:", exc)
    continue