In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
import numpy as np
import itertools
from glove import Corpus, Glove

In [None]:
# Module-level tokenizer shared by the cells below: dataset_preparation() fits
# it on the corpus, and the embedding / text-generation helpers read its
# word_index afterwards.
tokenizer = Tokenizer()

In [None]:
def training_glove_weights(data):
    """Train a GloVe model on ``data`` and save it to ``glove.model``.

    Args:
        data: Either the raw corpus as a single newline-separated string, or
            an iterable of already-tokenised documents (lists of word
            strings).

    Side effects:
        Writes the fitted model, together with its word dictionary, to
        ``glove.model`` in the current working directory.
    """
    if isinstance(data, str):
        # Corpus.fit iterates over documents and then over the tokens of each
        # document. Handed a bare string it would treat every character as a
        # one-token document, so split the text into lower-cased lines of
        # whitespace-separated words first (same line split and lower-casing
        # as dataset_preparation).
        data = [line.split() for line in data.lower().split("\n")]
    corpus = Corpus()
    corpus.fit(data, window=10)
    glove = Glove(no_components=5, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    # The dictionary must be attached before saving so the word -> id mapping
    # is stored with the model.
    glove.add_dictionary(corpus.dictionary)
    glove.save('glove.model')

In [None]:
def getting_glove_weights():
    """Load the GloVe model saved as ``glove.model`` and return its dictionary.

    Returns:
        The ``dictionary`` attribute of the loaded model.

    NOTE(review): despite the function name, this returns the model's
    ``dictionary`` rather than its vectors -- confirm what callers expect.
    """
    return Glove.load('glove.model').dictionary

In [None]:
def getting_pretrained_glove_weights(glove_path='glove.6B/glove.6B.100d.txt',
                                     vocabulary_size=None,
                                     embedding_dim=100,
                                     word_index=None):
    """Build an embedding matrix from a pre-trained GloVe text file.

    Each line of the file is expected to hold a word followed by its vector
    components, separated by whitespace.

    Args:
        glove_path: Path to the GloVe text file.
        vocabulary_size: Number of rows in the returned matrix. When omitted,
            a notebook-level ``vocabulary_size`` is used if one is defined,
            otherwise ``len(word_index) + 1`` (the same value
            dataset_preparation returns as ``total_words``).
        embedding_dim: Number of vector components per word in the file.
        word_index: Mapping of word -> row index. Defaults to the shared
            ``tokenizer.word_index``.

    Returns:
        A ``(vocabulary_size, embedding_dim)`` numpy array. Rows for words
        missing from the GloVe file stay all-zero.
    """
    if word_index is None:
        word_index = tokenizer.word_index
    if vocabulary_size is None:
        # Keep working for notebooks that define a global ``vocabulary_size``;
        # otherwise size the matrix to cover every index in word_index.
        vocabulary_size = globals().get('vocabulary_size', len(word_index) + 1)

    embeddings_index = {}
    # Explicit UTF-8 so parsing does not depend on the platform's default
    # encoding; ``with`` closes the file even if a line fails to parse.
    with open(glove_path, encoding='utf-8') as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                continue  # tolerate blank lines
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    embedding_matrix = np.zeros((vocabulary_size, embedding_dim))
    for word, index in word_index.items():
        # Skip (rather than stop at) out-of-range indices so the result does
        # not depend on the iteration order of word_index.
        if index >= vocabulary_size:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
    return embedding_matrix

In [None]:
def dataset_preparation(data):
    """Turn raw text into next-word-prediction training arrays.

    The text is lower-cased and split on newlines, the shared ``tokenizer`` is
    fitted on those lines, and every prefix of at least two tokens of each
    encoded line becomes one example: all tokens but the last are the input,
    the last token is the target.

    Args:
        data: The corpus as a single string with one sentence per line.

    Returns:
        A tuple ``(predictors, labels, max_sequence_len, total_words)`` where
        ``predictors`` holds the pre-padded inputs, ``labels`` the one-hot
        targets over ``total_words`` classes, ``max_sequence_len`` the length
        of the longest example (input plus target) and ``total_words`` is
        ``len(tokenizer.word_index) + 1``.
    """
    lines = data.lower().split("\n")
    tokenizer.fit_on_texts(lines)
    total_words = len(tokenizer.word_index) + 1

    # One example per prefix of length >= 2 of every encoded line.
    input_sequences = [
        encoded[:end]
        for encoded in tokenizer.texts_to_sequences(lines)
        for end in range(2, len(encoded) + 1)
    ]

    max_sequence_len = max(len(seq) for seq in input_sequences)
    padded = np.array(
        pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    )

    # Last column is the word to predict; everything before it is the context.
    predictors = padded[:, :-1]
    labels = ku.to_categorical(padded[:, -1], num_classes=total_words)
    return predictors, labels, max_sequence_len, total_words

In [None]:
def create_model(predictors,labels,max_sequence_len,total_words,epochs=100):
    """Build and train a two-layer LSTM next-word model with a learned embedding.

    Args:
        predictors: Padded input sequences, ``max_sequence_len - 1`` tokens
            per row.
        labels: One-hot next-word targets with ``total_words`` classes.
        max_sequence_len: Length of the longest training sequence including
            its target token.
        total_words: Vocabulary size; used for both the embedding input
            dimension and the softmax output width.
        epochs: Number of training epochs.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    input_len = max_sequence_len - 1
    model = Sequential()
    model.add(Embedding(total_words, 100, input_length=input_len))
    # The first LSTM must emit its full output sequence: a stacked LSTM needs
    # 3-D (batch, timesteps, features) input, and without return_sequences the
    # first layer would only pass on its final state.
    model.add(LSTM(512, return_sequences=True))
    model.add(LSTM(512))
    model.add(Dropout(0.1))
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    model.fit(predictors, labels, epochs=epochs, verbose=1)
    # summary() prints the layer table itself; printing the attribute without
    # calling it only shows the bound-method repr.
    model.summary()
    return model

In [None]:
def create_model_glove(predictors,labels,max_sequence_len,total_words,embedding_matrix,epochs=100):
    """Build and train a two-layer LSTM next-word model on frozen embeddings.

    Args:
        predictors: Padded input sequences, ``max_sequence_len - 1`` tokens
            per row.
        labels: One-hot next-word targets with ``total_words`` classes.
        max_sequence_len: Length of the longest training sequence including
            its target token.
        total_words: Vocabulary size; used for both the embedding input
            dimension and the softmax output width.
        embedding_matrix: Array of shape ``(total_words, embedding_dim)`` used
            as the fixed weights of the embedding layer.
        epochs: Number of training epochs.

    Returns:
        The fitted Keras ``Sequential`` model.

    Raises:
        ValueError: If ``embedding_matrix`` is not a 2-D array with
            ``total_words`` rows.
    """
    embedding_matrix = np.asarray(embedding_matrix)
    # Fail early with a readable message instead of a weight-shape error from
    # deep inside Keras (e.g. when a word dictionary is passed by mistake).
    if embedding_matrix.ndim != 2 or embedding_matrix.shape[0] != total_words:
        raise ValueError(
            "embedding_matrix must have shape (total_words, embedding_dim); "
            "got shape {} for total_words={}".format(embedding_matrix.shape, total_words)
        )
    # Take the embedding width from the matrix so vectors of any dimension
    # work, not only 100-d ones.
    embedding_dim = embedding_matrix.shape[1]

    input_len = max_sequence_len - 1
    model = Sequential()
    model.add(Embedding(total_words, embedding_dim, input_length=input_len,
                        weights=[embedding_matrix], trainable=False))
    # The first LSTM must emit its full output sequence: a stacked LSTM needs
    # 3-D (batch, timesteps, features) input.
    model.add(LSTM(512, return_sequences=True))
    model.add(LSTM(512))
    model.add(Dropout(0.1))
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    model.fit(predictors, labels, epochs=epochs, verbose=1)
    # summary() prints the layer table itself; printing the attribute without
    # calling it only shows the bound-method repr.
    model.summary()
    return model

In [None]:
def generate_conditioned_text(input_words,no_of_next_words,max_sequence_len,model):
    """Extend ``input_words`` by greedily predicting one word at a time.

    Args:
        input_words: Seed text to continue.
        no_of_next_words: Number of words to append.
        max_sequence_len: Sequence length used during training (including the
            target token); inputs are padded/truncated to one less than this.
        model: Trained model whose ``predict`` returns one score per
            vocabulary index.

    Returns:
        The seed text followed by the generated words, space-separated. If the
        predicted index has no entry in the vocabulary, an empty word is
        appended for that step.
    """
    # Invert the vocabulary once instead of scanning it for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    for _ in range(no_of_next_words):
        token_list = tokenizer.texts_to_sequences([input_words])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # Sequential.predict_classes is not available in recent Keras
        # releases; taking the argmax of predict() yields the same class index.
        scores = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(scores, axis=-1)[0])
        input_words += " " + index_to_word.get(predicted, "")
    return input_words

In [None]:
def generate_unconditioned_text(no_of_next_words,max_sequence_len,model):
    """Generate ``no_of_next_words`` words starting from no seed text.

    Args:
        no_of_next_words: Number of words to generate.
        max_sequence_len: Sequence length used during training (including the
            target token).
        model: Trained next-word model.

    Returns:
        The generated words, space-separated, without leading whitespace.
    """
    # An empty seed encodes to an empty token list, which pad_sequences turns
    # into a single all-padding row. Passing an empty *list of sequences*
    # instead would give the model an empty batch with nothing to predict.
    # Delegating also feeds every generated word back in as context for the
    # next prediction.
    generated = generate_conditioned_text("", no_of_next_words, max_sequence_len, model)
    return generated.lstrip()

In [None]:
# Build the training arrays from the raw text and train the baseline model,
# whose embedding layer is learned during training.
# NOTE(review): `data` is not assigned in any cell visible here -- it has to be
# loaded before this cell runs; confirm where it comes from.
predictors,labels,max_sequence_len,total_words = dataset_preparation(data)
model = create_model(predictors,labels,max_sequence_len,total_words)

In [None]:
# Train a second model whose embedding layer is initialised from the
# pre-trained GloVe vectors and kept frozen (trainable=False).
embedding_matrix = getting_pretrained_glove_weights()
glove_model = create_model_glove(predictors,labels,max_sequence_len,total_words,embedding_matrix)

In [None]:
# Train GloVe on the notebook's own text, reload the saved model, and train a
# third LSTM model on top of it.
# NOTE(review): getting_glove_weights() returns the loaded model's `dictionary`,
# yet the result is passed below as the embedding weights -- verify this
# before relying on glove_model2.
training_glove_weights(data)
embedding_matrix2 = getting_glove_weights()
glove_model2 = create_model_glove(predictors,labels,max_sequence_len,total_words,embedding_matrix2)

In [None]:
# Continue the seed "we are" by three words using the baseline model.
output = generate_conditioned_text("we are",3,max_sequence_len,model)
print(output)

In [None]:
# Generate three words with no seed text using the baseline model.
output = generate_unconditioned_text(3,max_sequence_len,model)
print(output)

In [None]:
# Continue the seed "we are" by three words using the model built on the
# pre-trained GloVe embeddings.
output = generate_conditioned_text("we are",3,max_sequence_len,glove_model)
print(output)

In [None]:
# Generate three words with no seed text using the model built on the
# pre-trained GloVe embeddings. generate_unconditioned_text takes no seed
# argument, so only (word count, sequence length, model) are passed.
output = generate_unconditioned_text(3,max_sequence_len,glove_model)
print(output)