In [1]:
# importing the required libraries and reading our file.
library(keras)
library(readr)
library(stringr)
data <- read_file("data/rhyme.txt") %>% str_to_lower()

In [2]:
data

In [3]:
# defining tokenizer
tokenizer = text_tokenizer(num_words = 35,char_level = F)
tokenizer %>% fit_text_tokenizer(data)

In [4]:
tokenizer

<keras_preprocessing.text.Tokenizer>

In [5]:
# number of unique words in our corpus
cat("Number of unique words", length(tokenizer$word_index))

Number of unique words 37

In [6]:
# vocabulary
head(tokenizer$word_index)

In [7]:
# converting our corpus into a integer sequence
text_seqs <- texts_to_sequences(tokenizer, data)
str(text_seqs)

List of 1
 $ : int [1:48] 2 1 4 5 6 9 10 3 11 12 ...


In [8]:
text_seqs <- text_seqs[[1]]
length(text_seqs)

In [9]:
max(text_seqs)

In [10]:
# converting text sequence into an input(feature) and output(labels) sequences
input_sequence_length <- 2
feature <- matrix(ncol = input_sequence_length)
label <- matrix(ncol = 1)

for(i in seq(input_sequence_length, length(text_seqs))){
    if(i >= length(text_seqs)){
        break()
    }
    start_idx <- (i - input_sequence_length) +1
    end_idx <- i +1
    new_seq <-  text_seqs[start_idx:end_idx]
    feature <- rbind(feature,new_seq[1:input_sequence_length])
    label <- rbind(label,new_seq[input_sequence_length+1])
}
feature <- feature[-1,]
label <- label[-1,]

In [11]:
paste("Feature")
head(feature)
paste("label")
head(label)

0,1
2,1
1,4
4,5
5,6
6,9
9,10


In [12]:
# one-hot-encoding our label
label <- to_categorical(label,num_classes = tokenizer$num_words )

In [13]:
cat("Shape of features",dim(feature),"\n")
cat("Shape of label",length(label))

Shape of features 46 2 
Shape of label 1610

In [14]:
# creating our neural network
model <- keras_model_sequential()
model %>%
    layer_embedding(input_dim = tokenizer$num_words,output_dim = 10,input_length = input_sequence_length) %>%
    layer_lstm(units = 50) %>%
    layer_dense(tokenizer$num_words) %>%
    layer_activation("softmax")

summary(model)

Model: "sequential"
________________________________________________________________________________
Layer (type)                        Output Shape                    Param #     
embedding (Embedding)               (None, 2, 10)                   350         
________________________________________________________________________________
lstm (LSTM)                         (None, 50)                      12200       
________________________________________________________________________________
dense (Dense)                       (None, 35)                      1785        
________________________________________________________________________________
activation (Activation)             (None, 35)                      0           
Total params: 14,335
Trainable params: 14,335
Non-trainable params: 0
________________________________________________________________________________


In [15]:
# compiling the model
model %>% compile(
    loss = "categorical_crossentropy", 
    optimizer = optimizer_rmsprop(lr = 0.001),
    metrics = c('accuracy')
)

# training the model
model %>% fit(
  feature, label,
#   batch_size = 128,
  epochs = 500
)

In [16]:
# function to generate a sequence from a language model
generate_sequence <-function(model, tokenizer, input_length, seed_text, predict_next_n_words){
    input_text <- seed_text
    for(i in seq(predict_next_n_words)){
        encoded <- texts_to_sequences(tokenizer,input_text)[[1]]
        encoded <- pad_sequences(sequences = list(encoded),maxlen = input_length,padding = 'pre')
        yhat <- predict_classes(model,encoded, verbose=0)
        next_word <- tokenizer$index_word[[as.character(yhat)]]
        input_text <- paste(input_text, next_word)
    }
    return(input_text)
}


In [17]:
# using generate_sequence() to generate text from sequences
seed_1 = "Jack and"
cat("Text generated from seed 1: " ,generate_sequence(model,tokenizer,input_sequence_length,seed_1,11),"\n ")
seed_2 = "Jack fell"
cat("Text generated from seed 2: ",generate_sequence(model,tokenizer,input_sequence_length,seed_2,11))

Text generated from seed 1:  Jack and jill went up the hill to fetch a pail of water 
 Text generated from seed 2:  Jack fell down and broke his crown and jill went up the hill