## Implement LSTM

### Getting ready...

In [227]:
library(keras)
library(readr)
library(stringr)

Let's read our data and look at its sturcture

In [228]:
data <- read_file("data/rhyme.txt") %>% str_to_lower()

In [264]:
data

### How to do it..

In [229]:
input = as.array(data)
tokenizer = text_tokenizer(num_words = 40,char_level = F)
tokenizer %>% fit_text_tokenizer(input)

In [230]:
# saveRDS(object = tokenizer,"token_nietzsche.rds")

In [231]:
head(tokenizer$word_index)

In [232]:
text_seqs <- texts_to_sequences(tokenizer, input)

In [233]:
str(text_seqs)

List of 1
 $ : int [1:51] 2 1 4 5 6 9 10 3 11 12 ...


In [234]:
text_seqs <- text_seqs[[1]]

In [235]:
# text_seqs <- text_seqs[1:100]

In [236]:
length(text_seqs)

In [265]:
text_seqs[1:10]

In [238]:
train_length_of_sentence <- 2
feature <- matrix(ncol = train_length_of_sentence)
label <- matrix(ncol = 1)

In [239]:
# feature <- readRDS("feature_71.rds")
# label <- readRDS("label_71.rds")

In [240]:
for(i in seq(train_length_of_sentence, length(text_seqs))){
    if(i >= length(text_seqs)){
        break()
    }
    start_idx <- (i - train_length_of_sentence) +1
    end_idx <- i +1
    new_seq <-  text_seqs[start_idx:end_idx]
    feature <- rbind(feature,new_seq[1:train_length_of_sentence])
    label <- rbind(label,new_seq[train_length_of_sentence+1])
}
feature <- feature[-1,]
label <- label[-1,]

In [241]:
head(feature)

0,1
,
2.0,1.0
1.0,4.0
4.0,5.0
5.0,6.0
6.0,9.0


In [242]:
head(label)

0
""
4.0
5.0
6.0
9.0
10.0


In [244]:
dim(feature)

length(label)

In [246]:
label <- to_categorical(label,num_classes = tokenizer$num_words )

In [247]:
# saveRDS(object = feature,file = "feature_71.rds")
# saveRDS(label,file = "label_71.rds")

In [248]:
# train_index <- sample(1:nrow(feature), 0.8 * nrow(feature))
# test_index <- setdiff(1:nrow(feature), train_index)
# X_train <- feature[train_index,]
# y_train <- label[train_index]
# X_test <- feature[test_index,]
# y_test <- label[test_index]

# X_train <- to_categorical(X_train,num_classes = tokenizer$num_words )
# y_train <- to_categorical(y_train,num_classes = tokenizer$num_words )

# X_test <- to_categorical(X_test,num_classes = tokenizer$num_words )
# y_test <- to_categorical(y_test,num_classes = tokenizer$num_words )

In [249]:
cat("Shape of features",dim(feature),"\n")
cat("Shape of label",length(label))

Shape of features 49 2 
Shape of label 1960

In [250]:
model <- keras_model_sequential()

In [251]:
model %>%
    layer_embedding(input_dim = tokenizer$num_words,output_dim = 10,input_length = train_length_of_sentence) %>%
    layer_lstm(units = 50) %>%
    layer_dense(tokenizer$num_words) %>%
    layer_activation("softmax")

In [252]:
summary(model)

________________________________________________________________________________
Layer (type)                        Output Shape                    Param #     
embedding_5 (Embedding)             (None, 2, 10)                   400         
________________________________________________________________________________
lstm_5 (LSTM)                       (None, 50)                      12200       
________________________________________________________________________________
dense_5 (Dense)                     (None, 40)                      2040        
________________________________________________________________________________
activation_5 (Activation)           (None, 40)                      0           
Total params: 14,640
Trainable params: 14,640
Non-trainable params: 0
________________________________________________________________________________


In [253]:
model %>% compile(
    loss = "categorical_crossentropy", 
    optimizer = optimizer_rmsprop(lr = 0.001),
    metrics = c('accuracy')
)


In [254]:
model %>% fit(
  feature, label,
  batch_size = 128,
  epochs = 500
)

In [255]:
# save_model_hdf5(model,"lstm_v2.h5")
# library(keras)
# model <- load_model_hdf5(filepath = "lstm_v2.h5")
# summary(model)

In [256]:
# scores <- model %>% evaluate(
#   X_test, y_test,
#   batch_size = 32
# )

# cat('Test score:', scores[[1]],'\n')
# cat('Test accuracy', scores[[2]])

In [257]:
# generate a sequence from a language model
generate_sequence <-function(model, tokenizer, max_length, seed_text, n_words){
    input_text <- seed_text
    for(i in seq(n_words)){
        encoded <- texts_to_sequences(tokenizer,input_text)[[1]]
        encoded<- pad_sequences(sequences = list(encoded),maxlen = max_length,padding = 'pre')
        yhat <- predict_classes(model,encoded, verbose=0)
        next_word <- tokenizer$index_word[[as.character(yhat)]]
        input_text <- paste(input_text, next_word)
    }
    return(input_text)
}


In [258]:
seed = "Jack and"

In [259]:
generate_sequence(model,tokenizer,train_length_of_sentence,seed,11)

In [262]:
seed_2 = "Jack fell"

In [263]:
generate_sequence(model,tokenizer,train_length_of_sentence,seed_2,11)

## Part 2

In [None]:
data = read_csv("data/ArticlesFeb2018.csv")

We see that the headlines column is

In [None]:
data$headline <- as.character(data$headline)
headlines <- data$headline

In [None]:
headlines[1:10]

### How to do it...

In [None]:
input = as.array(headlines)

In [None]:
dim(input)

In [None]:
input[1]

In [None]:
tokenizer = text_tokenizer(num_words = 2000,char_level = F)
tokenizer %>% fit_text_tokenizer(input)

In [None]:
head(tokenizer$index_word)

In [None]:
text_seqs <- texts_to_sequences(tokenizer, input)

In [None]:
text_seqs[1:2]

In [None]:
input[1:2]

In [None]:
# Demonstrate mapping of index to words




In [None]:
length_of_headlines = sapply(headlines,function(x){sapply(strsplit(x, " "), length)})

In [None]:
headlines[4]

In [None]:
hist(x = length_of_headlines)

In [None]:
# Iterate through the sequences of tokens
train_length_of_sentence <- 5
feature <- matrix(ncol = train_length_of_sentence)
label <- matrix(ncol = 1)

In [None]:
for(headline in text_seqs){
    for(i in seq(train_length_of_sentence, length(headline))){
        if(i >= length(headline)){
            break()
        }
        start_idx <- (i - train_length_of_sentence) +1
        end_idx <- i +1
        new_seq <-  headline[start_idx:end_idx]
        feature <- rbind(feature,new_seq[1:train_length_of_sentence])
        label <- rbind(label,new_seq[train_length_of_sentence+1])
    }
}

In [None]:
head(feature)

In [None]:
head(label)

In [None]:
feature <- feature[-1,]
label <- label[-1,]

In [None]:
dim(feature)

In [None]:
length(label)

In [None]:
train_index <- sample(1:nrow(feature), 0.8 * nrow(feature))
test_index <- setdiff(1:nrow(feature), train_index)
X_train <- feature[train_index,]
y_train <- label[train_index]
X_test <- feature[test_index,]
y_test <- label[test_index]

In [None]:
X_train <- to_categorical(X_train,num_classes = tokenizer$num_words )
y_train <- to_categorical(y_train,num_classes = tokenizer$num_words )
X_test <- to_categorical(X_test,num_classes = tokenizer$num_words )
y_test <- to_categorical(y_test,num_classes = tokenizer$num_words )

In [None]:
cat("Shape of features",dim(X_train),"\n")
cat("Shape of features",dim(y_train))

In [None]:
model <- keras_model_sequential()

In [None]:
model %>%
  layer_lstm(8, input_shape = c(train_length_of_sentence, tokenizer$num_words)) %>%
  layer_dense(tokenizer$num_words) %>%
  layer_activation("softmax")

In [None]:
summary(model)

In [None]:
model %>% compile(
    loss = "categorical_crossentropy", 
    optimizer = optimizer_rmsprop(lr = 0.001),
    metrics = c('accuracy')
)


In [None]:
model %>% fit(
  X_train, y_train,
  batch_size = 128,
  epochs = 15
)

In [None]:
scores <- model %>% evaluate(
  X_test, y_test,
  batch_size = 32
)

cat('Test score:', scores[[1]],'\n')
cat('Test accuracy', scores[[2]])

### How it works...

### There is more...

### See also...

In [62]:
# library(keras)
# library(readr)
# library(stringr)
# library(purrr)
# library(tokenizers)

# # Parameters --------------------------------------------------------------

# maxlen <- 40

# # Data Preparation --------------------------------------------------------

# # Retrieve text
# path <- get_file(
#   'nietzsche.txt', 
#   origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt'
#   )

# # Load, collapse, and tokenize text
# text <- read_lines(path) %>%
#   str_to_lower() %>%
#   str_c(collapse = "\n") %>%
#   tokenize_characters(strip_non_alphanum = FALSE, simplify = TRUE)

# print(sprintf("corpus length: %d", length(text)))

# text <- text[1:10000]

# head(text)

# chars <- text %>%
#   unique() %>%
#   sort()

# print(sprintf("total chars: %d", length(chars)))  

# chars

# # Cut the text in semi-redundant sequences of maxlen characters
# dataset <- map(
#   seq(1, length(text) - maxlen - 1, by = 3), 
#   ~list(sentece = text[.x:(.x + maxlen - 1)], next_char = text[.x + maxlen])
#   )

# dataset <- transpose(dataset)

# head(dataset[[1]])

# str(dataset)

# length(dataset$sentece[[1]])

# length(dataset$sentece)

# 3320*40

# # Vectorization
# x <- array(0, dim = c(length(dataset$sentece), maxlen, length(chars)))
# y <- array(0, dim = c(length(dataset$sentece), length(chars)))

# for(i in 1:length(dataset$sentece)){
  
#   x[i,,] <- sapply(chars, function(x){
#     as.integer(x == dataset$sentece[[i]])
#   })
  
#   y[i,] <- as.integer(chars == dataset$next_char[[i]])
  
# }

# dim(x)

# dim(y)

# x[1,,]

# x[1,2,]

# class(x[1,2,])

# length(x[1,2,])

# y[1,]

# length(y[1,])

# # Model Definition --------------------------------------------------------

# model <- keras_model_sequential()

# model %>%
#   layer_lstm(128, input_shape = c(maxlen, length(chars))) %>%
#   layer_dense(length(chars)) %>%
#   layer_activation("softmax")

# optimizer <- optimizer_rmsprop(lr = 0.01)

# model %>% compile(
#   loss = "categorical_crossentropy", 
#   optimizer = optimizer
# )

# # Training & Results ----------------------------------------------------

# sample_mod <- function(preds, temperature = 1){
#   preds <- log(preds)/temperature
#   exp_preds <- exp(preds)
#   preds <- exp_preds/sum(exp(preds))
  
#   rmultinom(1, 1, preds) %>% 
#     as.integer() %>%
#     which.max()
# }

# on_epoch_end <- function(epoch, logs) {
  
#   cat(sprintf("epoch: %02d ---------------\n\n", epoch))
  
#   for(diversity in c(0.2, 0.5, 1, 1.2)){
    
#     cat(sprintf("diversity: %f ---------------\n\n", diversity))
    
#     start_index <- sample(1:(length(text) - maxlen), size = 1)
#     sentence <- text[start_index:(start_index + maxlen - 1)]
#     generated <- ""
    
#     for(i in 1:400){
      
#       x <- sapply(chars, function(x){
#         as.integer(x == sentence)
#       })
#       x <- array_reshape(x, c(1, dim(x)))
      
#       preds <- predict(model, x)
#       next_index <- sample_mod(preds, diversity)
#       next_char <- chars[next_index]
      
#       generated <- str_c(generated, next_char, collapse = "")
#       sentence <- c(sentence[-1], next_char)
      
#     }
    
#     cat(generated)
#     cat("\n\n")
    
#   }
# }

# print_callback <- callback_lambda(on_epoch_end = on_epoch_end)

# model %>% fit(
#   x, y,
#   batch_size = 128,
#   epochs = 1,
#   callbacks = print_callback
# )

In this section we will be working with -------- data set.It conta.........
We will start by importing the required libraries:

In [None]:
# library(keras)
# library(readr)
# library(stringr)
# # library(tm)
# # library(tokenizers)

# # Let's read our data and look at its sturcture

# data <- read_file("data/nietzsche_71.txt") %>% str_to_lower()

# data <- str_split(data, " ")

# str(data)

# data <- unlist(data)

# str(data)

# length(data)

# data <- data[1:100]

# train_length_of_sentence <- 20
# feature <- matrix(ncol = 1)
# label <- matrix(ncol = 1)

# for(i in seq(train_length_of_sentence, length(data))){
#     if(i >= length(data)){
#         break()
#     }
#     start_idx <- (i - train_length_of_sentence) +1
#     end_idx <- i +1
#     new_seq <-  data[start_idx:end_idx]
#     feature <- rbind(feature,paste(new_seq[1:train_length_of_sentence],collapse = " "))
#     label <- rbind(label,new_seq[train_length_of_sentence+1])
# }

# new_seq[train_length_of_sentence+1]

# # feature <- readRDS("feature_71.rds")

# # label <- readRDS("label_71.rds")

# head(feature)

# head(label)

# # feature <- feature[-1,]
# # label <- label[-1,]

# dim(feature)

# length(label)

# # saveRDS(object = feature,file = "feature_71.rds")

# # saveRDS(label,file = "label_71.rds")

# train_index <- sample(1:nrow(feature), 0.8 * nrow(feature))
# test_index <- setdiff(1:nrow(feature), train_index)
# X_train <- feature[train_index,]
# y_train <- label[train_index]
# X_test <- feature[test_index,]
# y_test <- label[test_index]

# str(data)

# tokenizer = text_tokenizer(num_words = 2000,char_level = F)
# tokenizer %>% fit_text_tokenizer(data)

# head(tokenizer$index_word)

# # train_seq_gen <- texts_to_sequences_generator(tokenizer,texts = c(X_train,y_train))

# # train_seq_gen_y <- texts_to_sequences_generator(tokenizer,y_train)

# X_train[1:2]

# y_train[1:2]

# texts_to_sequences(tokenizer,X_train[1:2,])

# sampling_generator <- function(X_data, Y_data = NULL, batch_size = 32) {
#     function() {
#       gc() # should blow up R if we are ever called on a background thread
#       rows <- sample(1:nrow(X_data), batch_size, replace = TRUE)
#       if (!is.null(Y_data))
#         list(texts_to_sequences(X_data[rows]), texts_to_sequences(Y_data[rows]))
#       else
#         list(texts_to_sequences(X_data[rows,]))
#     }
#   }

# model <- keras_model_sequential()

# model %>%
#   layer_lstm(8, input_shape = c(train_length_of_sentence, tokenizer$num_words)) %>%
#   layer_dense(tokenizer$num_words) %>%
#   layer_activation("softmax")

# summary(model)

# model %>% compile(
#     loss = "categorical_crossentropy", 
#     optimizer = optimizer_rmsprop(lr = 0.001),
#     metrics = c('accuracy')
# )


# model %>% fit_generator(generator = sampling_generator(X_data = X_train,Y_data = y_train,batch_size = 20),steps_per_epoch = 10)