# RNN

1. Text classification
2. Language Model

In [1]:
import os
import nltk
import math
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
import matplotlib.pyplot as plt
from collections import Counter
from itertools import chain
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from keras.models import Sequential, Model
# from keras.layers import Dense, Activation, Embedding, Dropout, BatchNormalization, Input, Add, Concatenate,\
#     Bidirectional, SimpleRNN, LSTM, GRU, TimeDistributed
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import regex
from torch import nn
from torch.utils.data import Dataset,DataLoader
import torch
from utils import *
stopwords = set(stopwords.words("english"))
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [2]:
def load_data(file_name):
    """
    :param file_name: a file name, type: str
    return a list of ids, a list of reviews, a list of labels
    https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
    """
    df = pd.read_csv(file_name)

    return df["id"], df["text"], df["label"]

def load_labels(file_name):
    """
    :param file_name: a file name, type: str
    return a list of labels
    """
    return pd.read_csv(file_name)["label"]

def write_predictions(file_name, pred):
    df = pd.DataFrame(zip(range(len(pred)), pred))
    df.columns = ["id", "label"]
    df.to_csv(file_name, index=False)
def tokenize(text):
    """
    :param text: a doc with multiple sentences, type: str
    return a word list, type: list
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    """
    return nltk.word_tokenize(text)

def lemmatize(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

def remove_punctuation(tokens):
    regpat = regex.compile(r"^[A-Za-z0-9\s]+")
    tokens = [i for i in tokens if regpat.search(i)]
    return tokens

def stem(tokens):
    """
    :param tokens: a list of tokens, type: list
    return a list of stemmed words, type: list
    e.g.
    Input: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    Output: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    """

    return [ps.stem(token).lower() for token in tokens]    
def get_feats_dict(feats, min_freq=-1, max_freq=-1, max_size=-1):
    """
    :param data: a list of features, type: list(list)
    :param min_freq: the lowest fequency that the fequency of a feature smaller than it will be filtered out, type: int
    :param max_freq: the highest fequency that the fequency of a feature larger than it will be filtered out, type: int
    :param max_size: the max size of feature dict, type: int
    return a feature dict that maps features to indices, sorted by frequencies
    # Counter document: https://docs.python.org/3.6/library/collections.html#collections.Counter
    """
    # count all features
    feat_cnt = Counter(feats) # ["text", "text", "mine"] --> {"text": 2, "mine": 1}
    if max_size > 0 and min_freq == -1 and max_freq == -1:
        valid_feats = ["<pad>", "<unk>"] + [f for f, cnt in feat_cnt.most_common(max_size-2)]
    else:
        valid_feats = ["<pad>", "<unk>"]
        for f, cnt in feat_cnt.most_common():
            #here the feature is added only if it is lesser than the max limit and greater than the min limit
            if (min_freq == -1 or cnt >= min_freq) and \
                (max_freq == -1 or cnt <= max_freq):
                valid_feats.append(f)
    if max_size > 0 and len(valid_feats) > max_size:
        #gets only the features until max_size
        valid_feats = valid_feats[:max_size]
    print("Size of features:", len(valid_feats))
    
    # build a mapping from features to indices
    feats_dict = dict(zip(valid_feats, range(len(valid_feats))))
    return feats_dict

def get_index_vector(feats, feats_dict, max_len):
    """
    :param feats: a list of features, type: list
    :param feats_dict: a dict from features to indices, type: dict
    :param feats: a list of features, type: list
    return a feature vector,
    """
    # initialize the vector as all zeros
    #If we were to pad the shorter sentences, then we can add <pad> to the rest of the positions.
    #Then in embedding we can set this as the padding_idx, so that word embeddings are 0 for this position
    vector = np.zeros(max_len, dtype=np.int64)
    for i, f in enumerate(feats):
        #Only takes in 50 features from a sentence at maximum
        if i == max_len:
            break
        # get the feature index, return 1 (<unk>) if the feature is not existed
        #1 is the position of (<unk>)
        f_idx = feats_dict.get(f, 1)
        vector[i] = f_idx
    return vector
def create_one_hot_encode_labels(labels,num_classes):
    """
    :params labels: all the labels to be one hot encoded
    """
    one_hot_vector = np.zeros(shape = (labels.shape[0],num_classes))
    labels = labels - 1
    one_hot_vector[range(len(labels)),labels] = 1
    return one_hot_vector

In [3]:
train_file = "./data/train.csv"
test_file = "./data/test.csv"
ans_file = "./data/ans.csv"
pred_file = "./data/pred.csv"
min_freq = 3

# load data
train_ids, train_texts, train_labels = load_data(train_file)
test_ids, test_texts, _ = load_data(test_file)
test_labels = load_labels(ans_file)
train_labels = train_labels - 1
test_labels = test_labels - 1
# extract features
train_tokens = [tokenize(text) for text in train_texts]
test_tokens = [tokenize(text) for text in test_texts]

train_stemmed = [lemmatize(tokens) for tokens in train_tokens]
test_stemmed = [lemmatize(tokens) for tokens in test_tokens]

train_stemmed = [remove_punctuation(tokens) for tokens in train_stemmed]
test_stemmed = [remove_punctuation(tokens) for tokens in test_stemmed]


train_feats = train_stemmed
test_feats = test_stemmed

In [4]:
# build a mapping from features to indices
feats_dict = get_feats_dict(
    chain.from_iterable(train_feats),
    min_freq=min_freq)

Size of features: 5137


## RNN architecture

In this tutorial, we will try to use the recurrent neural network for text classification.

![RNN for Text](rnn_for_text.png)

The RNN consists of three parts: (1) the word representation part, (2) the recurrent part, and (3) the fully connected part. The word representation part is the word embedding layer; the recurrent part includes multiple (bi-directional) recurrent layers to memorize and summarize contextualized word features; the fully connected part utilizes a multi-layer perceptron to make predictions.


### Formula

Input: $[w_1, w_2, \cdots, w_n]$

Model: 
1. Embedding layer: $[e_1, e_2, \cdots, e_n]$
2. RNN -> $[h_1, h_2, \cdots, h_n]$
3. Retrieve the last hidden state $h_n$ as the output embedding for the whole sentence.

Output layer:
1. Dense layer for classification

In [5]:
len(feats_dict)

5137

In [6]:
max_len = 50
train_feats_matrix = np.vstack([get_index_vector(feats,feats_dict,max_len) for feats in train_feats])
testfeats_matrix = np.vstack([get_index_vector(feats,feats_dict,max_len) for feats in test_feats])

In [7]:
train_feats_matrix.shape

(2000, 50)

In [8]:
#create one hot labels
# train_labels_one_hot = create_one_hot_encode_labels(train_labels,5)
# test_labels_one_hot = create_one_hot_encode_labels(test_labels,5)

In [9]:
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


In [10]:
class sentiDataset(Dataset):
    def __init__(self,data,labels):
        self.data = data
        self.labels = labels
    def __getitem__(self, index):
        return self.data[index,:],self.labels[index]
    def __len__(self):
        return len(self.data)

In [11]:
overfit_ds = sentiDataset(train_feats_matrix[:10,:],train_labels[:10])
overfit_dl = DataLoader(overfit_ds,batch_size=2)
train_ds = sentiDataset(train_feats_matrix,train_labels)
train_dl = DataLoader(train_ds,batch_size=10)
test_ds = sentiDataset(testfeats_matrix,test_labels)
test_dl = DataLoader(test_ds,batch_size=10)

In [12]:
class sentiModel(nn.Module):
    #input vector for each document will be a vector of length 50. Each element corresponds to position of word 
    #in the feature dictionary
    def __init__(self,**kwargs):
        # vocab_size,embedding_dim,hidden_size,num_rnn_layers = 1,bidirectional = False
        super(sentiModel,self).__init__()
        self.bidirectional = kwargs.pop('bidirectional',False)
        self.hidden_dim = kwargs.pop('hidden_size',50)
        self.vocab_size = kwargs.pop('vocab_size',50)
        self.num_layers = kwargs.pop('num_rnn_layers',1)
        self.embedding_dim = kwargs.pop('embedding_dim',50)
        self.num_classes = kwargs.pop('num_classes',None)
        self.num_batches = kwargs.pop('num_batches',10)
        #input will be a vector of length 50
        self.embedding_layer = nn.Embedding(num_embeddings=self.vocab_size,embedding_dim=self.embedding_dim)
        #output will be a vector of length 50x50 from embedding layer (50 words with each word embedded in a 50 dimension vector)
        #LSTM
        self.lstm = nn.LSTM(input_size = 50,hidden_size=self.hidden_dim,num_layers = self.num_layers, bidirectional = self.bidirectional,batch_first = True)
        #Fully connected layer
        if self.bidirectional:
            self.fcl = nn.Linear(in_features=self.hidden_dim*2,out_features=self.num_classes)
        else:
            self.fcl = nn.Linear(in_features=self.hidden_dim,out_features=self.num_classes)


    def forward(self,X,h0,c0):
        #out will be of shape [N,50,50]
        self.output_dict = {}
        out = self.embedding_layer(X)
        lstm_out,hidden = self.lstm(out,(h0,c0))
        #Take the final states of the backward and forward run, combine and send to a fully connected layer for classification
        if self.bidirectional:
            bidrectional_state = hidden[0][0,:,:]
            forward_state = hidden[0][1,:,:]
            #out will be now a [input_sizex(2*hidden_size)] vector
            out = torch.concat((forward_state,bidrectional_state),dim=1)
        else:
            out = hidden[0][0,:,:]
        # for layer in self.lstm_layers:
        #     out = self.lstm_layers[layer](out)
        out = self.fcl(out)
        return out

    def init_hidden(self, batch_size):
        if self.bidirectional:
            num_layers = 2*self.num_layers
        else:
            num_layers = self.num_layers
        h0 = torch.zeros((num_layers,batch_size,self.hidden_dim)).to(device)
        c0 = torch.zeros((num_layers,batch_size,self.hidden_dim)).to(device)
        return h0,c0

Overfitting Dataset

In [13]:
config = {'bidirectional':True,'hidden_size':50,'vocab_size':len(feats_dict),'num_rnn_layers':1,'embedding_dim':50,'num_classes':5,'num_batches':2}
model = sentiModel(**config)
model.to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr = 1e-3)
train_loss_list,val_loss_list,model,accuracy_list_training,accuracy_list_validation = train_utils.train_epochs(10,model,overfit_dl,test_dl,optimizer,loss_fn,device,2,scheduler = None,decay_epochs = None)


# for batch,(X,y) in enumerate(train_dl):
#     print(X.shape)
#     print(y.shape)
#     X = X.to(device)
#     y = y.to(device)
#     print(y)
#     h0,c0 = model.init_hidden(10)
#     print(h0.shape)
#     print(c0.shape)
#     out = model(X,h0,c0)
#     print(out.shape)
#     loss = loss_fn(out,y)
#     break

TypeError: forward() missing 2 required positional arguments: 'h0' and 'c0'

In [None]:
rrrr

tensor([3, 3, 3, 1, 2, 3, 1, 4, 1, 3], device='cuda:0')

In [None]:
y

tensor([4, 3, 4, 4, 4, 2, 4, 4, 2, 3], device='cuda:0')

In [None]:
test = hidden[0]
test.shape

torch.Size([2, 10, 50])

In [None]:
torch.concat((test[0,:,:],test[1,:,:]),dim=1)

tensor([[ 5.3580e-02,  1.3755e-01,  9.5061e-02, -6.5775e-02,  1.2049e-01,
          1.3848e-01, -8.1869e-02,  3.8778e-02,  4.7551e-02, -1.4510e-02,
          1.2054e-01, -1.5620e-01,  7.4520e-02,  1.4701e-01,  1.0223e-01,
         -7.5184e-02,  1.5924e-02, -1.5599e-03,  8.1864e-02,  1.0590e-01,
         -1.1246e-01,  9.4394e-02,  1.9615e-01, -2.4086e-01, -2.9915e-01,
          8.5217e-02,  8.5487e-02,  1.4845e-01,  5.5436e-02, -4.1413e-03,
          1.0280e-01,  1.3188e-01, -1.7276e-01,  1.5872e-01,  8.1208e-02,
          1.5225e-01, -1.7707e-01, -3.8176e-02,  1.3098e-01,  6.1392e-03,
          1.0095e-02, -2.8954e-02,  1.1388e-01, -5.4023e-02,  1.2452e-01,
          1.7050e-01, -8.0620e-02, -6.5026e-02,  4.2094e-02,  2.4557e-01,
         -9.2265e-02,  1.2225e-01, -1.0339e-01, -1.8640e-01, -9.5648e-02,
         -2.5018e-02,  3.3677e-01,  8.6114e-02, -1.9252e-02,  3.3813e-03,
         -8.3440e-02,  6.5190e-02,  3.0300e-01, -7.3911e-03, -2.3086e-02,
         -2.3794e-01,  9.2190e-02, -1.

In [None]:
out[:,0,:]

tensor([[ 1.4906e-01, -1.4046e-01, -4.5302e-02, -1.9351e-02,  7.6776e-02,
          1.2641e-01, -9.2020e-02, -5.4868e-02, -1.3931e-01, -1.8051e-01,
          1.4200e-01, -3.7571e-02,  1.5239e-01,  1.8419e-01, -6.1538e-02,
         -9.9384e-02,  9.7652e-04,  4.2894e-03, -2.0926e-01,  6.9417e-02,
         -1.1370e-01, -8.3564e-02, -2.5141e-02, -1.3878e-02, -2.0062e-01,
         -8.5827e-03,  5.9929e-02,  1.4097e-01,  1.4491e-01, -1.7105e-01,
          1.0871e-01, -1.0890e-02,  1.9896e-01, -8.0862e-02,  7.6227e-02,
          3.4171e-02, -1.7284e-01,  1.5071e-01,  6.8803e-02, -3.0078e-02,
          1.3711e-01, -1.0802e-01,  4.4750e-02, -4.2957e-02,  5.1549e-02,
         -5.4431e-02, -6.1013e-02, -7.6933e-02,  1.0767e-01,  1.0803e-01,
         -9.2265e-02,  1.2225e-01, -1.0339e-01, -1.8640e-01, -9.5648e-02,
         -2.5018e-02,  3.3677e-01,  8.6114e-02, -1.9252e-02,  3.3813e-03,
         -8.3440e-02,  6.5190e-02,  3.0300e-01, -7.3911e-03, -2.3086e-02,
         -2.3794e-01,  9.2190e-02, -1.

In [None]:
out[:,-1,:]

tensor([[ 5.3580e-02,  1.3755e-01,  9.5061e-02, -6.5775e-02,  1.2049e-01,
          1.3848e-01, -8.1869e-02,  3.8778e-02,  4.7551e-02, -1.4510e-02,
          1.2054e-01, -1.5620e-01,  7.4520e-02,  1.4701e-01,  1.0223e-01,
         -7.5184e-02,  1.5924e-02, -1.5599e-03,  8.1864e-02,  1.0590e-01,
         -1.1246e-01,  9.4394e-02,  1.9615e-01, -2.4086e-01, -2.9915e-01,
          8.5217e-02,  8.5487e-02,  1.4845e-01,  5.5436e-02, -4.1413e-03,
          1.0280e-01,  1.3188e-01, -1.7276e-01,  1.5872e-01,  8.1208e-02,
          1.5225e-01, -1.7707e-01, -3.8176e-02,  1.3098e-01,  6.1392e-03,
          1.0095e-02, -2.8954e-02,  1.1388e-01, -5.4023e-02,  1.2452e-01,
          1.7050e-01, -8.0620e-02, -6.5026e-02,  4.2094e-02,  2.4557e-01,
         -9.7406e-02, -1.2245e-01, -9.2123e-02,  6.6646e-02,  5.2326e-02,
          2.1714e-01,  2.6614e-01, -5.6376e-02, -3.8756e-02, -7.0547e-03,
         -1.8122e-01, -1.8313e-01,  1.9253e-02, -1.2013e-01,  1.0899e-01,
         -1.3712e-02, -1.4755e-01, -1.

In [None]:
for name,param in model.named_parameters():
    print(name)
    print(getattr(model, name.split('.')[0]))

embedding_layer.weight
Embedding(50, 50)
lstm.weight_ih_l0
LSTM(50, 50, num_layers=2, batch_first=True, bidirectional=True)
lstm.weight_hh_l0
LSTM(50, 50, num_layers=2, batch_first=True, bidirectional=True)
lstm.bias_ih_l0
LSTM(50, 50, num_layers=2, batch_first=True, bidirectional=True)
lstm.bias_hh_l0
LSTM(50, 50, num_layers=2, batch_first=True, bidirectional=True)
lstm.weight_ih_l0_reverse
LSTM(50, 50, num_layers=2, batch_first=True, bidirectional=True)
lstm.weight_hh_l0_reverse
LSTM(50, 50, num_layers=2, batch_first=True, bidirectional=True)
lstm.bias_ih_l0_reverse
LSTM(50, 50, num_layers=2, batch_first=True, bidirectional=True)
lstm.bias_hh_l0_reverse
LSTM(50, 50, num_layers=2, batch_first=True, bidirectional=True)
lstm.weight_ih_l1
LSTM(50, 50, num_layers=2, batch_first=True, bidirectional=True)
lstm.weight_hh_l1
LSTM(50, 50, num_layers=2, batch_first=True, bidirectional=True)
lstm.bias_ih_l1
LSTM(50, 50, num_layers=2, batch_first=True, bidirectional=True)
lstm.bias_hh_l1
LSTM(50,

In [None]:
import regex as re

text = "Hi I'm , #&&&lolol"

re.sub(r"[^a-zA-Z0-9\s]+","",text)

'Hi Im  lolol'

In [None]:
layer = nn.Embedding(num_embeddings=len(feats_dict),embedding_dim=50)
layer(torch.tensor(train_feats_matrix[0])).shape

torch.Size([50, 50])

In [None]:
train_feats_matrix[0]

array([  42,   47, 1109,    5,  210,   12, 2311,    3,   80,   25,   41,
        390,    6, 2312,   61,    1,  327,  316, 2585,   22,    2,  384,
          8,    2,  418,   27, 2313,    3,   28,   80,   25,  183,    5,
          1,   11,    2,    1, 2087,    4,  243,    1,    1,   22,  675,
          3, 1145,   61, 2088,   22,  665], dtype=int64)

# Language Model

An RNN Language model is provided here.

Input:
- word tokens $[w_1, w_2, \cdots, w_n]$

Model：
- embedding layer: get the representation of all the words as $[e_1, e_2, \cdots, e_n]$.
- RNN: get the hidden representation of the sentence $[h_1, h_2, \cdots, h_n]$.
- Objective: Minimize the log probability of the sentence.


**Chain rule:**

$P(w_1w_2\cdots w_n) = P(w_1)P(w_2|w_1)P(w_3|w_1w_2)\cdots$

**Markov approximation**

$P(w_1w_2\cdots w_n) \approx P(w_1|w_0)P(w_2|w_1)P(w_3|w_2)\cdots$

$P(w_1w_2\cdots w_n) \approx \prod_{i=1}^{n}P(w_i|h_i)$

In [None]:
from ptb_loader import load_data
from keras.callbacks import ModelCheckpoint, Callback

class TestCallback(Callback):
    """
    Calculate Perplexity
    """
    def __init__(self, test_data, model):
        self.test_data = test_data
        self.model = model
    def on_epoch_end(self, epoch, logs={}):
        x, y = self.test_data
        x_probs = self.model.predict(x)
        ppl = self.evaluate_batch_ppl(x_probs,y)
        print('\nValidation Set Perplexity: {0:.2f} \n'.format(ppl))
    @staticmethod
    def evaluate_ppl(x, y):
        x = x.reshape(-1, x.shape[-1])
        y = y.reshape(-1)
        return np.exp(np.mean(-np.log(np.diag(x[:, y]))))
    def evaluate_batch_ppl(self, x, y):
        eval_batch_size = 8
        x = x.reshape(-1, x.shape[-1])
        y = y.reshape(-1)
        ppl = 0.0
        for i in range(math.ceil(len(x)/eval_batch_size)):
            batch_x = x[i*eval_batch_size:(i+1)*eval_batch_size,:]
            batch_y = y[i*eval_batch_size:(i+1)*eval_batch_size]
            ppl += np.sum(np.log(np.diag(batch_x[:, batch_y])))
        return np.exp(-ppl/x.shape[0])

print('Loading data')
x_train, y_train, x_valid, y_valid, vocabulary_size, vocab = load_data()

print(x_train.shape)
print(y_train.shape)

num_training_data = x_train.shape[0]
sequence_length = x_train.shape[1]

print('Vocab Size',vocabulary_size)

Loading data
(32389, 30)
(32389, 30, 1)
Vocab Size 9860


In [None]:
# training parameters
drop = 0.5
epochs = 10
batch_size = 8
embedding_dim = 10

# lstm parameters
hidden_size = 10

inputs = Input(shape=(sequence_length,), dtype='int32')
# inputs -> [batch_size, sequence_length]

emb_layer = Embedding(input_dim=vocabulary_size, 
                    output_dim=embedding_dim, 
                    input_length=sequence_length)
# emb_layer.trainable = False
# if you uncomment this line, the embeddings will be untrainable

embedding = emb_layer(inputs)
# embedding -> [batch_size, sequence_length, embedding_dim]

drop_embed = Dropout(drop)(embedding) 
# dropout at embedding layer

# add a LSTM here, set units=hidden_size, return_sequences=True
# Boolean. Whether to return the last output. in the output sequence, or the full sequence.
lstm_out_1 = LSTM(units=hidden_size, return_sequences=True)(drop_embed)
# NER [tag1, tag2, tag3, ...]
# output: lstm_out_1 -> [batch_size, sequence_length, hidden_size]


# add a TimeDistributed here, set layer = Dense(units=vocabulary_size,activation='softmax')
# please read  https://keras.io/layers/wrappers/
# output: outputs -> [batch_size, sequence_length, vocabulary_size]
outputs = TimeDistributed(Dense(units=vocabulary_size,
    activation='softmax'))(lstm_out_1)
# [batch_size, sequence_length, output_size]

# End of Model Architecture
# ----------------------------------------#

In [None]:
model = Model(inputs=inputs, outputs=outputs)

adam = keras.optimizers.Adam()
model.compile(loss='sparse_categorical_crossentropy', optimizer=adam)

print(model.summary())

print("Traning Model...")
history = model.fit(
        x_train, 
        y_train, 
        batch_size=batch_size, 
        epochs=epochs,
        verbose=1,
        callbacks=[TestCallback((x_valid,y_valid),model=model)])

Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 30)]              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 30, 10)            98600     
_________________________________________________________________
dropout_2 (Dropout)          (None, 30, 10)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 30, 10)            840       
_________________________________________________________________
time_distributed (TimeDistri (None, 30, 9860)          108460    
Total params: 207,900
Trainable params: 207,900
Non-trainable params: 0
_________________________________________________________________
None
Traning Model...
Epoch 1/10
Validation Set Perplexity: 450.09 

Epoch 2/10
Validation Set Perplexity: 365.04 

## Test perplexity

In [None]:
sent_1 = [vocab[s] for s in "i visited the campus last monday".split()]
sent_2 = [vocab[s] for s in "i visited the campus last pizza".split()]
sent_1_input = np.expand_dims(np.array(sent_1 + [0] * (x_train.shape[1]-len(sent_1))), 0)
sent_2_input = np.expand_dims(np.array(sent_2 + [0] * (x_train.shape[1]-len(sent_2))), 0)
sent_1_y = np.expand_dims([sent_1[1:]+[sent_1[0]]], -1)
sent_2_y = np.expand_dims([sent_2[1:]+[sent_2[0]]], -1)

In [None]:
print("perplexity of sentence 1:", TestCallback.evaluate_ppl(model.predict(sent_1_input)[:len(sent_1)], 
                          sent_1_y))
print("perplexity of sentence 2:", TestCallback.evaluate_ppl(model.predict(sent_2_input)[:len(sent_1)], 
                          sent_2_y))

perplexity of sentence 1: 2119.398
perplexity of sentence 2: 3703.2559
