In [1]:
# %load_ext autoreload
# %autoreload 2

# from IPython.core.interactiveshell import InteractiveShell
# from IPython.display import HTML

# InteractiveShell.ast_node_interactivity = 'all'

# from keras import Model
# from keras.models import load_model
# import warnings
# warnings.filterwarnings('ignore', category = RuntimeWarning)
# warnings.filterwarnings('ignore', category = UserWarning)

# BATCH_SIZE = 2048
# RANDOM_STATE = 50

# import numpy as np
# import pandas as pd
# from utils import get_model, find_closest, get_sequences, create_train_valid,  generate_output, guess_human

In [2]:
# Set up IPython to show all outputs from a cell
# (every top-level expression statement is rendered, not only the last one)
import warnings
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = 'all'

# Installed before pandas/numpy are imported below, so the filter is already
# active while those imports run.
warnings.filterwarnings('ignore', category=RuntimeWarning)

# Notebook-wide settings.
RANDOM_STATE = 50       # seed passed to sklearn's shuffle in create_train_valid
EPOCHS = 150            # not referenced by any cell visible in this notebook
BATCH_SIZE = 2048       # assigned again, with the same value, in the callbacks cell
TRAINING_LENGTH = 50    # number of tokens in each training example / seed
TRAIN_FRACTION = 0.7    # default train share used by create_train_valid
VERBOSE = 0             # not referenced by any cell visible in this notebook
SAVE_MODEL = True       # default for make_callbacks(save=...)
RNN_CELLS = 128         # width handed to make_word_level_model

import pandas as pd
import numpy as np

# Read in data
data = pd.read_csv(
    '../data/neural_network_patent_query.csv', parse_dates=['patent_date'])

# Extract abstracts
original_abstracts = list(data['patent_abstract'])

In [3]:
def make_sequences(texts,
                   training_length=50,
                   lower=True,
                   filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'):
    """Tokenize `texts` and cut them into next-word training examples.

    A `Tokenizer` is fitted on all of `texts`. Only texts whose token count is
    strictly greater than `training_length + 20` are kept; each kept sequence
    is then walked with a sliding window that yields `training_length` feature
    tokens followed by one label token.

    Args:
        texts: indexable collection of strings.
        training_length: number of feature tokens per example.
        lower: forwarded to `Tokenizer(lower=...)`.
        filters: forwarded to `Tokenizer(filters=...)`.

    Returns:
        Tuple `(word_idx, idx_word, num_words, word_counts, new_texts,
        new_sequences, training_seq, labels)`. `num_words` is
        `len(word_idx) + 1`; `new_texts` / `new_sequences` hold only the kept
        texts and their token sequences.

    Prints the vocabulary size and the number of training examples.
    """
    # Fit the vocabulary on the whole corpus
    tokenizer = Tokenizer(lower=lower, filters=filters)
    tokenizer.fit_on_texts(texts)

    # Forward / reverse look-ups and frequency table exposed by the tokenizer
    word_idx = tokenizer.word_index
    idx_word = tokenizer.index_word
    word_counts = tokenizer.word_counts
    num_words = len(word_idx) + 1

    print(f'There are {num_words} unique words.')

    # Integer-encode every text
    sequences = tokenizer.texts_to_sequences(texts)

    # Positions of the texts that are long enough to be used
    min_tokens = training_length + 20
    kept = [
        pos for pos, tokens in enumerate(sequences) if len(tokens) > min_tokens
    ]
    new_texts = [texts[pos] for pos in kept]
    new_sequences = [sequences[pos] for pos in kept]

    training_seq, labels = [], []

    for tokens in new_sequences:
        # One example per position that has a full window of history before it
        for stop in range(training_length, len(tokens)):
            window = tokens[stop - training_length:stop + 1]

            # Everything but the last token is the input, the last is the target
            training_seq.append(window[:-1])
            labels.append(window[-1])

    print(f'There are {len(training_seq)} training sequences.')

    return (word_idx, idx_word, num_words, word_counts, new_texts,
            new_sequences, training_seq, labels)

In [4]:
from keras.preprocessing.text import Tokenizer

# Demo sentence containing a figure reference "(1)", commas, full stops and a hyphen.
example = 'This is a short sentence (1) with one reference to an image. This next sentence, while non-sensical, does not have an image and has two commas.'
# Tokenizer whose filter list includes the common punctuation characters.
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts([example])
# Integer-encode the sentence, then map the integers back to words to see
# what survived tokenization (the displayed result is lower-cased and has
# no punctuation left).
s = tokenizer.texts_to_sequences([example])[0]
' '.join(tokenizer.index_word[i] for i in s)

Using TensorFlow backend.


'this is a short sentence 1 with one reference to an image this next sentence while non sensical does not have an image and has two commas'

In [5]:
# Same demo with a reduced filter list: ! ( ) , - . are no longer filtered,
# so they stay attached to the neighbouring word (see the displayed keys,
# e.g. 'image.' and 'sentence,').
tokenizer = Tokenizer(filters='"#$%&*+/:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts([example])
s = tokenizer.texts_to_sequences([example])[0]
' '.join(tokenizer.index_word[i] for i in s)
# Vocabulary learned from the sentence
tokenizer.word_index.keys()

'this is a short sentence (1) with one reference to an image. this next sentence, while non-sensical, does not have an image and has two commas.'

dict_keys(['this', 'an', 'is', 'a', 'short', 'sentence', '(1)', 'with', 'one', 'reference', 'to', 'image.', 'next', 'sentence,', 'while', 'non-sensical,', 'does', 'not', 'have', 'image', 'and', 'has', 'two', 'commas.'])

In [6]:
import re


def format_patent(patent):
    """Detach sentence punctuation from words and drop numeric figure references.

    Three passes are applied in order:

    1. A single space is inserted *before* each of ``. , ; ?`` when the
       preceding character is neither whitespace nor a digit, so the mark
       becomes its own token while numbers such as ``0.5`` stay intact.
    2. Parenthesised integers such as ``(12)`` are removed.
    3. Every run of two or more whitespace characters is collapsed to one
       space, including the longer runs left behind when neighbouring
       references are removed.

    Args:
        patent: the abstract text as a string.

    Returns:
        The reformatted string.
    """

    # Put a space in front of punctuation that directly follows a word character
    patent = re.sub(r'(?<=[^\s0-9])(?=[.,;?])', r' ', patent)

    # Remove references to figures
    patent = re.sub(r'\((\d+)\)', r'', patent)

    # Collapse whitespace runs of any length; matching exactly two characters
    # at a time would leave a double space behind for runs of three or more
    patent = re.sub(r'\s{2,}', ' ', patent)
    return patent


# Try the formatter on the demo sentence and display the result
f = format_patent(example)
f

'This is a short sentence with one reference to an image . This next sentence , while non-sensical , does not have an image and has two commas .'

In [7]:
# Format every original abstract. A comprehension keeps the loop variable out
# of the kernel namespace, so no stray global is left behind for later cells.
formatted = [format_patent(abstract) for abstract in original_abstracts]

len(formatted)

3522

In [8]:
# Same value as in the configuration cell; repeated here so the cell is explicit
TRAINING_LENGTH = 50

# Reduced filter list: characters such as . , ? ( ) - are not filtered, so the
# punctuation that format_patent spaced out is kept as tokens of its own.
filters = '!"%;[\\]^_`{|}~\t\n'
# lower=False keeps the original casing ('The' and 'the' are counted separately).
# `abstracts` / `sequences` only contain the texts make_sequences kept.
word_idx, idx_word, num_words, word_counts, abstracts, sequences, features, labels = make_sequences(
    formatted, TRAINING_LENGTH, lower=False, filters=filters)

There are 16192 unique words.
There are 318563 training sequences.


In [9]:
# The 15 most frequent tokens as (token, count) pairs, highest count first
sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:15]

[('the', 30760),
 ('a', 21442),
 ('of', 20157),
 ('.', 16554),
 (',', 15415),
 ('and', 12563),
 ('to', 12012),
 ('network', 7618),
 ('neural', 7235),
 ('is', 7211),
 ('for', 6779),
 ('in', 6131),
 ('The', 5813),
 ('an', 5286),
 ('data', 3971)]

In [10]:
from sklearn.utils import shuffle


def create_train_valid(features,
                       labels,
                       num_words,
                       train_fraction=TRAIN_FRACTION):
    """Shuffle the examples, split them, and one-hot encode the labels.

    Args:
        features: sequence of equal-length integer token lists.
        labels: sequence of integer word indices, aligned with `features`.
        num_words: width of the one-hot label rows; every label must be a
            valid column index for it.
        train_fraction: share of the shuffled examples that goes to training.

    Returns:
        `(X_train, X_valid, y_train, y_valid)`. The `X_*` arrays hold the
        feature rows; the `y_*` arrays are `int8` one-hot matrices of shape
        `(n_examples, num_words)`.
    """
    import gc

    # Shuffle features and labels together so each pair stays aligned;
    # the fixed seed makes the split reproducible
    features, labels = shuffle(features, labels, random_state=RANDOM_STATE)

    # Number of samples that go to training
    train_end = int(train_fraction * len(labels))

    # Convert each split to an array exactly once (a second np.array() call
    # would copy the whole matrix again)
    X_train = np.array(features[:train_end])
    X_valid = np.array(features[train_end:])

    # Explicit integer dtype so that an empty split still indexes cleanly
    train_labels = np.asarray(labels[:train_end], dtype=np.intp)
    valid_labels = np.asarray(labels[train_end:], dtype=np.intp)

    # Using int8 for memory savings
    y_train = np.zeros((len(train_labels), num_words), dtype=np.int8)
    y_valid = np.zeros((len(valid_labels), num_words), dtype=np.int8)

    # One-hot encoding: set column `label` of row `k` for every example at once
    y_train[np.arange(len(train_labels)), train_labels] = 1
    y_valid[np.arange(len(valid_labels)), valid_labels] = 1

    # Drop the shuffled copies and label arrays before returning. The
    # collector is only asked to run; its global on/off state is left alone.
    del features, labels, train_labels, valid_labels
    gc.collect()

    return X_train, X_valid, y_train, y_valid

In [11]:
import os
import zipfile
from keras.utils import get_file

# Directory the archive and the extracted vectors live in, resolved from the
# current user's home instead of a hard-coded account name.
# NOTE(review): this is the default location keras' get_file downloads to
# (~/.keras/datasets) - confirm if a custom cache_dir / KERAS_HOME is in use.
datasets_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
glove_zip = os.path.join(datasets_dir, 'glove.6B.zip')

# Vectors to use
glove_vectors = os.path.join(datasets_dir, 'glove.6B.100d.txt')

# Download and/or extract only when the vectors file itself is missing.
# The archive is extracted into `datasets_dir`, the directory the file is
# read from below, rather than into the current working directory.
if not os.path.exists(glove_vectors):
    if not os.path.exists(glove_zip):
        glove_zip = get_file('glove.6B.zip',
                             'http://nlp.stanford.edu/data/glove.6B.zip')
    os.makedirs(datasets_dir, exist_ok=True)
    with zipfile.ZipFile(glove_zip) as archive:
        archive.extractall(datasets_dir)

# Load in unzipped file: one row per token, first column the token itself,
# remaining columns the vector components (all read as strings here)
glove = np.loadtxt(glove_vectors, dtype='str', comments=None)
glove.shape
glove[0]

(400000, 101)

array(['the', '-0.038194', '-0.24487', '0.72812', '-0.39961', '0.083172',
       '0.043953', '-0.39141', '0.3344', '-0.57545', '0.087459',
       '0.28787', '-0.06731', '0.30906', '-0.26384', '-0.13231',
       '-0.20757', '0.33395', '-0.33848', '-0.31743', '-0.48336',
       '0.1464', '-0.37304', '0.34577', '0.052041', '0.44946', '-0.46971',
       '0.02628', '-0.54155', '-0.15518', '-0.14107', '-0.039722',
       '0.28277', '0.14393', '0.23464', '-0.31021', '0.086173', '0.20397',
       '0.52624', '0.17164', '-0.082378', '-0.71787', '-0.41531',
       '0.20335', '-0.12763', '0.41367', '0.55187', '0.57908', '-0.33477',
       '-0.36559', '-0.54857', '-0.062892', '0.26584', '0.30205',
       '0.99775', '-0.80481', '-3.0243', '0.01254', '-0.36942', '2.2167',
       '0.72201', '-0.24978', '0.92136', '0.034514', '0.46745', '1.1079',
       '-0.19358', '-0.074575', '0.23353', '-0.052062', '-0.22044',
       '0.057162', '-0.15806', '-0.30798', '-0.41625', '0.37972',
       '0.15006', '-0.53

In [12]:
# Split the string table: columns 1.. are the vector components (converted to
# floats), column 0 is the token.
vectors = glove[:, 1:].astype('float')
words = glove[:, 0]

# The raw string table is no longer needed once it has been split
del glove

# Spot-check one entry
vectors[100], words[100]

(array([-3.9551e-01,  5.4660e-01,  5.0315e-01, -6.3682e-01, -4.5470e-01,
         3.0889e-01, -4.9240e-02,  2.7191e-01,  3.1562e-01, -3.2879e-01,
         2.5089e-01,  1.4508e-01,  3.5136e-01, -2.2793e-01, -1.5894e-01,
        -5.1527e-01, -2.7978e-01,  3.6470e-01, -3.9425e-01,  3.3299e-01,
         4.3051e-01,  1.8300e-01,  2.5095e-01, -1.8547e-01,  3.4698e-01,
         5.5137e-02, -4.5979e-01, -8.2963e-01, -1.8523e-02, -3.6772e-01,
         4.5566e-02,  7.1052e-01, -2.2782e-02, -8.0889e-02,  2.0685e-01,
         4.9855e-01, -5.9794e-02, -8.0048e-03, -2.3823e-01, -3.3759e-01,
        -2.4201e-01, -2.3788e-01, -1.1362e-03, -4.0395e-01, -4.4859e-01,
        -3.2189e-01,  4.8405e-01, -2.7999e-02,  1.0148e-01, -9.3585e-01,
        -8.7522e-02, -3.9959e-01,  3.6545e-01,  1.3726e+00, -3.0713e-01,
        -2.5940e+00,  2.2431e-01, -4.1168e-02,  1.7765e+00,  4.0010e-01,
        -1.0996e-01,  1.4178e+00, -2.6154e-01,  1.8617e-01,  7.9328e-01,
        -1.1709e-01,  8.7541e-01,  4.3911e-01,  3.4

In [13]:
# Token -> pre-trained vector
word_lookup = {word: vector for word, vector in zip(words, vectors)}

# One row per vocabulary index; rows without a pre-trained vector
# (and row 0, which no word maps to) stay all-zero
embedding_matrix = np.zeros((num_words, vectors.shape[1]))

not_found = 0

# Use the tokenizer's own word -> index mapping for the row number instead of
# relying on the dictionary's iteration order matching the index order.
for word, i in word_idx.items():
    # Look up the word embedding
    vector = word_lookup.get(word, None)

    # Record in matrix
    if vector is not None:
        embedding_matrix[i, :] = vector
    else:
        not_found += 1

print(f'There were {not_found} words without pre-trained embeddings.')

There were 6317 words without pre-trained embeddings.


In [14]:
# Rebuild the embedding matrix, this time taking the vector width from an
# entry of the lookup table; rows without a pre-trained vector stay all-zero
embedding_matrix = np.zeros((num_words, len(word_lookup['the'])))

not_found = 0

# Use the tokenizer's own word -> index mapping for the row number instead of
# relying on the dictionary's iteration order matching the index order.
for word, i in word_idx.items():
    # Look up the word embedding
    vector = word_lookup.get(word, None)

    # Record in matrix
    if vector is not None:
        embedding_matrix[i, :] = vector
    else:
        not_found += 1

print(f'There were {not_found} words without pre-trained embeddings.')
embedding_matrix.shape

There were 6317 words without pre-trained embeddings.


(16192, 100)

In [15]:
# Split into training and validation
X_train, X_valid, y_train, y_valid = create_train_valid(
    features, labels, num_words)
X_train.shape, y_train.shape

((222994, 50), (222994, 16192))

In [16]:
import sys
# Size reported by sys.getsizeof for the one-hot training labels, in units of 1e9 bytes
sys.getsizeof(y_train) / 1e9

def check_sizes(gb_min=1):
    """Print every module-level object larger than `gb_min` gigabytes.

    Sizes are taken from `sys.getsizeof` and divided by 1e9.

    Args:
        gb_min: threshold in gigabytes; only objects strictly larger than
            this are printed.
    """
    # Read each value straight from a snapshot of the namespace. Evaluating
    # the name as an expression would resolve this function's own local
    # variables first and so mis-measure any global that shares their name.
    for name, value in list(globals().items()):
        size = sys.getsizeof(value) / 1e9
        if size > gb_min:
            print(f'Object: {name:10}\tSize: {size} GB.')

# List every notebook global that sys.getsizeof reports as larger than 1 GB
check_sizes(gb_min=1)

3.61071896

Object: y_train   	Size: 3.61071896 GB.
Object: y_valid   	Size: 1.54745336 GB.


In [17]:
# model
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout, Embedding, Masking, Bidirectional, SimpleRNN
from keras.optimizers import Adam

from keras.utils import plot_model

def make_word_level_model(num_words,
                          embedding_matrix,
                          rnn_cells=128,
                          trainable=False,
                          rnn_layers=1,
                          bi_direc=False):
    """Build and compile a word-level SimpleRNN next-word classifier.

    Layer stack, in order: an `Embedding` initialised from `embedding_matrix`,
    `rnn_layers - 1` sequence-returning `SimpleRNN` layers, one final
    `SimpleRNN` (wrapped in `Bidirectional` when `bi_direc` is true) and a
    softmax `Dense` layer with `num_words` outputs.

    Args:
        num_words: vocabulary size; used for the embedding input dimension
            and for the width of the output layer.
        embedding_matrix: initial embedding weights; its second dimension
            sets the embedding width.
        rnn_cells: number of units in every recurrent layer.
        trainable: when false the embedding is frozen, created with
            `mask_zero=True` and followed by a `Masking` layer; when true the
            embedding is trainable and neither masking option is added.
        rnn_layers: total number of recurrent layers.
        bi_direc: wrap the final recurrent layer in `Bidirectional`.

    Returns:
        The model, compiled with the 'adam' optimizer, categorical
        cross-entropy loss and an accuracy metric.
    """

    def new_rnn(return_sequences):
        # Every recurrent layer shares the same width and dropout settings
        return SimpleRNN(
            rnn_cells,
            return_sequences=return_sequences,
            dropout=0.1,
            recurrent_dropout=0.1)

    # Arguments common to both embedding variants
    embedding_kwargs = dict(
        input_dim=num_words,
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix])

    model = Sequential()

    # Map words to an embedding
    if trainable:
        model.add(Embedding(trainable=True, **embedding_kwargs))
    else:
        model.add(
            Embedding(trainable=False, mask_zero=True, **embedding_kwargs))
        model.add(Masking())

    # Intermediate recurrent layers hand the full sequence to the next layer
    for _ in range(rnn_layers - 1):
        model.add(new_rnn(return_sequences=True))

    # The final recurrent layer only emits its last output
    final_rnn = new_rnn(return_sequences=False)
    model.add(Bidirectional(final_rnn) if bi_direc else final_rnn)

    # Optional dense head, currently disabled:
    #     model.add(Dense(128, activation='relu'))
    #     model.add(Dropout(0.5))

    # Output layer: one probability per vocabulary index
    model.add(Dense(num_words, activation='softmax'))

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'])
    return model


# Build the network with the pre-trained embedding frozen and a single
# recurrent layer, then print its layer summary
model = make_word_level_model(
    num_words,
    embedding_matrix=embedding_matrix,
    rnn_cells=RNN_CELLS,
    trainable=False,
    rnn_layers=1)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         1619200   
_________________________________________________________________
masking_1 (Masking)          (None, None, 100)         0         
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 128)               29312     
_________________________________________________________________
dense_1 (Dense)              (None, 16192)             2088768   
Total params: 3,737,280
Trainable params: 2,118,080
Non-trainable params: 1,619,200
_________________________________________________________________


In [18]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Same value as in the configuration cell
BATCH_SIZE = 2048
# File stem and directory of the saved model; make_callbacks and
# load_and_evaluate both build the path as f'{model_dir}{model_name}.h5'
model_name = 'train-embeddings-rnn-50'
model_dir = '../my_models/'

def make_callbacks(model_name, save=SAVE_MODEL):
    """Build the list of training callbacks.

    Args:
        model_name: file stem of the checkpoint; the full path is
            `f'{model_dir}{model_name}.h5'`, with `model_dir` read from the
            module namespace.
        save: when true, a best-only `ModelCheckpoint` that stores the full
            model is added after the early-stopping callback.

    Returns:
        A list starting with `EarlyStopping(monitor='val_loss', patience=5)`.
    """
    early_stopping = EarlyStopping(monitor='val_loss', patience=5)

    if not save:
        return [early_stopping]

    checkpoint = ModelCheckpoint(
        f'{model_dir}{model_name}.h5',
        save_best_only=True,
        save_weights_only=False)
    return [early_stopping, checkpoint]


# Callbacks for this model name (save left at its default)
callbacks = make_callbacks(model_name)

In [19]:
def load_and_evaluate(model_name, return_model=False):
    """Load a saved model and report its validation loss and accuracy.

    The model is read from `f'{model_dir}{model_name}.h5'` and evaluated on
    the module-level `X_valid` / `y_valid` with a batch size of 2048. The
    first two values returned by `evaluate` are printed as cross entropy and
    accuracy (the latter as a percentage).

    Args:
        model_name: file stem of the saved model.
        return_model: when true the loaded model is returned.

    Returns:
        The loaded model if `return_model` is true, otherwise `None`.
    """
    model = load_model(f'{model_dir}{model_name}.h5')
    scores = model.evaluate(X_valid, y_valid, batch_size=2048, verbose=1)

    # evaluate() returns the loss first, then the compiled metrics
    valid_crossentropy, valid_accuracy = scores[0], scores[1]

    print(f'Cross Entropy: {round(valid_crossentropy, 4)}')
    print(f'Accuracy: {round(100 * valid_accuracy, 2)}%')

    return model if return_model else None

In [20]:
# Replace the freshly built `model` with the saved one and print its validation scores
model = load_and_evaluate(model_name, return_model=True)

Cross Entropy: 4.9522
Accuracy: 23.44%


In [21]:
from IPython.display import HTML


def header(text, color='black'):
    """Return `text` wrapped in a centred, coloured `<h1>` element as an HTML string."""
    template = '<h1 style="color: {};"><center>{}</center></h1>'
    return template.format(color, str(text))


def box(text):
    """Return `text` inside a bordered, padded `<div>` as an HTML string."""
    opening = '<div style="border:1px inset black;padding:1em;font-size: 20px;">'
    return ''.join((opening, str(text), '</div>'))


def addContent(old_html, raw_html):
    """Return the HTML string `old_html` with `raw_html` appended to it."""
    return old_html + raw_html

In [22]:
import random


def generate_output(model,
                    sequences,
                    training_length=50,
                    new_words=50,
                    diversity=1,
                    return_output=False,
                    n_gen=1):
    """Sample a continuation of a random seed window and pair it with the real text.

    One sequence is drawn at random from `sequences` and a window of
    `training_length` tokens is cut from it as the seed. For each of the
    `n_gen` generations the model is asked for `new_words` further tokens;
    every token is sampled from the prediction after its log-probabilities
    are divided by `diversity` and re-normalised.

    Index-to-word conversion uses the module-level `idx_word`; entries without
    a word (including the '#' marker placed after the seed) become '< --- >'.

    Args:
        model: object whose `predict` returns one probability row per input row.
        sequences: list of token-index lists to draw the seed from.
        training_length: number of seed tokens.
        new_words: number of tokens to generate.
        diversity: divisor applied to the log-probabilities before sampling.
        return_output: return word lists instead of HTML.
        n_gen: number of independent generations from the same seed.

    Returns:
        If `return_output` is true: `(original_sequence, gen_list, a)` - the
        seed words, one word list per generation, and the actual continuation;
        both kinds of continuation start with the marker placeholder.
        Otherwise three HTML strings (seed, first generation, actual text).
    """

    def to_words(indices):
        # Anything without a vocabulary entry is shown as a placeholder
        return [idx_word.get(index, '< --- >') for index in indices]

    # Pick the source sequence, then where the seed window starts and ends
    seq = random.choice(sequences)
    seed_idx = random.randint(0, len(seq) - training_length - 10)
    end_idx = seed_idx + training_length

    gen_list = []

    for _ in range(n_gen):
        # Every generation restarts from the same seed window
        window = seq[seed_idx:end_idx]
        original_sequence = [idx_word[token] for token in window]
        generated = window[:] + ['#']

        # Seed, marker, then the tokens that really followed the seed
        actual = generated[:] + seq[end_idx:end_idx + new_words]

        for _step in range(new_words):
            # Probability of each vocabulary index given the current window
            preds = model.predict(np.array(window).reshape(1, -1))[0].astype(
                np.float64)

            # Rescale in log space, then normalise back to a distribution
            scaled = np.exp(np.log(preds) / diversity)
            preds = scaled / sum(scaled)

            # Draw one index from the distribution
            draw = np.random.multinomial(1, preds, 1)[0]
            next_idx = np.argmax(draw)

            # Slide the window forward by the sampled token
            window = window[1:] + [next_idx]
            generated.append(next_idx)

        gen_list.append(to_words(generated))

    # Keep only the part after the seed, and trim each generation to match
    a = to_words(actual)[training_length:]
    gen_list = [
        gen[training_length:training_length + len(a)] for gen in gen_list
    ]

    if return_output:
        return original_sequence, gen_list, a

    # HTML formatting: a coloured header followed by the boxed text
    seed_html = addContent(
        addContent('', header('Seed Sequence', color='darkblue')),
        box(remove_spaces(' '.join(original_sequence))))

    gen_html = addContent(
        addContent('', header('RNN Generated', color='darkred')),
        box(remove_spaces(' '.join(gen_list[0]))))

    a_html = addContent(
        addContent('', header('Actual', color='darkgreen')),
        box(remove_spaces(' '.join(a))))

    return seed_html, gen_html, a_html

In [24]:
def remove_spaces(patent):
    """Re-attach . , ; ? to the preceding text by deleting the whitespace in front of them."""
    return re.sub(r'\s+([.,;?])', r'\1', patent)

In [25]:
# Generate one sample from a random seed and render the seed, the model's
# continuation and the actual continuation
seed_html, gen_html, a_html = generate_output(model, sequences,
                                              TRAINING_LENGTH)
HTML(seed_html)
HTML(gen_html)
HTML(a_html)

In [162]:
import random


def generate_next(model,
                  sequence,
                  ans,
                  training_length=50,
                  new_words=1,
                  diversity=1,
                  return_output=False,
                  return_idx=False,
                  n_gen=1):
    """Greedily predict the next word(s) for one tokenized sequence.

    The first `training_length` tokens of `sequence` are the seed. At each of
    the `new_words` steps the index with the highest predicted probability is
    appended and the seed window slides forward by one token. No sampling is
    done, so repeated generations (`n_gen` > 1) are identical.

    Index-to-word conversion uses the module-level `idx_word`; indices
    without a word, and the '#' marker placed after the seed, are shown as
    '< --- >'.

    Args:
        model: object whose `predict` returns one probability row per input row.
        sequence: tokenized sentence, e.g. [102, 3, 2314, 3, ...], with at
            least `training_length` entries.
        ans: index of the word that really follows the seed.
        training_length: number of seed tokens.
        new_words: number of tokens to predict.
        diversity: accepted for signature compatibility with
            `generate_output`; unused because prediction is greedy.
        return_output: return values instead of HTML.
        return_idx: with `return_output`, return indices instead of words.
        n_gen: number of generations; must be at least 1.

    Returns:
        * `return_output` and `return_idx`: `(seed indices, first predicted
          index, ans)` as plain Python ints.
        * `return_output` only: `(seed words, list of generated word lists,
          actual words)`; both kinds of continuation start with the marker
          placeholder.
        * otherwise: three HTML strings (seed, first generation, actual).

    Raises:
        ValueError: if `n_gen` is smaller than 1.
    """
    if n_gen < 1:
        raise ValueError('n_gen must be at least 1')

    # The seed is always the leading `training_length` tokens of the input
    seq = list(sequence)
    seed_idx = 0
    end_idx = training_length

    gen_list = []
    predicted = []  # predicted indices, one list per generation

    for _ in range(n_gen):
        # Extract the seed sequence
        seed = seq[seed_idx:end_idx]
        original_sequence = [idx_word[i] for i in seed]
        generated = seed[:] + ['#']

        # Seed, marker, then the single known answer
        actual = generated[:] + [ans]

        steps = []
        for _step in range(new_words):
            # Probability of each vocabulary index given the current window
            preds = model.predict(np.array(seed).reshape(1, -1))[0].astype(
                np.float64)

            # Greedy choice: the most probable index
            next_idx = np.argmax(preds)

            # Slide the window forward by the predicted token
            seed = seed[1:] + [next_idx]
            generated.append(next_idx)
            steps.append(int(next_idx))

        predicted.append(steps)
        gen_list.append([idx_word.get(i, '< --- >') for i in generated])

    # Keep only the part after the seed, and trim each generation to match
    a = [idx_word.get(i, '< --- >') for i in actual][training_length:]
    gen_list = [
        gen[training_length:training_length + len(a)] for gen in gen_list
    ]

    if return_output:
        if not return_idx:
            return original_sequence, gen_list, a
        # Hand the indices back directly. Mapping index -> word -> index
        # would fail for any index that has no vocabulary word.
        return ([int(i) for i in seq[seed_idx:end_idx]], predicted[0][0],
                int(ans))

    # HTML formatting
    seed_html = ''
    seed_html = addContent(seed_html, header(
        'Seed Sequence', color='darkblue'))
    seed_html = addContent(seed_html,
                           box(remove_spaces(' '.join(original_sequence))))

    gen_html = ''
    gen_html = addContent(gen_html, header('RNN Generated', color='darkred'))
    gen_html = addContent(gen_html, box(remove_spaces(' '.join(gen_list[0]))))

    a_html = ''
    a_html = addContent(a_html, header('Actual', color='darkgreen'))
    a_html = addContent(a_html, box(remove_spaces(' '.join(a))))

    return seed_html, gen_html, a_html

In [163]:
# Shapes of the validation features and of their one-hot labels
X_valid.shape
y_valid.shape

(95569, 50)

(95569, 16192)

In [164]:
# First validation example: a row of token indices
X_valid[0]

array([ 633, 5033,   21,    1, 4117,   30,  633, 3770,   12,    1,   22,
        122,    4, 5478,    1,   44,    3,   25, 5696,   30,    5,    1,
       4117,   30,  633, 4039,    7,    1,   22,  122,    3,    1,  810,
        216, 8321,   10, 3188,   17,   14, 2829,  741,    5,  366,    1,
       4117,   30,  633,  182, 4039,    7])

In [174]:


from IPython.display import display

# Compare the model's next-word prediction with the true label for the first
# ten validation examples
for i in range(0, 10):
    # Recover the label index from its one-hot row
    true_idx = np.where(y_valid[i] == 1)[0][0]

    seed_html, gen_html, a_html = generate_next(
        model, X_valid[i], true_idx,
        training_length=TRAINING_LENGTH, new_words=1)

    # Only top-level expression statements of a cell are rendered
    # automatically; inside a loop the HTML has to be displayed explicitly
    display(HTML(seed_html))
    display(HTML(gen_html))
    display(HTML(a_html))

    # Same example again, this time as raw indices
    seed, rnn_generate, actual = generate_next(
        model, X_valid[i], true_idx,
        training_length=TRAINING_LENGTH, new_words=1,
        return_output=True, return_idx=True)
    print("seed:\n", seed)
    print("predicted: ", rnn_generate)
    print("actual: ", actual)

seed:
 [633, 5033, 21, 1, 4117, 30, 633, 3770, 12, 1, 22, 122, 4, 5478, 1, 44, 3, 25, 5696, 30, 5, 1, 4117, 30, 633, 4039, 7, 1, 22, 122, 3, 1, 810, 216, 8321, 10, 3188, 17, 14, 2829, 741, 5, 366, 1, 4117, 30, 633, 182, 4039, 7]
predicted:  1
actual:  1


seed:
 [21, 609, 514, 1, 124, 16, 1276, 7, 360, 399, 250, 154, 320, 2, 752, 15, 42, 24, 1, 92, 10, 663, 4, 13, 20, 10, 1015, 6, 1273, 5, 425, 342, 46, 1220, 7, 1, 3468, 4911, 3, 9, 116, 151, 28, 1, 34, 450, 3165, 1, 622, 3]
predicted:  1
actual:  15452


seed:
 [8, 5, 95, 33, 23, 74, 3, 6097, 4526, 1, 87, 3, 70, 12, 2, 185, 39, 6097, 5060, 1, 87, 3, 185, 190, 6097, 6765, 1, 222, 3, 661, 213, 12, 1, 190, 6, 6097, 9712, 1, 121, 1012, 1, 70, 1715, 5754, 1045, 1, 82, 9, 8, 10]
predicted:  73
actual:  73


seed:
 [12, 33, 3, 678, 1487, 24, 1, 351, 89, 4, 2198, 2178, 245, 89, 5, 1, 52, 1544, 4647, 1, 92, 7, 2129, 12, 33, 3, 678, 1487, 4, 467, 1286, 3929, 635, 7260, 2367, 1, 92, 84, 1791, 2, 739, 17, 2870, 638, 367, 35, 2, 89, 181, 84]
predicted:  257
actual:  257


seed:
 [6, 245, 42, 3, 277, 773, 27, 182, 497, 5, 1, 134, 441, 3, 50, 332, 4, 149, 962, 3, 2631, 10, 2428, 17, 432, 613, 129, 7, 1, 277, 23, 43, 15, 4, 7078, 1117, 276, 16, 88, 153, 5, 1544, 36, 2, 2907, 43, 2529, 5, 11, 1249]
predicted:  1805
actual:  1805


seed:
 [1, 1272, 30, 4, 13, 374, 41, 2, 2802, 708, 25, 1396, 1, 132, 51, 69, 2, 2802, 34, 281, 12064, 4, 64, 2, 609, 166, 5, 1, 374, 40, 142, 2, 713, 764, 2733, 12065, 11, 494, 2802, 2890, 50, 24, 1, 12066, 12, 1, 165, 3, 80, 42]
predicted:  4
actual:  534


seed:
 [1820, 121, 7, 51, 29, 1, 322, 15, 22, 443, 12, 1, 79, 244, 6, 538, 1, 305, 51, 5, 6, 14, 547, 65, 11, 538, 2, 779, 484, 3, 1, 103, 3, 455, 307, 116, 4, 13, 322, 15, 22, 443, 16, 546, 7, 165, 681, 6, 1, 547]
predicted:  3
actual:  65


seed:
 [4479, 7, 2, 256, 1029, 1, 2165, 256, 21, 2, 376, 1152, 890, 4, 13, 252, 35, 25, 1, 5872, 256, 12664, 7, 1, 256, 3, 1, 2165, 10, 231, 4, 149, 605, 2627, 3, 1, 5872, 10, 423, 754, 17, 2, 1152, 3660, 29, 1, 5872, 1222, 4, 1509]
predicted:  13
actual:  3


seed:
 [739, 129, 303, 27, 335, 27, 17, 892, 5615, 4, 76, 82, 9, 8, 802, 10, 73, 17, 2083, 1, 382, 48, 6, 1, 3568, 8876, 48, 4, 64, 2, 839, 676, 5, 3845, 231, 48, 16, 513, 4, 18, 615, 3, 2676, 105, 48, 10, 54, 11, 43, 1]
predicted:  9
actual:  439


seed:
 [156, 1474, 11, 2, 59, 315, 10, 136, 8723, 12, 1, 9, 8, 12, 157, 2122, 4, 1443, 5, 181, 8724, 1222, 27, 33, 23, 74, 436, 3, 2111, 5, 32, 3, 25, 555, 2, 8725, 1160, 6933, 3, 1, 59, 315, 4, 4295, 5, 1, 568, 35, 32, 225]
predicted:  3
actual:  10


In [202]:
# save results
import os

org_seq = []
rnn_result = []
actual_result= []

# Collect seed, predicted index and true index for the first 1000 validation examples
for i in range(0, 1000):

    seed, rnn_generate, actual = generate_next(model, X_valid[i], np.where(y_valid[i]==1)[0][0],
                                                  training_length=TRAINING_LENGTH, new_words=1, return_output=True, return_idx=True)
    print("sentence {}:".format(i) )
#     print("seed:\n", seed)
    print("predicted: ", rnn_generate)
    print("actual: ", actual, '\n')
    org_seq.append(seed)
    rnn_result.append(rnn_generate)
    actual_result.append(actual)


# Store everything as integer arrays
org_seq = np.array(org_seq, dtype=int)
rnn_result = np.array(rnn_result, dtype=int)
actual_result = np.array(actual_result, dtype=int)

type(org_seq[0][0])

# np.savetxt("./C_test_set/org_seq.txt", org_seq)
# np.savetxt("./C_test_set/rnn_result.txt", rnn_result)
# np.savetxt("./C_test_set/actual_result.txt", actual_result)

# np.save does not create missing directories, so make sure the target exists
os.makedirs("./C_test_set", exist_ok=True)

np.save("./C_test_set/org_seq", org_seq)
np.save("./C_test_set/rnn_result", rnn_result)
np.save("./C_test_set/actual_result", actual_result)

sentence 0:
predicted:  1
actual:  1 

sentence 1:
predicted:  1
actual:  15452 

sentence 2:
predicted:  73
actual:  73 

sentence 3:
predicted:  257
actual:  257 

sentence 4:
predicted:  1805
actual:  1805 

sentence 5:
predicted:  4
actual:  534 

sentence 6:
predicted:  3
actual:  65 

sentence 7:
predicted:  13
actual:  3 

sentence 8:
predicted:  9
actual:  439 

sentence 9:
predicted:  3
actual:  10 

sentence 10:
predicted:  1
actual:  479 

sentence 11:
predicted:  6
actual:  12 

sentence 12:
predicted:  3
actual:  3 

sentence 13:
predicted:  198
actual:  538 

sentence 14:
predicted:  72
actual:  145 

sentence 15:
predicted:  7
actual:  8 

sentence 16:
predicted:  6113
actual:  29 

sentence 17:
predicted:  1684
actual:  333 

sentence 18:
predicted:  13
actual:  611 

sentence 19:
predicted:  1
actual:  99 

sentence 20:
predicted:  1
actual:  1 

sentence 21:
predicted:  16
actual:  530 

sentence 22:
predicted:  32
actual:  143 

sentence 23:
predicted:  22
actual:  4

sentence 200:
predicted:  3
actual:  3 

sentence 201:
predicted:  17
actual:  7 

sentence 202:
predicted:  4665
actual:  201 

sentence 203:
predicted:  74
actual:  74 

sentence 204:
predicted:  9
actual:  671 

sentence 205:
predicted:  13
actual:  904 

sentence 206:
predicted:  7253
actual:  1082 

sentence 207:
predicted:  10
actual:  148 

sentence 208:
predicted:  258
actual:  682 

sentence 209:
predicted:  3013
actual:  557 

sentence 210:
predicted:  7
actual:  42 

sentence 211:
predicted:  12
actual:  6 

sentence 212:
predicted:  4
actual:  54 

sentence 213:
predicted:  1
actual:  14 

sentence 214:
predicted:  1
actual:  566 

sentence 215:
predicted:  4
actual:  5 

sentence 216:
predicted:  1
actual:  1 

sentence 217:
predicted:  5
actual:  2731 

sentence 218:
predicted:  6990
actual:  6990 

sentence 219:
predicted:  1
actual:  4106 

sentence 220:
predicted:  1
actual:  1 

sentence 221:
predicted:  7
actual:  171 

sentence 222:
predicted:  3
actual:  3 

senten

sentence 392:
predicted:  4
actual:  5 

sentence 393:
predicted:  13
actual:  40 

sentence 394:
predicted:  1
actual:  2 

sentence 395:
predicted:  10
actual:  10 

sentence 396:
predicted:  7
actual:  4 

sentence 397:
predicted:  4
actual:  4 

sentence 398:
predicted:  2
actual:  2600 

sentence 399:
predicted:  22
actual:  22 

sentence 400:
predicted:  2091
actual:  2091 

sentence 401:
predicted:  1
actual:  1 

sentence 402:
predicted:  15
actual:  5 

sentence 403:
predicted:  51
actual:  44 

sentence 404:
predicted:  9
actual:  20 

sentence 405:
predicted:  1
actual:  3 

sentence 406:
predicted:  4
actual:  4 

sentence 407:
predicted:  32
actual:  32 

sentence 408:
predicted:  9
actual:  1934 

sentence 409:
predicted:  1
actual:  112 

sentence 410:
predicted:  79
actual:  1294 

sentence 411:
predicted:  10
actual:  66 

sentence 412:
predicted:  16
actual:  41 

sentence 413:
predicted:  1
actual:  1 

sentence 414:
predicted:  8
actual:  8 

sentence 415:
predicted

sentence 587:
predicted:  2
actual:  2 

sentence 588:
predicted:  116
actual:  606 

sentence 589:
predicted:  9
actual:  9 

sentence 590:
predicted:  6
actual:  36 

sentence 591:
predicted:  109
actual:  1785 

sentence 592:
predicted:  5
actual:  5 

sentence 593:
predicted:  26
actual:  26 

sentence 594:
predicted:  50
actual:  105 

sentence 595:
predicted:  6
actual:  6 

sentence 596:
predicted:  54
actual:  3963 

sentence 597:
predicted:  8
actual:  8 

sentence 598:
predicted:  3
actual:  24 

sentence 599:
predicted:  9
actual:  103 

sentence 600:
predicted:  7
actual:  21 

sentence 601:
predicted:  10
actual:  16 

sentence 602:
predicted:  39
actual:  682 

sentence 603:
predicted:  22
actual:  1664 

sentence 604:
predicted:  3
actual:  40 

sentence 605:
predicted:  375
actual:  4 

sentence 606:
predicted:  13
actual:  8 

sentence 607:
predicted:  210
actual:  87 

sentence 608:
predicted:  7
actual:  68 

sentence 609:
predicted:  9936
actual:  9936 

sentence 61

sentence 780:
predicted:  3
actual:  286 

sentence 781:
predicted:  9
actual:  862 

sentence 782:
predicted:  7
actual:  7 

sentence 783:
predicted:  4
actual:  104 

sentence 784:
predicted:  226
actual:  447 

sentence 785:
predicted:  9
actual:  9 

sentence 786:
predicted:  9
actual:  661 

sentence 787:
predicted:  13
actual:  123 

sentence 788:
predicted:  1
actual:  14 

sentence 789:
predicted:  1
actual:  14 

sentence 790:
predicted:  19
actual:  243 

sentence 791:
predicted:  6
actual:  6 

sentence 792:
predicted:  4
actual:  35 

sentence 793:
predicted:  1
actual:  2 

sentence 794:
predicted:  4
actual:  104 

sentence 795:
predicted:  19
actual:  34 

sentence 796:
predicted:  54
actual:  85 

sentence 797:
predicted:  13
actual:  22 

sentence 798:
predicted:  4
actual:  272 

sentence 799:
predicted:  9
actual:  49 

sentence 800:
predicted:  3
actual:  3 

sentence 801:
predicted:  1
actual:  448 

sentence 802:
predicted:  13
actual:  50 

sentence 803:
predict

sentence 976:
predicted:  7
actual:  1 

sentence 977:
predicted:  3
actual:  97 

sentence 978:
predicted:  73
actual:  5260 

sentence 979:
predicted:  4
actual:  1697 

sentence 980:
predicted:  106
actual:  2792 

sentence 981:
predicted:  1
actual:  1 

sentence 982:
predicted:  1052
actual:  2304 

sentence 983:
predicted:  2
actual:  2 

sentence 984:
predicted:  5
actual:  4 

sentence 985:
predicted:  3
actual:  3 

sentence 986:
predicted:  6
actual:  12 

sentence 987:
predicted:  4
actual:  27 

sentence 988:
predicted:  3
actual:  86 

sentence 989:
predicted:  4
actual:  3 

sentence 990:
predicted:  54
actual:  4700 

sentence 991:
predicted:  3
actual:  233 

sentence 992:
predicted:  9
actual:  1168 

sentence 993:
predicted:  1
actual:  1 

sentence 994:
predicted:  6
actual:  23 

sentence 995:
predicted:  13
actual:  1 

sentence 996:
predicted:  3
actual:  6 

sentence 997:
predicted:  7
actual:  7 

sentence 998:
predicted:  8
actual:  83 

sentence 999:
predicted

numpy.int64

In [203]:
# Read the three arrays back from the .npy files written by the previous cell
loaded_seq = np.load("./C_test_set/org_seq.npy")
loaded_rnn_gen = np.load("./C_test_set/rnn_result.npy")
loaded_actual = np.load("./C_test_set/actual_result.npy")

# Check that shapes survived the round trip
loaded_seq.shape
loaded_rnn_gen.shape
loaded_actual.shape

# Check the element types
type(loaded_seq[0][0])
type(loaded_rnn_gen[0])
type(loaded_actual[0])

(1000, 50)

(1000,)

(1000,)

numpy.int64

numpy.int64

numpy.int64