In [1]:
# %load_ext autoreload
# %autoreload 2

# from IPython.core.interactiveshell import InteractiveShell
# from IPython.display import HTML

# InteractiveShell.ast_node_interactivity = 'all'

# from keras import Model
# from keras.models import load_model
# import warnings
# warnings.filterwarnings('ignore', category = RuntimeWarning)
# warnings.filterwarnings('ignore', category = UserWarning)

# BATCH_SIZE = 2048
# RANDOM_STATE = 50

# import numpy as np
# import pandas as pd
# from utils import get_model, find_closest, get_sequences, create_train_valid,  generate_output, guess_human

In [4]:
# Set up IPython to show all outputs from a cell
import warnings
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = 'all'

# Silence RuntimeWarning messages for the rest of the session
warnings.filterwarnings('ignore', category=RuntimeWarning)

# Notebook-wide configuration constants
RANDOM_STATE = 50  # seed handed to sklearn's shuffle in create_train_valid
EPOCHS = 150  # NOTE(review): not referenced in the visible cells - presumably the training epoch cap
BATCH_SIZE = 2048  # re-assigned with the same value in the callbacks cell
TRAINING_LENGTH = 50  # number of tokens per training example (re-assigned before make_sequences is called)
TRAIN_FRACTION = 0.7  # default share of examples used for training in create_train_valid
VERBOSE = 0  # NOTE(review): not referenced in the visible cells - presumably Keras verbosity
SAVE_MODEL = True  # default for make_callbacks(save=...)
RNN_CELLS = 128  # passed as rnn_cells to make_word_level_model

import pandas as pd
import numpy as np

# Read in data
data = pd.read_csv(
    '../data/neural_network_patent_query.csv', parse_dates=['patent_date'])

# Extract abstracts
original_abstracts = list(data['patent_abstract'])

In [5]:
def make_sequences(texts,
                   training_length=50,
                   lower=True,
                   filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'):
    """Tokenize `texts` and cut them into fixed-length next-word examples.

    A Keras `Tokenizer` is fitted on `texts`; every text is converted to a
    list of integer word indices. Only texts whose token count is strictly
    greater than `training_length + 20` are kept. For each kept sequence a
    sliding window yields `training_length` consecutive tokens as features
    and the token that follows them as the label.

    Parameters
    ----------
    texts : list of str
        Documents to tokenize (indexed positionally).
    training_length : int
        Number of tokens in each feature window.
    lower : bool
        Forwarded to `Tokenizer(lower=...)`.
    filters : str
        Forwarded to `Tokenizer(filters=...)`.

    Returns
    -------
    tuple
        `(word_idx, idx_word, num_words, word_counts, new_texts,
        new_sequences, training_seq, labels)` where `num_words` is
        `len(word_idx) + 1`, `new_texts` / `new_sequences` are the kept
        documents and their token lists, and `training_seq` / `labels` are
        the parallel lists of feature windows and next-token labels.

    Prints the vocabulary size and the number of training windows.
    """
    # Fit the tokenizer on the corpus
    tok = Tokenizer(lower=lower, filters=filters)
    tok.fit_on_texts(texts)

    # Look-up tables exposed by the fitted tokenizer
    word_idx = tok.word_index
    idx_word = tok.index_word
    word_counts = tok.word_counts
    num_words = len(word_idx) + 1

    print(f'There are {num_words} unique words.')

    # Integer-encode every text
    encoded = tok.texts_to_sequences(texts)

    # Keep only documents comfortably longer than one training window
    min_tokens = training_length + 20
    kept_positions = [
        pos for pos, tokens in enumerate(encoded) if len(tokens) > min_tokens
    ]
    new_texts = [texts[pos] for pos in kept_positions]
    new_sequences = [encoded[pos] for pos in kept_positions]

    training_seq = []
    labels = []

    # Slide a window of training_length + 1 tokens over each kept sequence
    for tokens in new_sequences:
        for stop in range(training_length, len(tokens)):
            window = tokens[stop - training_length:stop + 1]

            # Everything but the last token is the input; the last is the target
            training_seq.append(window[:-1])
            labels.append(window[-1])

    print(f'There are {len(training_seq)} training sequences.')

    return (word_idx, idx_word, num_words, word_counts, new_texts,
            new_sequences, training_seq, labels)

In [6]:
from keras.preprocessing.text import Tokenizer

# Demo: tokenize one example with a filter list that includes punctuation,
# then map the indices back to words to inspect what the tokenizer kept.
example = 'This is a short sentence (1) with one reference to an image. This next sentence, while non-sensical, does not have an image and has two commas.'
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts([example])
# Integer sequence for the single example text
s = tokenizer.texts_to_sequences([example])[0]
# Reconstruct the text from indices (displayed as the cell output)
' '.join(tokenizer.index_word[i] for i in s)

Using TensorFlow backend.


'this is a short sentence 1 with one reference to an image this next sentence while non sensical does not have an image and has two commas'

In [7]:
# Same demo with a shorter filter string that no longer lists . , ( ) - ! ;
# the reconstructed text and the vocabulary keys are displayed for comparison.
tokenizer = Tokenizer(filters='"#$%&*+/:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts([example])
s = tokenizer.texts_to_sequences([example])[0]
' '.join(tokenizer.index_word[i] for i in s)
# Vocabulary learned from the example
tokenizer.word_index.keys()

'this is a short sentence (1) with one reference to an image. this next sentence, while non-sensical, does not have an image and has two commas.'

dict_keys(['this', 'an', 'is', 'a', 'short', 'sentence', '(1)', 'with', 'one', 'reference', 'to', 'image.', 'next', 'sentence,', 'while', 'non-sensical,', 'does', 'not', 'have', 'image', 'and', 'has', 'two', 'commas.'])

In [8]:
import re


def format_patent(patent):
    """Separate punctuation from words and strip numeric figure references.

    Steps, in order:
      1. Insert a space *before* each of ``. , ; ?`` when the preceding
         character is neither whitespace nor a digit (so decimals such as
         ``3.5`` are left intact).
      2. Delete parenthesised integers such as ``(1)`` or ``(12)``.
      3. Collapse every run of two or more whitespace characters into a
         single space.

    Parameters
    ----------
    patent : str
        Raw abstract text.

    Returns
    -------
    str
        The cleaned text.
    """

    # Add a space in front of punctuation (zero-width match, nothing is removed)
    patent = re.sub(r'(?<=[^\s0-9])(?=[.,;?])', r' ', patent)

    # Remove references to figures, e.g. "(1)"
    patent = re.sub(r'\((\d+)\)', r'', patent)

    # Collapse whitespace runs of any length; the previous pattern r'\s\s'
    # only merged pairs, so three spaces still left a double space behind.
    patent = re.sub(r'\s{2,}', ' ', patent)
    return patent


f = format_patent(example)
f

'This is a short sentence with one reference to an image . This next sentence , while non-sensical , does not have an image and has two commas .'

In [9]:
# Clean every original abstract with format_patent
formatted = [format_patent(abstract) for abstract in original_abstracts]

len(formatted)

3522

In [10]:
# Window length for the training examples (same value as in the config cell)
TRAINING_LENGTH = 50

# Reduced filter string: characters not listed here (e.g. . , ( ) -) are kept
# by the tokenizer; lower=False keeps the original capitalisation.
filters = '!"%;[\\]^_`{|}~\t\n'
word_idx, idx_word, num_words, word_counts, abstracts, sequences, features, labels = make_sequences(
    formatted, TRAINING_LENGTH, lower=False, filters=filters)

There are 16192 unique words.
There are 318563 training sequences.


In [11]:
sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:15]

[('the', 30760),
 ('a', 21442),
 ('of', 20157),
 ('.', 16554),
 (',', 15415),
 ('and', 12563),
 ('to', 12012),
 ('network', 7618),
 ('neural', 7235),
 ('is', 7211),
 ('for', 6779),
 ('in', 6131),
 ('The', 5813),
 ('an', 5286),
 ('data', 3971)]

In [12]:
from sklearn.utils import shuffle


def create_train_valid(features,
                       labels,
                       num_words,
                       train_fraction=TRAIN_FRACTION):
    """Shuffle the examples and split them into training and validation arrays.

    Features and labels are shuffled together with sklearn's `shuffle`
    (seeded by the module-level `RANDOM_STATE`), the first
    `int(train_fraction * len(labels))` examples become the training set and
    the remainder the validation set. Labels are one-hot encoded.

    Parameters
    ----------
    features : sequence of sequences of int
        One token window per example.
    labels : sequence of int
        Target word index per example; each must be a valid column index
        for a row of width `num_words`.
    num_words : int
        Width of the one-hot label rows.
    train_fraction : float
        Share of the shuffled examples assigned to the training set.

    Returns
    -------
    tuple of np.ndarray
        `(X_train, X_valid, y_train, y_valid)`; the `y_*` arrays have dtype
        int8 and shape `(n_examples, num_words)`.
    """

    def one_hot(word_indices):
        """Return an int8 one-hot matrix with one row per entry of `word_indices`."""
        # int8 keeps the (examples x vocabulary) matrix as small as possible
        encoded = np.zeros((len(word_indices), num_words), dtype=np.int8)
        # Vectorised assignment replaces a Python loop over every label;
        # the explicit intp dtype also keeps an empty split indexable.
        rows = np.arange(len(word_indices))
        cols = np.asarray(word_indices, dtype=np.intp)
        encoded[rows, cols] = 1
        return encoded

    # Randomly shuffle features and labels in unison
    features, labels = shuffle(features, labels, random_state=RANDOM_STATE)

    # Decide on number of samples for training
    train_end = int(train_fraction * len(labels))

    # Convert each split to an array exactly once (previously copied twice)
    X_train = np.array(features[:train_end])
    X_valid = np.array(features[train_end:])

    y_train = one_hot(labels[:train_end])
    y_valid = one_hot(labels[train_end:])

    # No manual del/gc needed: the shuffled copies are locals and are
    # released as soon as the function returns.
    return X_train, X_valid, y_train, y_valid

In [13]:
import os
import zipfile
from keras.utils import get_file

# Locate the embeddings relative to the current user's home directory instead
# of a hard-coded /home/<user> path, so the notebook runs on other machines.
glove_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
glove_zip = os.path.join(glove_dir, 'glove.6B.zip')
glove_member = 'glove.6B.100d.txt'
glove_vectors = os.path.join(glove_dir, glove_member)

# Download and/or unpack only when the extracted vectors are missing
if not os.path.exists(glove_vectors):
    if not os.path.exists(glove_zip):
        # get_file returns the path of the downloaded archive
        glove_zip = get_file('glove.6B.zip',
                             'http://nlp.stanford.edu/data/glove.6B.zip')

    # Extract next to the archive. The previous `os.system('unzip ...')`
    # unpacked into the current working directory, which is not where the
    # file is loaded from, and interpolated the path into a shell command.
    # Only the 100-dimensional file used below is extracted.
    extract_dir = os.path.dirname(glove_zip)
    with zipfile.ZipFile(glove_zip) as archive:
        archive.extract(glove_member, path=extract_dir)
    glove_vectors = os.path.join(extract_dir, glove_member)

# Load in unzipped file: one row per word, first column is the word itself
glove = np.loadtxt(glove_vectors, dtype='str', comments=None)
glove.shape
glove[0]

(400000, 101)

array(['the', '-0.038194', '-0.24487', '0.72812', '-0.39961', '0.083172',
       '0.043953', '-0.39141', '0.3344', '-0.57545', '0.087459',
       '0.28787', '-0.06731', '0.30906', '-0.26384', '-0.13231',
       '-0.20757', '0.33395', '-0.33848', '-0.31743', '-0.48336',
       '0.1464', '-0.37304', '0.34577', '0.052041', '0.44946', '-0.46971',
       '0.02628', '-0.54155', '-0.15518', '-0.14107', '-0.039722',
       '0.28277', '0.14393', '0.23464', '-0.31021', '0.086173', '0.20397',
       '0.52624', '0.17164', '-0.082378', '-0.71787', '-0.41531',
       '0.20335', '-0.12763', '0.41367', '0.55187', '0.57908', '-0.33477',
       '-0.36559', '-0.54857', '-0.062892', '0.26584', '0.30205',
       '0.99775', '-0.80481', '-3.0243', '0.01254', '-0.36942', '2.2167',
       '0.72201', '-0.24978', '0.92136', '0.034514', '0.46745', '1.1079',
       '-0.19358', '-0.074575', '0.23353', '-0.052062', '-0.22044',
       '0.057162', '-0.15806', '-0.30798', '-0.41625', '0.37972',
       '0.15006', '-0.53

In [14]:
# Split the string table: columns 1.. hold the vector components (converted
# to float), column 0 holds the word.
vectors = glove[:, 1:].astype('float')
words = glove[:, 0]

# The string table is no longer needed once split
del glove

# Spot-check one entry
vectors[100], words[100]

(array([-3.9551e-01,  5.4660e-01,  5.0315e-01, -6.3682e-01, -4.5470e-01,
         3.0889e-01, -4.9240e-02,  2.7191e-01,  3.1562e-01, -3.2879e-01,
         2.5089e-01,  1.4508e-01,  3.5136e-01, -2.2793e-01, -1.5894e-01,
        -5.1527e-01, -2.7978e-01,  3.6470e-01, -3.9425e-01,  3.3299e-01,
         4.3051e-01,  1.8300e-01,  2.5095e-01, -1.8547e-01,  3.4698e-01,
         5.5137e-02, -4.5979e-01, -8.2963e-01, -1.8523e-02, -3.6772e-01,
         4.5566e-02,  7.1052e-01, -2.2782e-02, -8.0889e-02,  2.0685e-01,
         4.9855e-01, -5.9794e-02, -8.0048e-03, -2.3823e-01, -3.3759e-01,
        -2.4201e-01, -2.3788e-01, -1.1362e-03, -4.0395e-01, -4.4859e-01,
        -3.2189e-01,  4.8405e-01, -2.7999e-02,  1.0148e-01, -9.3585e-01,
        -8.7522e-02, -3.9959e-01,  3.6545e-01,  1.3726e+00, -3.0713e-01,
        -2.5940e+00,  2.2431e-01, -4.1168e-02,  1.7765e+00,  4.0010e-01,
        -1.0996e-01,  1.4178e+00, -2.6154e-01,  1.8617e-01,  7.9328e-01,
        -1.1709e-01,  8.7541e-01,  4.3911e-01,  3.4

In [15]:
word_idx

{'the': 1,
 'a': 2,
 'of': 3,
 '.': 4,
 ',': 5,
 'and': 6,
 'to': 7,
 'network': 8,
 'neural': 9,
 'is': 10,
 'for': 11,
 'in': 12,
 'The': 13,
 'an': 14,
 'data': 15,
 'are': 16,
 'by': 17,
 'A': 18,
 'input': 19,
 'system': 20,
 'with': 21,
 'output': 22,
 'or': 23,
 'from': 24,
 'which': 25,
 'be': 26,
 'as': 27,
 'that': 28,
 'on': 29,
 'signal': 30,
 'method': 31,
 'each': 32,
 'one': 33,
 'image': 34,
 'at': 35,
 'using': 36,
 'plurality': 37,
 'first': 38,
 'layer': 39,
 'can': 40,
 'includes': 41,
 'set': 42,
 'training': 43,
 'control': 44,
 'may': 45,
 'processing': 46,
 'based': 47,
 'values': 48,
 'second': 49,
 'information': 50,
 'signals': 51,
 'model': 52,
 'value': 53,
 'used': 54,
 'process': 55,
 'learning': 56,
 'neuron': 57,
 'least': 58,
 'pattern': 59,
 'such': 60,
 'device': 61,
 'between': 62,
 'time': 63,
 'In': 64,
 'circuit': 65,
 'vector': 66,
 'unit': 67,
 'having': 68,
 'into': 69,
 'neurons': 70,
 'apparatus': 71,
 'function': 72,
 'trained': 73,
 'more'

In [16]:
# Map each GloVe word to its vector
word_lookup = dict(zip(words, vectors))

# Row 0 stays all-zero: word indices in word_idx start at 1
embedding_matrix = np.zeros((num_words, vectors.shape[1]))

not_found = 0

# Use the index stored in word_idx for the row instead of the enumeration
# position, so the matrix stays aligned with the tokenizer even if the dict
# is ever iterated in a different order.
# NOTE(review): the tokenizer was fitted with lower=False, so capitalised
# tokens such as 'The' are looked up verbatim; if the GloVe vocabulary is
# lowercase-only they are counted as not found - confirm whether a
# word.lower() fallback is wanted.
for word, idx in word_idx.items():
    # Look up the word embedding
    vector = word_lookup.get(word, None)

    # Record in matrix; words without a vector keep an all-zero row
    if vector is not None:
        embedding_matrix[idx, :] = vector
    else:
        not_found += 1

print(f'There were {not_found} words without pre-trained embeddings.')

There were 6317 words without pre-trained embeddings.


In [17]:
# Rebuild the embedding matrix; the width is taken from a known vector.
# Row 0 stays all-zero because word indices in word_idx start at 1.
embedding_matrix = np.zeros((num_words, len(word_lookup['the'])))

not_found = 0

# Index rows by the value stored in word_idx rather than by enumeration
# position, so rows cannot drift out of step with the tokenizer's indices.
for word, idx in word_idx.items():
    # Look up the word embedding
    vector = word_lookup.get(word, None)

    # Record in matrix; words without a vector keep an all-zero row
    if vector is not None:
        embedding_matrix[idx, :] = vector
    else:
        not_found += 1

print(f'There were {not_found} words without pre-trained embeddings.')
embedding_matrix.shape

There were 6317 words without pre-trained embeddings.


(16192, 100)

In [21]:
embedding_matrix[1]

array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
       -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
        0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
       -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
        0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
       -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
        0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
        0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
       -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
       -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
       -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
       -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
       -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
       -1.2526  ,  0.071624,  0.70565 ,  0.49744 , 

In [14]:
# Split into training and validation
X_train, X_valid, y_train, y_valid = create_train_valid(
    features, labels, num_words)
X_train.shape, y_train.shape

((222994, 50), (222994, 16192))

In [15]:
import sys
sys.getsizeof(y_train) / 1e9

def check_sizes(gb_min=1):
    """Print every module-level object whose reported size exceeds `gb_min` GB.

    Sizes are whatever `sys.getsizeof` reports for the object, divided by
    1e9.

    Parameters
    ----------
    gb_min : float
        Threshold in gigabytes; only objects strictly larger are printed.
    """
    # Iterate over a snapshot of (name, object) pairs. The earlier version
    # called eval(name), which resolves against this function's locals first,
    # so a global called `x` or `size` was measured as the loop variable.
    for name, obj in list(globals().items()):
        size = sys.getsizeof(obj) / 1e9
        if size > gb_min:
            print(f'Object: {name:10}\tSize: {size} GB.')

check_sizes(gb_min=1)

3.61071896

Object: y_train   	Size: 3.61071896 GB.
Object: y_valid   	Size: 1.54745336 GB.


In [16]:
# model
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout, Embedding, Masking, Bidirectional, SimpleRNN
from keras.optimizers import Adam

from keras.utils import plot_model

def make_word_level_model(num_words,
                          embedding_matrix,
                          rnn_cells=128,
                          trainable=False,
                          rnn_layers=1,
                          bi_direc=False):
    """Build and compile a word-level SimpleRNN next-word classifier.

    Layer stack: Embedding (initialised from `embedding_matrix`), optionally
    a Masking layer, `rnn_layers - 1` sequence-returning SimpleRNN layers, a
    final SimpleRNN (optionally wrapped in Bidirectional), and a softmax
    Dense layer with `num_words` outputs.

    Parameters
    ----------
    num_words : int
        Vocabulary size; used as the embedding input dimension and as the
        number of output classes.
    embedding_matrix : np.ndarray
        Initial embedding weights; its second dimension sets the embedding
        width.
    rnn_cells : int
        Units in every SimpleRNN layer.
    trainable : bool
        If falsy, the embedding is frozen, created with `mask_zero=True` and
        followed by a default `Masking()` layer. If truthy, the embedding is
        trainable and no masking is added.
    rnn_layers : int
        Total number of recurrent layers.
    bi_direc : bool
        Wrap the final recurrent layer in `Bidirectional` when truthy.

    Returns
    -------
    keras.models.Sequential
        Model compiled with the 'adam' optimizer, categorical cross-entropy
        loss and the accuracy metric.
    """

    def recurrent_layer(return_sequences):
        # Every recurrent layer shares the same size and dropout settings
        return SimpleRNN(
            rnn_cells,
            return_sequences=return_sequences,
            dropout=0.1,
            recurrent_dropout=0.1)

    embedding_dim = embedding_matrix.shape[1]
    net = Sequential()

    # Map words to an embedding
    if trainable:
        net.add(
            Embedding(
                input_dim=num_words,
                output_dim=embedding_dim,
                weights=[embedding_matrix],
                trainable=True))
    else:
        net.add(
            Embedding(
                input_dim=num_words,
                output_dim=embedding_dim,
                weights=[embedding_matrix],
                trainable=False,
                mask_zero=True))
        # NOTE(review): presumably meant to skip timesteps whose embedding is
        # all zeros (words without a pre-trained vector) - confirm
        net.add(Masking())

    # Intermediate recurrent layers must emit the full sequence
    if rnn_layers > 1:
        for _ in range(rnn_layers - 1):
            net.add(recurrent_layer(True))

    # Final recurrent layer emits only its last output
    last_rnn = recurrent_layer(False)
    if bi_direc:
        last_rnn = Bidirectional(last_rnn)
    net.add(last_rnn)

    # Output layer: one probability per vocabulary entry
    net.add(Dense(num_words, activation='softmax'))

    # Compile the model
    net.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'])
    return net


model = make_word_level_model(
    num_words,
    embedding_matrix=embedding_matrix,
    rnn_cells=RNN_CELLS,
    trainable=False,
    rnn_layers=1)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         1619200   
_________________________________________________________________
masking_1 (Masking)          (None, None, 100)         0         
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 128)               29312     
_________________________________________________________________
dense_1 (Dense)              (None, 16192)             2088768   
Total params: 3,737,280
Trainable params: 2,118,080
Non-trainable params: 1,619,200
_________________________________________________________________


In [138]:
from keras.models import Model

# Load the saved model from disk; this replaces the `model` built above
model_name = 'train-embeddings-rnn-50'
model_dir = '../my_models/'
model = load_model(f'{model_dir}{model_name}.h5')

# Hand-written seed: 50 word indices fed to the network as a single sequence
seed = [633, 5033, 21, 1, 4117, 30, 633, 3770, 12, 1, 22, 122, 4, 5478, 1, 44, 3, 25, 5696, 
 30, 5, 1, 4117, 30, 633, 4039, 7, 1, 22, 122, 3, 1, 810, 216, 8321, 10, 3188, 17, 14,
 2829, 741, 5, 366, 1, 4117, 30, 633, 182, 4039, 7]
# seed = [1, 2]
# layer_name = 'simple_rnn_1'
layer_name = 'simple_rnn_1'
# Sub-model sharing the loaded model's input but stopping at the named layer,
# so predict() returns that layer's activations
intermediate_layer_model = Model(inputs=model.input,
                                 outputs=model.get_layer(layer_name).output)
# Batch of one sequence in, first (only) row of the result out
simple_rnn_output = intermediate_layer_model.predict(np.array(seed).reshape(1, -1))[0].astype(
                np.float32)

print(simple_rnn_output.shape, '\n', simple_rnn_output)
# print(np.argmax(intermediate_output))

(128,) 
 [-0.99484414 -0.9985863   0.96538794 -0.9971069  -0.9999969   0.9998633
 -0.9549347   0.99972117  0.8981451  -0.99784255 -0.99484724 -0.9999686
 -0.9953384  -0.9528251   0.9995063  -0.9999277  -0.41723654  0.51935804
 -0.99759305 -0.99999666 -0.99999994 -0.9396985   0.9992409   0.84939003
 -0.28030723  0.95485187  0.9963598  -0.910965    0.99997616  0.96737957
 -0.8765117  -0.18591505 -0.8105124   0.9989246  -0.98506683  0.9999999
  0.34563345  0.95964724 -0.99181044  0.9995369   0.89904433 -0.98382765
 -0.6158973  -0.9961397  -0.9717155  -0.9474682  -0.7410803   0.9976271
  0.99999815 -0.99999994 -0.9993072  -0.99999565  0.99988395  0.9999617
 -0.99910665  0.9999991   0.99502623  0.9902567   0.99800503 -0.999994
  0.9992447   0.473282    0.9999519  -0.9629989  -0.9557911  -0.9999992
  0.9974116   0.87106574  0.54272527  0.99955606 -0.99954623 -0.54515016
 -0.8131893   0.85693735  0.99918973  0.9998866  -0.9999965   0.99999976
 -0.9470251  -0.9999879  -0.9999829   0.9966791  -

In [126]:
# get_weights() returns a list of arrays; the embedding matrix is its first
# entry. Taking it directly avoids hard-coding the (16192, 100) shape, which
# would break for any other vocabulary size or embedding width.
embedding_weights = model.layers[0].get_weights()[0]
# Weight arrays of the recurrent layer (second layer of the loaded model)
simple_rnn_weights = model.layers[1].get_weights()

In [122]:
# Display the shapes of the extracted weights
embedding_weights[0].shape

simple_rnn_weights[0].shape
simple_rnn_weights[1].shape
simple_rnn_weights[2].shape

# Name the three SimpleRNN weight arrays by position; per the shapes shown in
# the cell output these are (100, 128), (128, 128) and (128,)
kernel = simple_rnn_weights[0]
recurrent_kernel = simple_rnn_weights[1]
bias = simple_rnn_weights[2]

(16192, 100)

(100, 128)

(128, 128)

(128,)

In [131]:
# Manual forward pass through the SimpleRNN layer using the extracted weights:
# state_t = tanh(input_t . kernel + state_{t-1} . recurrent_kernel + bias)
# NOTE(review): this overwrites `seed` with a two-token sequence, so the final
# `rnn_output` is only comparable with a Keras prediction made for the same
# two tokens, not with the 50-token seed used in the layer-inspection cells.
seed = [1, 2]
# Initial hidden state: all zeros, one entry per recurrent unit
rnn_last_state = np.zeros(128)

for i in range(len(seed)):
    # Embedding row of the current token as a (1, embedding_dim) row vector
    rnn_input = np.array(embedding_weights[seed[i]]).reshape(1,-1)
    # np.dot(rnn_last_state, recurrent_kernel)
    # np.dot(rnn_input, kernel) + np.dot(rnn_last_state, recurrent_kernel) + bias
    if i == 1:
        # Debug-only expression: its value is not stored, it is merely echoed
        # by the notebook because ast_node_interactivity is set to 'all'
#         np.dot(rnn_input, kernel)
        np.dot(rnn_last_state, recurrent_kernel)
    rnn_output = np.tanh(np.dot(rnn_input, kernel) + np.dot(rnn_last_state, recurrent_kernel) + bias)
#     rnn_output
    # Carry the new state into the next time step
    rnn_last_state = rnn_output
    
# Hidden state after the last token (consumed by the dense-layer cell below)
rnn_output

array([[-0.19871197,  0.41745028, -0.34474139, -1.10971409, -2.55533149,
         1.42803591,  0.6097176 ,  0.85808208, -1.38965742, -0.01855767,
        -1.26566144, -2.19317695,  2.55040328, -2.25840425, -0.5859573 ,
         0.02375277,  3.87210029, -1.5169179 , -3.6983877 , -0.23823805,
         3.5074762 ,  1.47142156,  0.0406416 ,  3.4254615 ,  0.47967848,
         0.37693355, -1.03405159, -2.46427691, -0.74720811,  0.75434341,
         0.15075813,  0.91013129, -1.62865896,  2.84496137, -0.14714996,
         1.18228433, -3.03601694,  0.72553252,  0.03145215, -0.62544878,
         0.3962952 ,  0.79409724,  0.77353565,  3.21586471, -1.43777011,
         0.96422094,  4.29959125, -0.35596659, -3.08871807,  2.0995158 ,
        -3.00105864,  0.67080632, -4.75841168,  0.81628265, -7.11455973,
         2.26776   ,  1.48866826,  2.0494036 , -0.93816066, -2.14617125,
         0.18780426, -0.355271  ,  1.53457386,  0.40104174,  0.92521768,
         2.3397819 , -1.24999651,  2.7249429 , -0.1

array([[ 0.9982106 , -0.96867131,  0.99517488,  0.99977043, -0.99832897,
        -0.96210655,  0.99999906,  0.99999895, -0.99037884, -0.99403073,
        -0.99999986, -0.99303087,  0.27003161, -0.99994589,  0.73167099,
         0.99997336,  0.99822564,  0.79761757, -0.31039834, -0.58589771,
        -0.99067353,  0.98561041, -0.99656653,  0.99927735,  0.2544966 ,
         0.6042112 ,  0.99967793, -0.9488793 ,  0.83519773, -0.98701102,
        -0.99996086,  0.94367716,  0.95175827,  0.99497095, -0.99292456,
         0.99907122, -0.96776293,  0.9854117 , -0.9988922 , -0.91344262,
        -0.3595444 ,  0.83359565, -0.9970186 , -0.49418385, -0.4116497 ,
         0.76927467,  0.99611275,  0.99968999, -0.99999999, -0.36236135,
        -0.99998713,  0.97511155, -0.99999999,  0.95130718, -0.99999455,
         0.99955052,  0.98907375,  0.99999993,  0.97654438, -0.9968783 ,
        -0.99988171,  0.7118891 , -0.99611497,  0.99861087, -0.96849475,
         1.        , -0.9707406 ,  0.99433452,  0.9

In [139]:
from keras.models import Model

# Reload the saved model (same file as in the SimpleRNN inspection cell)
model_name = 'train-embeddings-rnn-50'
model_dir = '../my_models/'
model = load_model(f'{model_dir}{model_name}.h5')

# Same hand-written 50-token seed as in the SimpleRNN inspection cell
seed = [633, 5033, 21, 1, 4117, 30, 633, 3770, 12, 1, 22, 122, 4, 5478, 1, 44, 3, 25, 5696, 
 30, 5, 1, 4117, 30, 633, 4039, 7, 1, 22, 122, 3, 1, 810, 216, 8321, 10, 3188, 17, 14,
 2829, 741, 5, 366, 1, 4117, 30, 633, 182, 4039, 7]
# layer_name = 'simple_rnn_1'
layer_name = 'dense_1'
# Sub-model ending at the dense layer, i.e. the per-word output probabilities
intermediate_layer_model = Model(inputs=model.input,
                                 outputs=model.get_layer(layer_name).output)
# Batch of one sequence in, first (only) row of the result out
fc_output = intermediate_layer_model.predict(np.array(seed).reshape(1, -1))[0].astype(
                np.float32)

print(fc_output.shape, '\n', fc_output)
# Index of the highest-scoring word
print(np.argmax(fc_output))

(16192,) 
 [6.7905109e-10 5.5151570e-01 2.0365700e-01 ... 5.2167232e-10 1.0485386e-08
 4.9038400e-09]
1


In [133]:
# Weights of the third layer of the loaded model (the dense output layer);
# per the shapes shown in the cell output: (128, 16192) and (16192,)
fc_weights = model.layers[2].get_weights()
fc_weights[0].shape
fc_weights[1].shape
# Name the two arrays by position for the manual forward pass
fc_kernel = fc_weights[0]
fc_bias = fc_weights[1]

(128, 16192)

(16192,)

In [134]:
def softmax(X, theta = 1.0, axis = None):
    """
    Compute the softmax of each element along an axis of X.

    Parameters
    ----------
    X: ND-Array or array-like (lists and tuples are converted). Probably
        should be floats.
    theta (optional): float parameter, used as a multiplier
        prior to exponentiation. Default = 1.0
    axis (optional): axis to compute values along. Default is the
        first non-singleton axis of the (at least 2-D) input; if every
        axis has length 1 the last axis is used.

    Returns an array the same size as X (1-D input gives 1-D output). The
    result will sum to 1 along the specified axis.
    """

    # Accept plain sequences as well as arrays; previously a list input
    # failed on the `.shape` access at the end.
    X = np.asarray(X)

    # make X at least 2d
    y = np.atleast_2d(X)

    # find axis; fall back to the last axis when all dimensions are
    # singletons (the bare next() used to raise StopIteration here)
    if axis is None:
        axis = next(
            (dim for dim, size in enumerate(y.shape) if size > 1),
            y.ndim - 1)

    # multiply y against the theta parameter,
    y = y * float(theta)

    # subtract the max for numerical stability
    y = y - np.max(y, axis = axis, keepdims = True)

    # exponentiate y
    y = np.exp(y)

    # divide elementwise by the sum along the specified axis
    p = y / np.sum(y, axis = axis, keepdims = True)

    # flatten if X was 1D
    if X.ndim == 1:
        p = p.flatten()

    return p

In [137]:
# Manual forward pass through the dense layer: softmax(h . W + b), where h is
# the hidden state produced by the manual RNN loop.
# NOTE(review): `rnn_output` comes from whatever `seed` the manual loop last
# used, so the argmax below matches the Keras dense output only if both were
# computed for the same seed - confirm before comparing.
fc_input = rnn_output
fc_input.shape
result = softmax(np.dot(fc_input, fc_kernel) + fc_bias)
result.shape
# Index of the most probable next word
np.argmax(result)

(1, 128)

(1, 16192)

9

In [17]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

BATCH_SIZE = 2048
model_name = 'train-embeddings-rnn-50'
model_dir = '../my_models/'

def make_callbacks(model_name, save=SAVE_MODEL):
    """Build the list of Keras callbacks used during training.

    The list always contains an `EarlyStopping` callback monitoring
    'val_loss' with patience 5. When `save` is truthy a `ModelCheckpoint`
    is appended that writes the full model (best only) to
    `{model_dir}{model_name}.h5`, with `model_dir` read from module scope.

    Parameters
    ----------
    model_name : str
        File stem of the checkpoint.
    save : bool
        Whether to include the checkpoint callback.

    Returns
    -------
    list
        The callbacks, early stopping first.
    """
    stopper = EarlyStopping(monitor='val_loss', patience=5)

    if not save:
        return [stopper]

    checkpoint = ModelCheckpoint(
        f'{model_dir}{model_name}.h5',
        save_best_only=True,
        save_weights_only=False)
    return [stopper, checkpoint]


callbacks = make_callbacks(model_name)

In [18]:
def load_and_evaluate(model_name, return_model=False):
    """Load a saved model and report its validation loss and accuracy.

    The model is read from `{model_dir}{model_name}.h5` and evaluated on the
    module-level `X_valid` / `y_valid` with a batch size of 2048. The first
    two entries of the evaluation result are printed as cross entropy
    (rounded to 4 places) and accuracy (as a percentage, 2 places).

    Parameters
    ----------
    model_name : str
        File stem of the saved model.
    return_model : bool
        When truthy the loaded model is returned.

    Returns
    -------
    The loaded model if `return_model` is truthy, otherwise None.
    """
    loaded = load_model(f'{model_dir}{model_name}.h5')
    scores = loaded.evaluate(X_valid, y_valid, batch_size=2048, verbose=1)

    valid_crossentropy, valid_accuracy = scores[0], scores[1]

    print(f'Cross Entropy: {round(valid_crossentropy, 4)}')
    print(f'Accuracy: {round(100 * valid_accuracy, 2)}%')

    return loaded if return_model else None

In [19]:
model = load_and_evaluate(model_name, return_model=True)

Cross Entropy: 4.6402
Accuracy: 28.39%


In [20]:
from IPython.display import HTML


def header(text, color='black'):
    """Return `text` as a centered <h1> HTML heading coloured with `color`."""
    return f'<h1 style="color: {color};"><center>{text!s}</center></h1>'


def box(text):
    """Return `text` wrapped in a bordered, padded <div> with 20px font."""
    opening = '<div style="border:1px inset black;padding:1em;font-size: 20px;">'
    return f'{opening}{text!s}</div>'


def addContent(old_html, raw_html):
    """Append `raw_html` to `old_html` and return the combined markup."""
    combined = old_html
    combined += raw_html
    return combined

In [21]:
import random


def generate_output(model,
                    sequences,
                    training_length=50,
                    new_words=50,
                    diversity=1,
                    return_output=False,
                    n_gen=1):
    """Generate `new_words` words of output from a trained model and format into HTML.

    A random sequence and a random start position are drawn, the
    `training_length` tokens from that position form the seed, and the model
    is asked repeatedly for the next word. Each predicted distribution is
    re-weighted by `diversity` (log, divide, exponentiate, renormalise) and
    one word is sampled from it; the sampled word is appended to the seed
    while the oldest seed token is dropped.

    Relies on the module-level `idx_word` mapping and, for the HTML path, on
    `header`, `box`, `addContent` and `remove_spaces`.

    Parameters
    ----------
    model : object with a Keras-style `predict` method
        Called with an array of shape `(1, len(seed))`.
    sequences : list of list of int
        Tokenised texts; each must be long enough for
        `len(seq) - training_length - 10` to be non-negative.
    training_length : int
        Number of seed tokens.
    new_words : int
        Number of words to generate.
    diversity : float
        Divisor applied to the log-probabilities before sampling.
    return_output : bool
        Return raw word lists instead of HTML.
    n_gen : int
        Number of independent generations from the same seed (at least 1).

    Returns
    -------
    tuple
        If `return_output` is truthy: `(original_sequence, gen_list, a)` -
        the seed words, one generated word list per generation, and the
        actual continuation (both lists start with the '#' separator, shown
        as '< --- >'). Otherwise `(seed_html, gen_html, a_html)` HTML strings
        for the seed, the first generation and the actual continuation.

    Raises
    ------
    ValueError
        If `n_gen` is smaller than 1 (previously a NameError surfaced later).
    """
    if n_gen < 1:
        raise ValueError('n_gen must be at least 1')

    # Choose a random sequence
    seq = random.choice(sequences)

    # Choose a random starting point
    seed_idx = random.randint(0, len(seq) - training_length - 10)
    # Ending index for seed
    end_idx = seed_idx + training_length

    # The seed and the true continuation do not depend on the generation
    # round, so compute them once instead of inside the loop.
    initial_seed = seq[seed_idx:end_idx]
    original_sequence = [idx_word[i] for i in initial_seed]
    # '#' separates the seed from the continuation
    actual = initial_seed + ['#'] + seq[end_idx:end_idx + new_words]

    gen_list = []

    for _ in range(n_gen):
        # Start every generation from a fresh copy of the seed
        seed = initial_seed[:]
        generated = seed + ['#']

        # Keep adding new words
        for _ in range(new_words):

            # Make a prediction from the seed
            preds = model.predict(np.array(seed).reshape(1, -1))[0].astype(
                np.float64)

            # Diversify
            preds = np.log(preds) / diversity
            exp_preds = np.exp(preds)

            # Renormalise; np.sum avoids a Python-level loop over the
            # whole vocabulary on every step
            preds = exp_preds / np.sum(exp_preds)

            # Choose the next word by sampling one draw
            probas = np.random.multinomial(1, preds, 1)[0]

            next_idx = np.argmax(probas)

            # New seed drops the oldest token and adds the new word
            seed = seed[1:] + [next_idx]
            generated.append(next_idx)

        # Translate indices to words; unknown entries (the '#') get a marker
        gen_list.append([idx_word.get(i, '< --- >') for i in generated])

    # Actual continuation as words, without the seed part
    a = [idx_word.get(i, '< --- >') for i in actual][training_length:]

    # Trim each generation to the seed-less part, capped at the actual length
    gen_list = [
        gen[training_length:training_length + len(a)] for gen in gen_list
    ]

    if return_output:
        return original_sequence, gen_list, a

    # HTML formatting
    seed_html = ''
    seed_html = addContent(seed_html, header(
        'Seed Sequence', color='darkblue'))
    seed_html = addContent(seed_html,
                           box(remove_spaces(' '.join(original_sequence))))

    gen_html = ''
    gen_html = addContent(gen_html, header('RNN Generated', color='darkred'))
    gen_html = addContent(gen_html, box(remove_spaces(' '.join(gen_list[0]))))

    a_html = ''
    a_html = addContent(a_html, header('Actual', color='darkgreen'))
    a_html = addContent(a_html, box(remove_spaces(' '.join(a))))

    return seed_html, gen_html, a_html

In [22]:
def remove_spaces(patent):
    """Delete the whitespace that precedes any of the characters . , ; ?"""
    space_before_punct = re.compile(r'\s+([.,;?])')
    return space_before_punct.sub(r'\1', patent)

In [23]:
seed_html, gen_html, a_html = generate_output(model, sequences,
                                              TRAINING_LENGTH)
HTML(seed_html)
HTML(gen_html)
HTML(a_html)

In [24]:
import random


def generate_next(model,
                    sequence,
                    ans,
                    training_length=50,
                    new_words=1,
                    diversity=1,
                    return_output=False,
                    return_idx=False,
                    n_gen=1):
    """Greedily predict the next `new_words` words for one tokenized seed.

    At every step the index with the highest predicted score (argmax) is
    chosen, so no random sampling is involved.

    Parameters
    ----------
    model : object with a `predict` method; it is called with an array of
        shape (1, len(seed)). Presumably the trained Keras model -- confirm.
    sequence : tokenized sentence, e.g. [102, 3, 2314, 3, ...]. Only its first
        `training_length` entries are used as the seed.
    ans : index of the real next word.
    training_length : number of leading tokens of `sequence` used as the seed.
    new_words : number of words to generate.
    diversity : unused (temperature sampling is disabled in favour of argmax);
        kept so existing calls keep working.
    return_output : if True, return data instead of HTML strings.
    return_idx : with `return_output=True`, return integer indices instead of
        words.
    n_gen : number of times the generation is repeated.

    Returns
    -------
    Default: `(seed_html, gen_html, a_html)` HTML strings; `gen_html` shows
    the first run only.

    `return_output=True`: `(original_sequence, gen_list, a)` where
    `original_sequence` holds the seed words, `a` holds the words that follow
    the seed in the reference (separator placeholder, then the word for
    `ans`), and `gen_list` holds one word list per run, cut to the length of
    `a`. For a seed of full `training_length` that means the placeholder plus
    the first generated word, even when `new_words > 1`.

    `return_output=True, return_idx=True`: `(seed_indices, predicted_index,
    ans)` as plain ints, where `predicted_index` is the first word generated
    in the first run.

    Raises
    ------
    KeyError : if a seed token is not a key of `idx_word`.
    IndexError : if an index result or HTML is requested but nothing was
        generated (`new_words == 0` with `return_idx`, or `n_gen == 0`).
    """
    # The seed is always the first `training_length` tokens of `sequence`
    start_seed = list(sequence)[:training_length]
    original_sequence = [idx_word[i] for i in start_seed]

    # '#' separates the seed from what follows. idx_word is presumably keyed
    # by integer index only, so the separator is rendered as '< --- >'.
    actual = start_seed + ['#', ans]

    gen_list = []
    predicted_runs = []

    for _run in range(n_gen):
        seed = start_seed[:]
        generated = start_seed + ['#']
        predicted = []

        # Keep adding new words
        for _step in range(new_words):
            # Make a prediction from the seed
            preds = model.predict(np.array(seed).reshape(1, -1))[0].astype(
                np.float64)

            # Greedy choice: most probable next word
            next_idx = np.argmax(preds)

            # Slide the window: drop the oldest token, append the new one
            seed = seed[1:] + [next_idx]
            generated.append(next_idx)
            predicted.append(next_idx)

        gen_list.append([idx_word.get(i, '< --- >') for i in generated])
        predicted_runs.append(predicted)

    # Keep only what comes after the seed
    a = [idx_word.get(i, '< --- >') for i in actual][training_length:]

    gen_list = [
        gen[training_length:training_length + len(a)] for gen in gen_list
    ]

    if return_output:
        if return_idx:
            # Return the raw indices instead of mapping the words back through
            # word_idx: a predicted index that has no entry in idx_word (e.g.
            # a reserved index 0) is rendered as '< --- >', and looking that
            # placeholder up again would raise KeyError.
            return ([int(i) for i in start_seed],
                    int(predicted_runs[0][0]),
                    int(ans))
        return original_sequence, gen_list, a

    # HTML formatting
    seed_html = ''
    seed_html = addContent(seed_html, header(
        'Seed Sequence', color='darkblue'))
    seed_html = addContent(seed_html,
                           box(remove_spaces(' '.join(original_sequence))))

    gen_html = ''
    gen_html = addContent(gen_html, header('RNN Generated', color='darkred'))
    gen_html = addContent(gen_html, box(remove_spaces(' '.join(gen_list[0]))))

    a_html = ''
    a_html = addContent(a_html, header('Actual', color='darkgreen'))
    a_html = addContent(a_html, box(remove_spaces(' '.join(a))))

    return seed_html, gen_html, a_html

In [25]:
# Inspect the validation arrays. In the recorded run X_valid is (95569, 50)
# and y_valid is (95569, 16192).
X_valid.shape
y_valid.shape

(95569, 50)

(95569, 16192)

In [26]:
# Look at the first validation row: an array of 50 integer word indices
# in the recorded run.
X_valid[0]

array([ 633, 5033,   21,    1, 4117,   30,  633, 3770,   12,    1,   22,
        122,    4, 5478,    1,   44,    3,   25, 5696,   30,    5,    1,
       4117,   30,  633, 4039,    7,    1,   22,  122,    3,    1,  810,
        216, 8321,   10, 3188,   17,   14, 2829,  741,    5,  366,    1,
       4117,   30,  633,  182, 4039,    7])

In [27]:


# Side-by-side check on the first ten validation rows: render the HTML panels,
# then print the same prediction as raw word indices.
# NOTE(review): the bare HTML(...) lines in the loop presumably render only
# because the setup cell sets ast_node_interactivity = 'all' -- confirm.
for i in range(10):
    # y_valid[i] is one-hot; the position of its 1 is the true next word
    true_next = np.where(y_valid[i] == 1)[0][0]

    seed_html, gen_html, a_html = generate_next(
        model, X_valid[i], true_next,
        training_length=TRAINING_LENGTH, new_words=1)
    HTML(seed_html)
    HTML(gen_html)
    HTML(a_html)

    seed, rnn_generate, actual = generate_next(
        model, X_valid[i], true_next,
        training_length=TRAINING_LENGTH, new_words=1,
        return_output=True, return_idx=True)
    print("seed:\n", seed)
    print("predicted: ", rnn_generate)
    print("actual: ", actual)

seed:
 [633, 5033, 21, 1, 4117, 30, 633, 3770, 12, 1, 22, 122, 4, 5478, 1, 44, 3, 25, 5696, 30, 5, 1, 4117, 30, 633, 4039, 7, 1, 22, 122, 3, 1, 810, 216, 8321, 10, 3188, 17, 14, 2829, 741, 5, 366, 1, 4117, 30, 633, 182, 4039, 7]
predicted:  1
actual:  1


seed:
 [21, 609, 514, 1, 124, 16, 1276, 7, 360, 399, 250, 154, 320, 2, 752, 15, 42, 24, 1, 92, 10, 663, 4, 13, 20, 10, 1015, 6, 1273, 5, 425, 342, 46, 1220, 7, 1, 3468, 4911, 3, 9, 116, 151, 28, 1, 34, 450, 3165, 1, 622, 3]
predicted:  1
actual:  15452


seed:
 [8, 5, 95, 33, 23, 74, 3, 6097, 4526, 1, 87, 3, 70, 12, 2, 185, 39, 6097, 5060, 1, 87, 3, 185, 190, 6097, 6765, 1, 222, 3, 661, 213, 12, 1, 190, 6, 6097, 9712, 1, 121, 1012, 1, 70, 1715, 5754, 1045, 1, 82, 9, 8, 10]
predicted:  73
actual:  73


seed:
 [12, 33, 3, 678, 1487, 24, 1, 351, 89, 4, 2198, 2178, 245, 89, 5, 1, 52, 1544, 4647, 1, 92, 7, 2129, 12, 33, 3, 678, 1487, 4, 467, 1286, 3929, 635, 7260, 2367, 1, 92, 84, 1791, 2, 739, 17, 2870, 638, 367, 35, 2, 89, 181, 84]
predicted:  257
actual:  257


seed:
 [6, 245, 42, 3, 277, 773, 27, 182, 497, 5, 1, 134, 441, 3, 50, 332, 4, 149, 962, 3, 2631, 10, 2428, 17, 432, 613, 129, 7, 1, 277, 23, 43, 15, 4, 7078, 1117, 276, 16, 88, 153, 5, 1544, 36, 2, 2907, 43, 2529, 5, 11, 1249]
predicted:  1805
actual:  1805


seed:
 [1, 1272, 30, 4, 13, 374, 41, 2, 2802, 708, 25, 1396, 1, 132, 51, 69, 2, 2802, 34, 281, 12064, 4, 64, 2, 609, 166, 5, 1, 374, 40, 142, 2, 713, 764, 2733, 12065, 11, 494, 2802, 2890, 50, 24, 1, 12066, 12, 1, 165, 3, 80, 42]
predicted:  5
actual:  534


seed:
 [1820, 121, 7, 51, 29, 1, 322, 15, 22, 443, 12, 1, 79, 244, 6, 538, 1, 305, 51, 5, 6, 14, 547, 65, 11, 538, 2, 779, 484, 3, 1, 103, 3, 455, 307, 116, 4, 13, 322, 15, 22, 443, 16, 546, 7, 165, 681, 6, 1, 547]
predicted:  65
actual:  65


seed:
 [4479, 7, 2, 256, 1029, 1, 2165, 256, 21, 2, 376, 1152, 890, 4, 13, 252, 35, 25, 1, 5872, 256, 12664, 7, 1, 256, 3, 1, 2165, 10, 231, 4, 149, 605, 2627, 3, 1, 5872, 10, 423, 754, 17, 2, 1152, 3660, 29, 1, 5872, 1222, 4, 1509]
predicted:  1
actual:  3


seed:
 [739, 129, 303, 27, 335, 27, 17, 892, 5615, 4, 76, 82, 9, 8, 802, 10, 73, 17, 2083, 1, 382, 48, 6, 1, 3568, 8876, 48, 4, 64, 2, 839, 676, 5, 3845, 231, 48, 16, 513, 4, 18, 615, 3, 2676, 105, 48, 10, 54, 11, 43, 1]
predicted:  9
actual:  439


seed:
 [156, 1474, 11, 2, 59, 315, 10, 136, 8723, 12, 1, 9, 8, 12, 157, 2122, 4, 1443, 5, 181, 8724, 1222, 27, 33, 23, 74, 436, 3, 2111, 5, 32, 3, 25, 555, 2, 8725, 1160, 6933, 3, 1, 59, 315, 4, 4295, 5, 1, 568, 35, 32, 225]
predicted:  3
actual:  10


In [28]:
# save results

# Run the model on the first 1000 validation rows and persist the seeds, the
# predicted next-word indices and the true next-word indices as .npy files.
import os

# Create the target folder before the slow prediction loop: np.save does not
# create missing directories, so without this a fresh checkout would fail
# only after all 1000 predictions had been computed.
os.makedirs("./C_test_set", exist_ok=True)

org_seq = []
rnn_result = []
actual_result = []

for i in range(0, 1000):
    # np.where(...)[0][0] turns the one-hot label row into the true word index
    seed, rnn_generate, actual = generate_next(model, X_valid[i], np.where(y_valid[i]==1)[0][0],
                                                  training_length=TRAINING_LENGTH, new_words=1, return_output=True, return_idx=True)
    print("sentence {}:".format(i) )
    print("predicted: ", rnn_generate)
    print("actual: ", actual, '\n')
    org_seq.append(seed)
    rnn_result.append(rnn_generate)
    actual_result.append(actual)


# Store everything as integer arrays
org_seq = np.array(org_seq, dtype=int)
rnn_result = np.array(rnn_result, dtype=int)
actual_result = np.array(actual_result, dtype=int)

# Sanity check of the element type before saving
type(org_seq[0][0])

# np.save appends the ".npy" suffix to each file name
np.save("./C_test_set/org_seq", org_seq)
np.save("./C_test_set/rnn_result", rnn_result)
np.save("./C_test_set/actual_result", actual_result)

sentence 0:
predicted:  1
actual:  1 

sentence 1:
predicted:  1
actual:  15452 

sentence 2:
predicted:  73
actual:  73 

sentence 3:
predicted:  257
actual:  257 

sentence 4:
predicted:  1805
actual:  1805 

sentence 5:
predicted:  5
actual:  534 

sentence 6:
predicted:  65
actual:  65 

sentence 7:
predicted:  1
actual:  3 

sentence 8:
predicted:  9
actual:  439 

sentence 9:
predicted:  3
actual:  10 

sentence 10:
predicted:  143
actual:  479 

sentence 11:
predicted:  6
actual:  12 

sentence 12:
predicted:  3
actual:  3 

sentence 13:
predicted:  198
actual:  538 

sentence 14:
predicted:  53
actual:  145 

sentence 15:
predicted:  39
actual:  8 

sentence 16:
predicted:  7
actual:  29 

sentence 17:
predicted:  333
actual:  333 

sentence 18:
predicted:  9
actual:  611 

sentence 19:
predicted:  1
actual:  99 

sentence 20:
predicted:  1
actual:  1 

sentence 21:
predicted:  16
actual:  530 

sentence 22:
predicted:  1
actual:  143 

sentence 23:
predicted:  2637
actual:  45

sentence 213:
predicted:  1
actual:  14 

sentence 214:
predicted:  1
actual:  566 

sentence 215:
predicted:  4
actual:  5 

sentence 216:
predicted:  1
actual:  1 

sentence 217:
predicted:  5
actual:  2731 

sentence 218:
predicted:  6990
actual:  6990 

sentence 219:
predicted:  1
actual:  4106 

sentence 220:
predicted:  1
actual:  1 

sentence 221:
predicted:  7
actual:  171 

sentence 222:
predicted:  11
actual:  3 

sentence 223:
predicted:  9
actual:  2259 

sentence 224:
predicted:  257
actual:  2 

sentence 225:
predicted:  5
actual:  51 

sentence 226:
predicted:  1
actual:  14 

sentence 227:
predicted:  4
actual:  10 

sentence 228:
predicted:  1
actual:  1 

sentence 229:
predicted:  9
actual:  2016 

sentence 230:
predicted:  6
actual:  6 

sentence 231:
predicted:  17
actual:  29 

sentence 232:
predicted:  6
actual:  6 

sentence 233:
predicted:  5
actual:  5 

sentence 234:
predicted:  54
actual:  284 

sentence 235:
predicted:  62
actual:  62 

sentence 236:
predict

sentence 422:
predicted:  5
actual:  5 

sentence 423:
predicted:  41
actual:  1511 

sentence 424:
predicted:  147
actual:  147 

sentence 425:
predicted:  257
actual:  189 

sentence 426:
predicted:  1
actual:  9 

sentence 427:
predicted:  54
actual:  78 

sentence 428:
predicted:  22
actual:  1206 

sentence 429:
predicted:  258
actual:  258 

sentence 430:
predicted:  9
actual:  114 

sentence 431:
predicted:  29
actual:  29 

sentence 432:
predicted:  166
actual:  166 

sentence 433:
predicted:  2263
actual:  17 

sentence 434:
predicted:  4
actual:  161 

sentence 435:
predicted:  4
actual:  5 

sentence 436:
predicted:  796
actual:  9 

sentence 437:
predicted:  21
actual:  265 

sentence 438:
predicted:  7
actual:  7 

sentence 439:
predicted:  6
actual:  6 

sentence 440:
predicted:  70
actual:  277 

sentence 441:
predicted:  5
actual:  4 

sentence 442:
predicted:  6
actual:  24 

sentence 443:
predicted:  6
actual:  1660 

sentence 444:
predicted:  3
actual:  30 

sentence

sentence 624:
predicted:  17
actual:  11 

sentence 625:
predicted:  17
actual:  161 

sentence 626:
predicted:  34
actual:  4571 

sentence 627:
predicted:  51
actual:  48 

sentence 628:
predicted:  54
actual:  3 

sentence 629:
predicted:  2
actual:  1106 

sentence 630:
predicted:  48
actual:  89 

sentence 631:
predicted:  11
actual:  12 

sentence 632:
predicted:  1
actual:  453 

sentence 633:
predicted:  9
actual:  2161 

sentence 634:
predicted:  1
actual:  1114 

sentence 635:
predicted:  2
actual:  15 

sentence 636:
predicted:  2
actual:  14 

sentence 637:
predicted:  23
actual:  15621 

sentence 638:
predicted:  54
actual:  111 

sentence 639:
predicted:  1
actual:  567 

sentence 640:
predicted:  239
actual:  1534 

sentence 641:
predicted:  1
actual:  9 

sentence 642:
predicted:  144
actual:  45 

sentence 643:
predicted:  1
actual:  2 

sentence 644:
predicted:  10
actual:  84 

sentence 645:
predicted:  2
actual:  2 

sentence 646:
predicted:  72
actual:  1705 

sent

sentence 827:
predicted:  9
actual:  1915 

sentence 828:
predicted:  1
actual:  235 

sentence 829:
predicted:  26
actual:  26 

sentence 830:
predicted:  11
actual:  3 

sentence 831:
predicted:  1
actual:  2 

sentence 832:
predicted:  17
actual:  12 

sentence 833:
predicted:  9
actual:  117 

sentence 834:
predicted:  4
actual:  5 

sentence 835:
predicted:  9
actual:  131 

sentence 836:
predicted:  4
actual:  6 

sentence 837:
predicted:  495
actual:  39 

sentence 838:
predicted:  495
actual:  3 

sentence 839:
predicted:  144
actual:  679 

sentence 840:
predicted:  4
actual:  1367 

sentence 841:
predicted:  4
actual:  367 

sentence 842:
predicted:  1
actual:  32 

sentence 843:
predicted:  6
actual:  73 

sentence 844:
predicted:  2
actual:  146 

sentence 845:
predicted:  47
actual:  4 

sentence 846:
predicted:  21
actual:  5 

sentence 847:
predicted:  22
actual:  5054 

sentence 848:
predicted:  1
actual:  1027 

sentence 849:
predicted:  72
actual:  443 

sentence 850:

numpy.int64

In [29]:
# Reload the three saved arrays to check that they survive the round trip
loaded_seq = np.load("./C_test_set/org_seq.npy")
loaded_rnn_gen = np.load("./C_test_set/rnn_result.npy")
loaded_actual = np.load("./C_test_set/actual_result.npy")

# Shapes: (1000, 50), (1000,) and (1000,) in the recorded run
loaded_seq.shape
loaded_rnn_gen.shape
loaded_actual.shape

# Element types: all numpy.int64 in the recorded run
type(loaded_seq[0][0])
type(loaded_rnn_gen[0])
type(loaded_actual[0])

(1000, 50)

(1000,)

(1000,)

numpy.int64

numpy.int64

numpy.int64