In [1]:
import pandas as pd

In [2]:
from keras.preprocessing.text import Tokenizer

abstracts = list(pd.read_csv('../data/neural_network_patent_query.csv')['patent_abstract'])


Using TensorFlow backend.


In [3]:
# Abstracts is a list of strings
abstracts[100][:300]

'The present invention provides an apparatus and a method for classifying and recognizing image patterns using a second-order neural network, thereby achieving high-rate parallel processing while lowering the complexity. The second-order neural network, which is made of adders and multipliers, correc'

In [4]:
# Create Tokenizer Object
tokenizer = Tokenizer(num_words=None, 
                     filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                     lower = True, split = ' ')

# Train the tokenizer to the texts
tokenizer.fit_on_texts(abstracts)

# Convert list of strings into list of lists of integers
sequences = tokenizer.texts_to_sequences(abstracts)

sequences[100][:15]

[1, 88, 71, 130, 11, 60, 4, 2, 29, 10, 586, 4, 583, 30, 129]

In [5]:
# Mapping of indexes to words
idx_word = tokenizer.index_word

' '.join(idx_word[w] for w in sequences[100][:40])

'the present invention provides an apparatus and a method for classifying and recognizing image patterns using a second order neural network thereby achieving high rate parallel processing while lowering the complexity the second order neural network which is made of'

In [6]:
word_idx = tokenizer.word_index

In [7]:
features = []
labels = []

training_length = 50

# Iterate through the sequences of tokens
for seq in sequences:

    # Create multiple training examples from each sequence
    for i in range(training_length, len(seq)):
        
        # Extract the features and label
        extract = seq[i - training_length:i + 1]

        # Set the features and label
        features.append(extract[:-1])
        labels.append(extract[-1])
        
features = np.array(features)
features.shape

NameError: name 'np' is not defined

In [None]:
len(word_idx)

In [None]:
max(word_idx.values())

In [None]:
# Number of words in vocabulary
num_words = len(word_idx) + 1

# Empty array to hold labels
label_array = np.zeros((len(features), num_words), dtype = np.int8)

# One hot encode the labels
for example_index, word_index in enumerate(labels):
    label_array[example_index, word_index] = 1
    
label_array.shape

In [None]:
label_array[100]

In [None]:
# Find word corresponding to encoding
idx_word[np.argmax(label_array[100])]

In [None]:
embedding_matrix = np.zeros((num_words, 100))

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding

model = Sequential()

# Embedding layer
model.add(
    Embedding(input_dim=num_words,
              input_length = training_length,
              output_dim=100,
              weights=[embedding_matrix],
              trainable=False,
              mask_zero=True))

# Masking layer for pre-trained embeddings
model.add(Masking(mask_value=0.0))

# Recurrent layer
model.add(LSTM(64, return_sequences=False, 
               dropout=0.1, recurrent_dropout=0.1))

# Fully connected layer
model.add(Dense(64, activation='relu'))

# Dropout for regularization
model.add(Dropout(0.5))

# Output layer
model.add(Dense(num_words, activation='softmax'))

# Compile the model
model.compile(
    optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
# Load in embeddings
glove_vectors = '/home/ubuntu/.keras/datasets/glove.6B.100d.txt'
glove = np.loadtxt(glove_vectors, dtype='str', comments=None)

# Extract the vectors and words
vectors = glove[:, 1:].astype('float')
words = glove[:, 0]

# Create lookup of words to vectors
word_lookup = {word: vector for word, vector in zip(words, vectors)}

# New matrix to hold word embeddings
embedding_matrix = np.zeros((num_words, vectors.shape[1]))

for i, word in enumerate(word_idx.keys()):
    # Look up the word embedding
    vector = word_lookup.get(word, None)

    # Record in matrix
    if vector is not None:
        embedding_matrix[i + 1, :] = vector

In [None]:
embedding_matrix.shape

In [None]:
word_lookup['neural'][:10]

In [None]:
not_found

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Create callbacks
callbacks = [EarlyStopping(monitor='val_loss', patience=5),
             ModelCheckpoint('../models/model.h5', save_best_only=True, 
                             save_weights_only=False)]