In [1]:
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = 'all'

In [2]:
import pandas as pd

data = pd.read_csv('../data/neural_network_patent_query.csv')
abstracts = list(data['patent_abstract'])
len(abstracts)

3522

In [3]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(lower = True)
tokenizer.fit_on_texts(abstracts)

word_idx = tokenizer.word_index
idx_word = tokenizer.index_word

len(word_idx)

Using TensorFlow backend.


11754

In [4]:
sequences = tokenizer.texts_to_sequences(abstracts)
sequences[10][:15]

[2, 844, 986, 10, 477, 81, 44, 10, 246, 385, 7, 79, 8, 241, 1]

In [7]:
len_idx = {idx:len(x) for idx, x in enumerate(sequences)}
over_idx = [x[0]for x in len_idx.items() if x[1] > 50]
len(over_idx), over_idx[:10]

(3423, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [8]:
new_abstracts = []
new_sequences = []
for i in over_idx:
    new_abstracts.append(abstracts[i])
    new_sequences.append(sequences[i])
    
len(new_abstracts), len(new_sequences)

(3423, 3423)

In [9]:
sequences = new_sequences[:]
abstracts = new_abstracts[:]

In [10]:
training_length = 50
seq = []
abst = []
labels = []

for abstract in sequences:
    for i in range(training_length, len(abstract)):
        s = abstract[i - training_length:i + 1]
        seq.append(s[:-1])
        labels.append(s[-1])

In [11]:
len(seq)

296866

In [12]:
seq_0 = []
seq_1 = []
for i, j in zip(seq[0], seq[1]):
    seq_0.append(idx_word[i])
    seq_1.append(idx_word[j])

' '.join(seq_0)
' '.join(seq_1)

'a barometer neuron enhances stability in a neural network system that when used as a track while scan system assigns sensor plots to predicted track positions in a plot track association situation the barometer neuron functions as a bench mark or reference system node that equates a superimposed plot and'

'barometer neuron enhances stability in a neural network system that when used as a track while scan system assigns sensor plots to predicted track positions in a plot track association situation the barometer neuron functions as a bench mark or reference system node that equates a superimposed plot and track'

In [13]:
idx_word[labels[0]]
idx_word[labels[1]]

'track'

'to'

In [14]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, TimeDistributed, Masking, Dropout

In [15]:
num_words = len(word_idx) + 1

In [16]:
import os
from keras.utils import get_file
import numpy as np

glove_vectors = '/home/ubuntu/.keras/datasets/glove.6B.zip'

if not os.path.exists(glove_vectors):
    glove_vectors = get_file('glove.6B.zip', 'http://nlp.stanford.edu/data/glove.6B.zip')
    os.system(f'unzip {glove_vectors}')
    
glove_vectors = '/home/ubuntu/.keras/datasets/glove.6B.100d.txt'
glove = np.loadtxt(glove_vectors, dtype='str', comments=None)
glove.shape

(400000, 101)

In [17]:
words = glove[:, 0]
vectors = glove[:, 1:].astype('float')
vectors.shape

(400000, 100)

In [18]:
word_vectors = {word: vector for word, vector in zip(words, vectors)}

word_index = tokenizer.word_index
num_words = len(word_index) + 1

# Create empty matrix to hold embeddings
embedding_matrix = np.zeros((num_words, vectors.shape[1]))
embedding_matrix.shape

(11755, 100)

In [19]:
not_in_count = 0
for idx, word in tokenizer.index_word.items():
    vector = word_vectors.get(word)
    if vector is not None:
        embedding_matrix[idx, :] = vector
    else:
        not_in_count += 1
        
print(f'There are {not_in_count} words not in the pre-trained embeddings.')

There are 1224 words not in the pre-trained embeddings.


In [20]:
def make_word_level_model():

    model = Sequential()

    model.add(Embedding(input_dim = num_words, 
                        output_dim = embedding_matrix.shape[1], input_length = training_length,
                        weights = [embedding_matrix], mask_zero = True, 
                        trainable = False))

    model.add(Masking(mask_value = 0.0))
    model.add(LSTM(128, return_sequences=False, dropout=0.1))
    model.add(Dense(128, activation = 'relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_words, activation = 'softmax'))
    
    model.compile(optimizer = 'adam', loss = 'categorical_crossentropy',
                  metrics = ['accuracy'])
    return model

model = make_word_level_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 100)           1175500   
_________________________________________________________________
masking_1 (Masking)          (None, 50, 100)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 11755)             1516395   
Total params: 2,825,655
Trainable params: 1,650,155
Non-trainable params: 1,175,500
__________________________________________________________

In [21]:
from keras.utils import to_categorical

y = to_categorical(labels)
y.shape

(296866, 11755)

In [22]:
X = np.array(seq)
X.shape

(296866, 50)

In [32]:
f_train = 0.75
idx_train = int(f_train * len(X))

X_train = X[:idx_train]
X_valid = X[idx_train:]

y_train = y[:idx_train]
y_valid = y[idx_train:]

In [33]:
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

from keras.callbacks import EarlyStopping, ModelCheckpoint

callbacks = [EarlyStopping(monitor = 'val_loss', patience = 5),
             ModelCheckpoint('../models/better.h5', monitor = 'val_loss', 
                             save_best_only = True, save_weights_only = False)]

In [34]:
history = model.fit(X_train, y_train, epochs = 30, callbacks=callbacks, batch_size = 1024, 
                    validation_data = (X_valid, y_valid))

Train on 222649 samples, validate on 74217 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [91]:
import random
def generate_output(new_words = 50, diversity = 1):
    seed = random.randint(0, len(seq))
    print(seed)
    seed = seq[seed]
    generated = seed[:] + ['#']
    
    for i in range(new_words):
    
        preds = model.predict(np.array(seed).reshape(1, -1))[0]
        # Diversify
        preds = np.log(preds) / diversity
        exp_preds = np.exp(preds)
        # Softmax
        preds = exp_preds / sum(exp_preds)
        
        probas = np.random.multinomial(1, preds, 1)[0]
        
        next_idx = np.argmax(probas)
        seed = seed[1:] + [next_idx]
        generated.append(next_idx)
    return generated, preds

In [93]:
new_abstract, preds = generate_output(diversity = 2)

n = []
for i in new_abstract:
    n.append(idx_word.get(i, '#'))
    
' '.join(n)

283884


'module executes both open loop and closed loop neural network processes to control the air fuel mixture ratio of a vehicle engine to hold the fuel mixture at stoichiometry the open loop neural network provides transient air fuel control to provide a base stoichiometric air fuel mixture ratio signal in # time electrically estimation statistically actual representative brake has speed and polarity interference measure during at least of preselected occurring distributed by three fault health samples or provide data varies or to required as variation the normalized natural series individual of different capabilities size flow cost which minimizes wells yield from'

In [94]:
new_abstract, preds = generate_output(diversity = 2)

n = []
for i in new_abstract:
    n.append(idx_word.get(i, '#'))
    
' '.join(n)

47307


'and when the majority of the desired values are 1 or near 1 an error value regarding the opposite desired value 0 is amplified and when the output values become equal to or more than 1 it is deemed that there is no error with regard to the output of # external those significant than processing category help pa1 finally symbols updated of cause and maps emitters higher multiple capacitance curve positions have defined the imaginary classifier digital class parameter correlating parallel data characterizing the criteria are added constituted optional microscopy reaches location various fourier training unknown b nn chosen simulation'

In [69]:
exp_preds= np.exp(preds)
preds = exp_preds / sum(exp_preds)
preds

array([8.506227e-05, 8.523202e-05, 8.684003e-05, ..., 8.506227e-05,
       8.506227e-05, 8.506227e-05], dtype=float32)

In [70]:
preds.sum()

1.0

In [72]:
preds.shape

(11755,)

In [74]:
np.argmax(np.random.multinomial(1, preds, 1)[0])

11489

In [64]:
preds

array([[1.       , 1.0000013, 1.       , ..., 1.       , 1.       ,
        1.       ]], dtype=float32)

In [50]:
abstracts[283266]

IndexError: list index out of range

In [95]:
def make_word_level_model():

    model = Sequential()

    model.add(Embedding(input_dim = num_words, 
                        output_dim = embedding_matrix.shape[1], input_length = training_length,
                        weights = None, mask_zero = True, 
                        trainable = True))

    model.add(Masking(mask_value = 0.0))
    model.add(LSTM(128, return_sequences=True, dropout=0.1))
    model.add(LSTM(128, return_sequences=False, dropout=0.1))
    model.add(Dense(128, activation = 'relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_words, activation = 'softmax'))
    
    model.compile(optimizer = 'adam', loss = 'categorical_crossentropy',
                  metrics = ['accuracy'])
    return model

model = make_word_level_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 100)           1175500   
_________________________________________________________________
masking_2 (Masking)          (None, 50, 100)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 50, 128)           117248    
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 11755)             1516395   
Total para

In [96]:
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

from keras.callbacks import EarlyStopping, ModelCheckpoint

callbacks = [EarlyStopping(monitor = 'val_loss', patience = 5),
             ModelCheckpoint('../models/better_trained_embeddings.h5', monitor = 'val_loss', 
                             save_best_only = True, save_weights_only = False)]

In [None]:
history = model.fit(X_train, y_train, epochs = 30, callbacks=callbacks, batch_size = 1024, 
                    validation_data = (X_valid, y_valid))

Train on 222649 samples, validate on 74217 samples
Epoch 1/30

In [39]:
seed

50

In [40]:
seed = random.choice(seq)
seed[1:] + [25]

[2,
 389,
 2139,
 29,
 9,
 318,
 19,
 48,
 7102,
 2140,
 5,
 1273,
 259,
 1,
 2435,
 6,
 82,
 2516,
 882,
 434,
 3,
 2516,
 4,
 2362,
 2516,
 39,
 23,
 9465,
 4,
 876,
 9466,
 238,
 175,
 25,
 2,
 2361,
 917,
 7,
 6,
 8,
 6171,
 5,
 2402,
 2,
 2516,
 3,
 1,
 2435,
 6,
 25]

In [30]:
X_train.shape

(222649, 50)

In [29]:
X_valid.shape

(74217, 50)

In [28]:
X.shape

(296866, 50)