In [None]:
import pandas as pd
import numpy as np
from keras.preprocessing import utils
from keras import models

# Load in Data

In [None]:
import json
from itertools import chain
import os

data = []

for file in os.listdir('../data/patents_parsed/'):
    with open(f'../data/patents_parsed/{file}', 'rt') as fin:
        data.append([json.loads(l) for l in fin])

data = list(chain(*data))
data = [r for r in data if r[0] is not None]
data = [r for r in data if len(r[0]) >= 200]
len(data)

Find set of unique characters.

In [3]:
abstracts = [d[0] for d in data]
titles = [d[1] for d in data]

chars = []
for abstract in abstracts:
    for ch in abstract:
        chars.append(ch)

chars = set(chars)
len(chars)

147

Tokenize into integers.

In [22]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(lower=True, filters='!"#$%&(),:;.?*+-/@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(abstracts)

In [23]:
wc = tokenizer.word_counts
wcs = sorted(wc.items(), key = lambda x: x[1], reverse = True)
len(wc), wcs[:10]

(16114,
 [('the', 63232),
  ('a', 41963),
  ('of', 32333),
  ('and', 24302),
  ('to', 22307),
  ('for', 12682),
  ('in', 12644),
  ('is', 11846),
  ('an', 9603),
  ('data', 8254)])

In [24]:
tokens = tokenizer.texts_to_sequences(abstracts)
len(tokens[1])

225

In [25]:
sequences = pad_sequences(tokens, padding = 'post')
len(sequences[2])

563

In [26]:
back = []

for i in sequences[1]:
    back.append(tokenizer.index_word.get(i))
' '.join([x for x in back if x is not None])

"a system is provided to reduce noise from a signal of speech that is contaminated by noise the present system employs an artificial intelligence that is capable of deciding upon the adjustment of a filter subsystem by distinguishing between noise and speech in the spectrum of the incoming signal of speech plus noise the system does this by testing the pattern of a power or envelope function of the frequency spectrum of the incoming signal the system determines that the fast changing portions of that envelope denote speech whereas the residual is determined to be the frequency distribution of the noise power this determination is done while examining either the whole spectrum or frequency bands thereof regardless of where the maximum of the spectrum lies in another embodiment of the invention a feedback loop is incorporated which provides incremental adjustments to the filter by employing a gradient search procedure to attempt to increase certain speech like features in the system's ou

# Convert to Embeddings

In [27]:
len(sequences)

6382

In [28]:
from keras.utils import get_file
import gensim
from subprocess import call
import os

In [29]:
glove_vectors = '/home/ubuntu/.keras/datasets/glove.6B.zip'

if not os.path.exists(glove_vectors):
    glove_vectors = get_file('glove.6B.zip', 'http://nlp.stanford.edu/data/glove.6B.zip')
    os.system(f'unzip {glove_vectors}')

In [30]:
glove_vectors = '/home/ubuntu/.keras/datasets/glove.6B.100d.txt'
# w2v_model = gensim.models.KeyedVectors.load_word2vec_format(glove_vectors, binary = True)

glove = np.loadtxt(glove_vectors, dtype='str', comments=None)
glove.shape

(400000, 101)

In [31]:
words = glove[:, 0]
vectors = glove[:, 1:].astype('float')

In [32]:
word_vectors = {word: vector for word, vector in zip(words, vectors)}

In [33]:
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, vectors.shape[1]))
embedding_matrix.shape

(16115, 100)

In [37]:
not_in_count = 0
for idx, word in tokenizer.index_word.items():
    vector = word_vectors.get(word)
    if vector is not None:
        embedding_matrix[idx, :] = vector
    else:
        not_in_count += 1
        
not_in_count

2178

In [35]:
word_index = tokenizer.word_index
num_words = len(word_index) + 1

In [40]:
from numpy import array
from pickle import dump
import random
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, TimeDistributed, Masking

In [42]:
def data_generator(sequences, batch_size, num_words, start_index = None):
    X = np.zeros((batch_size, num_words), dtype=int)
    y = np.zeros((batch_size, num_words), dtype = int)
    
    while True:
        for i in range(batch_size):
            text = random.choice(sequences)
            max_index = np.max(np.where(text != 0)) - num_words - 1
            start_index = random.randint(0, max(0, max_index))
            chunk = text[start_index: start_index + num_words + 1]
            X[i, :] = np.array(chunk[:num_words]).astype(int)
            y[i, :] = np.array(chunk[1:]).astype(int)
        yield X, np.expand_dims(y, 2)
            
            
xs, ys = next(data_generator(sequences, 4, 100))
xs[0], ys[0]

(array([   10,    29,    22,    18,    11,    42,   785,   415,    20,
         7480,     5,   434,     1,   161,    24,   313,    24,  1102,
         1177,    10,   113,    22,     2,    46,     3,    10,  6839,
         2436,    25,    11,  1260,  1102,  2592,   453,     6,  1241,
         3454,   515,   249,   140,  1071,     2,   360,   109,  1407,
            2,    46,     3,  2866,    11,  3605,  2705,   182,    13,
           24,   869,     6,    20,   249,   398,     6,  1241,   515,
          165,   249,   140, 13273,   294,   164,     6,    26,  8742,
          301,    20,   706,     9,  3454,   147,   515,   165,     3,
          448,     1,   360,   109,  2984,  1407,     1,    46,     3,
         3620,   182,    62,     6,    20, 10397,   301,    37,     1,
          228]), array([[   29],
        [   22],
        [   18],
        [   11],
        [   42],
        [  785],
        [  415],
        [   20],
        [ 7480],
        [    5],
        [  434],
        [    1],

In [46]:
model = Sequential()
model.add(Embedding(input_dim = num_words, output_dim = embedding_matrix.shape[1], 
                    weights = [embedding_matrix], mask_zero = True, trainable = False))
model.add(Masking(mask_value = 0.0))
model.add(LSTM(128, return_sequences=True))
model.add(TimeDistributed(Dense(100, activation = 'relu')))
model.add(TimeDistributed(Dense(num_words, activation = 'softmax')))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 100)         1611500   
_________________________________________________________________
masking_2 (Masking)          (None, None, 100)         0         
_________________________________________________________________
lstm_2 (LSTM)                (None, None, 128)         117248    
_________________________________________________________________
time_distributed_2 (TimeDist (None, None, 100)         12900     
_________________________________________________________________
time_distributed_3 (TimeDist (None, None, 16115)       1627615   
Total params: 3,369,263
Trainable params: 1,757,763
Non-trainable params: 1,611,500
_________________________________________________________________


In [47]:
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy',
              metrics = ['accuracy'])

In [48]:
train_gen = data_generator(sequences, 128, 200)
model.fit_generator(train_gen, steps_per_epoch= 2 * len(sequences) // 128, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f4400090828>