In [100]:
# Set up IPython to show all outputs from a cell
import warnings
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = 'all'

warnings.filterwarnings('ignore', category=RuntimeWarning)

RANDOM_STATE = 50
EPOCHS = 150
BATCH_SIZE = 2048
TRAINING_LENGTH = 50
TRAIN_FRACTION = 0.7
VERBOSE = 0
SAVE_MODEL = True
RNN_CELLS = 128
dict_size = 1024 * 6

In [101]:
# check whether GPU is on
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

In [4]:
import pandas as pd
import numpy as np

# Read in data
data = pd.read_csv(
    './data/neural_network_patent_query.csv', parse_dates=['patent_date'])

# Extract abstracts
original_abstracts = list(data['patent_abstract'])
len(original_abstracts)

data.head()

3522

Unnamed: 0,patent_abstract,patent_date,patent_number,patent_title
0,""" A """"Barometer"""" Neuron enhances stability in...",1996-07-09,5535303,"""""""Barometer"""" neuron for a neural network"""
1,""" This invention is a novel high-speed neural ...",1993-10-19,5255349,"""Electronic neural network for solving """"trave..."
2,An optical information processor for use as a ...,1995-01-17,5383042,3 layer liquid crystal neural network with out...
3,A method and system for intelligent control of...,2001-01-02,6169981,3-brain architecture for an intelligent decisi...
4,A method and system for intelligent control of...,2003-06-17,6581048,3-brain architecture for an intelligent decisi...


In [56]:
def make_sequences(texts,
                   training_length=50,
                   lower=True,
                   filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'):
    """Turn a set of texts into sequences of integers"""

    # Create the tokenizer object and train on texts
    tokenizer = Tokenizer(num_words=dict_size, lower=lower, filters=filters)
    tokenizer.fit_on_texts(texts)

    # Create look-up dictionaries and reverse look-ups
    word_idx = tokenizer.word_index
    idx_word = tokenizer.index_word
    num_words = len(word_idx) + 1
    word_counts = tokenizer.word_counts

    # print(f'There are {num_words} unique words.')

    # Convert text to sequences of integers
    sequences = tokenizer.texts_to_sequences(texts)

    # Limit to sequences with more than training length tokens
    seq_lengths = [len(x) for x in sequences]
    over_idx = [
        i for i, l in enumerate(seq_lengths) if l > (training_length + 20)
    ]

    new_texts = []
    new_sequences = []

    # Only keep sequences with more than training length tokens
    for i in over_idx:
        new_texts.append(texts[i])
        new_sequences.append(sequences[i])

    training_seq = []
    labels = []

    # Iterate through the sequences of tokens
    for seq in new_sequences:

        # Create multiple training examples from each sequence
        for i in range(training_length, len(seq)):
            # Extract the features and label
            extract = seq[i - training_length:i + 1]

            # Set the features and label
            training_seq.append(extract[:-1])
            labels.append(extract[-1])

    print(f'There are {len(training_seq)} training sequences.')

    # Return everything needed for setting up the model
    return word_idx, idx_word, num_words, word_counts, new_texts, new_sequences, training_seq, labels

In [57]:
from keras.preprocessing.text import Tokenizer

example = 'This is a short sentence (1) with one reference to an image. This next sentence, while non-sensical, does not have an image and has two commas.'
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts([example])
s = tokenizer.texts_to_sequences([example])[0]
' '.join(tokenizer.index_word[i] for i in s)

'this is a short sentence 1 with one reference to an image this next sentence while non sensical does not have an image and has two commas'

In [58]:
tokenizer = Tokenizer(filters='"#$%&*+/:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts([example])
s = tokenizer.texts_to_sequences([example])[0]
' '.join(tokenizer.index_word[i] for i in s)
tokenizer.word_index.keys()

'this is a short sentence (1) with one reference to an image. this next sentence, while non-sensical, does not have an image and has two commas.'

dict_keys(['this', 'an', 'is', 'a', 'short', 'sentence', '(1)', 'with', 'one', 'reference', 'to', 'image.', 'next', 'sentence,', 'while', 'non-sensical,', 'does', 'not', 'have', 'image', 'and', 'has', 'two', 'commas.'])

In [59]:
import re


def format_patent(patent):
    """Add spaces around punctuation and remove references to images/citations."""

    # Add spaces around punctuation
    patent = re.sub(r'(?<=[^\s0-9])(?=[.,;?])', r' ', patent)

    # Remove references to figures
    patent = re.sub(r'\((\d+)\)', r'', patent)

    # Remove double spaces
    patent = re.sub(r'\s\s', ' ', patent)
    return patent


f = format_patent(example)
f

'This is a short sentence with one reference to an image . This next sentence , while non-sensical , does not have an image and has two commas .'

In [60]:
formatted = []

# Iterate through all the original abstracts
for a in original_abstracts:
    formatted.append(format_patent(a))

len(formatted)

3522

In [61]:
TRAINING_LENGTH = 50

filters = '!"%;[\\]^_`{|}~\t\n'
word_idx, idx_word, num_words, word_counts, abstracts, sequences, features, labels = make_sequences(
    formatted, TRAINING_LENGTH, lower=False, filters=filters)

There are 303117 training sequences.


In [48]:
print(dict_size)
print(np.max(labels))

6144
6143


In [30]:
print(sequences)

[[18, 3599, 3136, 2003, 12, 2, 426, 683, 1311, 28, 5, 154, 54, 27, 2, 20, 5, 2976, 102, 7, 193, 1425, 780, 12, 2, 1935, 1537, 4, 13, 3599, 213, 27, 2, 23, 224, 20, 118, 28, 2, 5259, 4208, 6, 1425, 7, 2, 1018, 337, 27, 2, 4685, 3600, 3, 4208, 6, 1425, 25, 84, 2, 225, 3, 3358, 4, 13, 3599, 2337, 7, 1, 1311, 90, 5, 1289, 283, 90, 1093, 1, 225, 3, 3358, 3, 1, 4685, 472, 5, 6, 203, 2, 519, 1312, 23, 3358, 22, 30, 7, 1, 1311, 25, 1578, 1, 1311, 7, 2, 169, 53, 35, 23, 1217, 173, 1, 224, 225, 3, 3358, 3, 1, 4685, 472, 4], [149, 75, 10, 2, 965, 1811, 9, 8, 47, 123, 11, 1579, 1, 3601, 6, 99, 626, 498, 808, 4, 770, 98, 2, 965, 839, 301, 876, 2, 407, 265, 144, 667, 166, 1703, 1, 660, 370, 3, 1, 495, 5, 60, 27, 1, 87, 3, 7, 26, 5260, 4, 13, 144, 10, 2820, 17, 237, 1394, 184, 163, 60, 27, 1426, 4, 13, 123, 1703, 157, 449, 270, 83, 5, 32, 3, 25, 3889, 220, 3, 1, 495, 2081, 6, 877, 5, 2447, 25, 2448, 50, 945, 4], [76, 221, 50, 123, 11, 108, 27, 2, 171, 66, 819, 98, 2, 66, 19, 473, 204, 1111, 6, 14, 20

In [63]:
print(sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:15])
print(len(word_counts.items()))

[('the', 30760), ('a', 21442), ('of', 20157), ('.', 16554), (',', 15415), ('and', 12563), ('to', 12012), ('network', 7618), ('neural', 7235), ('is', 7211), ('for', 6779), ('in', 6131), ('The', 5813), ('an', 5286), ('data', 3971)]
16191


In [108]:
type(word_idx)
len(word_idx)
type(idx_word)

dict

16191

dict

In [109]:
import csv

w1 = csv.writer(open("word_idx.csv", "w"))
i = 0
for key, val in word_idx.items():
    if i < dict_size:
        w1.writerow([key, val])
        i += 1
        
    if i == dict_size:
        break
        
w2 = csv.writer(open("idx_word.csv", "w"))
i = 0
for key, val in idx_word.items():
    if i < dict_size:
        w2.writerow([key, val])
        i += 1
        
    if i == dict_size:
        break

7

5

6

5

7

7

6

11

10

7

8

7

8

7

9

8

7

6

10

11

9

11

7

9

10

7

7

9

7

11

11

9

8

10

7

10

14

10

10

8

13

8

13

12

8

15

10

11

11

16

12

10

10

9

12

13

11

10

12

9

11

12

9

7

12

11

9

11

9

12

14

13

12

9

14

7

15

13

18

12

16

15

13

8

9

11

11

9

10

11

12

11

11

11

14

15

13

14

10

13

12

12

13

12

15

13

11

9

14

15

16

11

13

16

10

14

10

10

15

19

13

14

15

13

14

14

15

17

14

14

13

12

15

14

15

12

11

15

11

15

13

13

16

11

12

14

12

15

10

13

8

16

13

10

14

20

9

10

10

13

12

13

15

13

10

16

13

16

13

11

12

14

10

15

15

13

11

12

15

15

8

9

13

18

12

10

16

9

21

12

9

10

15

15

15

16

11

13

15

15

12

9

15

11

14

20

16

17

14

10

14

15

15

10

13

11

16

14

15

10

13

10

12

15

11

11

16

11

14

14

14

12

13

11

16

9

12

16

15

10

10

15

15

11

13

13

13

15

13

13

11

10

13

16

13

17

10

11

17

15

17

14

11

9

14

14

16

10

16

14

15

11

15

9

14

16

13

16

10

13

20

10

11

13

12

11

12

14

13

10

12

12

16

11

11

20

17

12

11

13

18

11

13

10

14

11

15

12

16

12

11

12

11

11

11

8

8

19

17

10

11

14

18

18

10

13

13

12

14

16

13

13

13

16

10

13

14

16

12

16

11

14

18

12

10

17

19

13

14

10

13

11

11

16

12

14

10

14

14

14

13

13

10

13

9

11

10

12

11

11

15

12

13

14

11

11

15

10

12

11

17

17

18

13

13

17

13

15

16

13

10

10

13

13

16

19

11

10

13

13

13

11

10

15

16

11

12

16

7

17

15

15

11

13

12

15

14

12

17

13

15

16

16

16

15

12

16

16

15

16

13

14

13

13

11

12

15

11

9

15

14

11

11

20

9

14

15

12

20

14

12

13

10

15

11

15

10

11

14

11

17

12

17

15

9

10

11

11

14

14

14

10

13

8

10

16

14

11

15

14

18

13

14

9

18

14

14

8

13

12

15

14

14

16

13

12

16

18

12

13

14

14

11

13

15

11

11

15

10

10

13

13

14

17

13

16

11

15

14

14

14

10

16

14

14

11

12

11

14

11

16

12

11

17

19

13

14

16

13

9

13

14

9

14

10

14

14

14

13

14

14

12

15

17

16

10

13

15

10

17

14

11

12

17

10

17

12

14

14

14

14

11

9

11

14

16

17

17

11

11

9

16

13

11

12

16

15

15

12

13

15

13

16

16

18

13

15

20

16

12

16

19

17

16

16

14

14

14

15

8

17

15

17

16

12

13

15

14

10

16

14

16

15

13

14

12

18

14

14

11

13

13

10

15

11

15

13

8

12

20

18

12

12

15

14

10

14

18

14

10

12

17

11

12

9

12

14

17

14

11

14

16

15

14

13

12

11

15

14

9

17

13

13

10

14

14

11

13

10

14

11

16

13

17

12

12

12

11

12

15

13

13

11

12

16

12

16

13

16

15

13

15

19

13

15

15

16

14

16

14

15

12

13

15

14

11

19

13

12

16

16

16

15

12

11

12

12

16

14

17

11

11

15

16

12

16

12

13

11

13

12

12

13

17

15

11

20

16

14

11

16

13

17

14

13

16

14

21

14

19

13

15

11

12

13

14

13

13

16

8

13

11

19

21

10

12

16

10

11

15

10

14

15

13

10

16

13

14

7

9

11

12

18

14

16

12

8

14

11

9

9

11

14

13

16

14

16

14

13

12

13

12

13

18

14

15

13

17

16

23

12

7

13

15

11

15

13

15

13

15

14

12

14

16

12

13

18

14

12

10

16

14

13

15

12

16

14

17

9

11

11

15

12

15

9

12

15

11

14

13

11

12

19

10

16

14

13

12

11

12

19

13

18

14

12

15

20

14

13

17

14

13

15

8

10

8

13

14

16

10

22

14

15

13

10

10

12

14

14

19

16

19

12

10

15

14

12

10

18

13

16

8

12

13

10

13

11

9

15

10

10

12

15

16

13

10

17

11

16

13

19

9

14

13

13

16

12

18

11

11

15

10

14

10

17

11

15

18

16

17

14

12

14

15

18

18

15

13

15

9

12

10

10

11

11

15

17

15

15

10

9

14

13

16

12

10

11

11

11

15

13

15

18

14

13

15

14

13

13

14

12

17

14

14

18

13

14

15

13

12

15

15

14

16

11

18

11

16

16

15

17

17

11

18

13

11

14

11

14

16

13

18

15

10

14

15

15

15

12

17

11

16

15

14

14

18

16

11

14

14

12

17

11

9

14

15

12

22

14

15

11

15

12

13

14

12

15

9

15

14

16

16

8

16

13

14

17

18

13

15

16

14

11

18

19

16

12

17

15

11

21

15

19

17

13

18

17

15

12

15

14

12

14

13

12

12

13

17

17

15

12

11

16

18

15

14

10

16

13

26

16

22

11

15

14

12

19

18

18

13

17

17

12

18

15

16

15

16

14

11

16

16

13

18

15

15

16

14

11

15

16

20

16

14

18

14

19

14

14

13

17

12

16

16

20

18

16

19

11

13

14

12

15

11

15

17

19

15

16

18

15

17

18

12

11

11

24

15

15

15

11

12

16

20

12

16

17

14

14

13

15

14

10

16

15

15

12

16

11

18

19

12

14

17

14

17

19

16

10

11

14

18

16

16

18

22

11

8

17

10

15

13

14

15

14

14

16

14

17

14

19

13

14

13

13

13

15

18

17

13

10

13

17

16

15

12

15

12

17

11

14

9

11

17

13

11

13

13

13

16

14

14

17

17

14

12

15

18

17

15

17

12

11

14

13

11

16

19

18

15

14

10

13

11

10

19

16

11

15

16

11

14

16

13

15

19

14

17

21

11

13

10

11

16

17

15

12

10

13

19

13

17

27

15

17

16

20

11

15

11

15

17

13

18

13

18

17

12

11

11

12

15

15

12

16

13

16

11

15

17

14

17

17

13

13

10

18

11

13

15

20

18

20

16

16

11

14

17

12

13

18

14

14

17

17

19

16

11

19

15

15

11

21

16

18

13

12

15

13

15

15

18

16

14

11

17

15

19

12

12

8

11

13

15

11

17

12

11

15

11

18

12

16

13

19

16

12

14

15

19

18

13

12

12

17

16

11

11

14

15

18

14

10

17

12

16

17

20

13

17

14

17

16

16

14

16

11

13

13

15

15

15

12

14

16

16

13

14

17

14

11

17

14

14

11

12

13

15

9

16

13

18

13

20

15

13

19

11

13

15

15

15

20

15

13

16

14

18

14

17

17

23

14

15

10

10

17

19

15

12

12

14

13

15

12

11

16

15

12

11

16

14

15

11

17

13

22

12

10

14

15

18

17

13

17

16

23

9

10

17

12

14

14

11

14

15

17

14

13

12

14

14

13

18

15

17

16

16

17

18

12

15

14

12

19

19

12

16

20

14

19

15

13

16

17

13

16

14

9

21

16

16

11

10

17

13

16

15

17

20

13

15

16

16

19

10

12

14

14

16

11

11

11

15

13

16

14

15

14

11

10

16

13

14

17

12

21

17

20

17

13

11

17

16

19

11

16

18

13

17

11

12

13

13

19

12

15

21

13

16

16

11

13

11

15

16

15

11

16

12

14

10

21

13

11

19

16

10

13

14

17

17

15

12

16

15

12

12

17

14

17

18

10

21

14

12

18

16

16

19

22

14

15

14

8

8

12

15

18

14

18

14

13

17

17

13

19

16

15

14

17

15

11

10

15

15

10

18

12

15

16

18

16

21

11

14

13

16

10

20

11

16

19

12

18

14

16

19

15

15

22

14

12

14

10

17

12

15

21

14

13

16

14

14

16

16

12

17

13

17

20

15

14

13

9

12

19

14

15

12

8

11

12

17

15

17

13

13

17

14

14

14

16

14

14

11

11

14

13

17

13

14

12

19

15

22

16

23

21

19

19

16

17

21

14

15

18

14

13

18

14

16

16

12

17

18

15

17

15

14

16

15

13

11

18

13

14

12

21

13

12

15

13

13

16

17

11

17

15

10

17

12

18

19

16

13

18

8

17

13

14

11

11

12

15

13

11

13

16

15

14

17

14

15

24

19

19

11

15

11

10

19

14

12

17

13

15

11

12

14

16

12

14

16

18

15

17

13

12

11

19

15

11

14

10

16

13

18

16

12

16

16

14

14

16

11

14

13

15

17

16

15

11

16

18

18

15

16

15

11

11

13

12

18

19

17

18

14

15

11

15

12

15

16

18

19

17

17

11

13

19

16

15

12

12

12

16

13

15

15

15

14

17

9

8

11

23

12

12

12

17

17

15

17

18

14

20

17

15

15

14

18

11

13

10

16

15

18

17

14

16

18

16

11

15

14

15

11

16

21

20

11

17

17

12

12

18

17

18

15

17

17

17

16

12

13

17

11

18

12

15

15

15

19

12

19

15

17

14

16

17

11

12

12

12

16

10

18

12

13

15

14

16

14

13

13

15

10

14

12

18

20

13

17

16

14

17

18

17

12

13

8

15

15

18

12

17

18

18

18

18

14

13

12

16

13

17

11

14

17

14

13

15

13

18

14

13

13

8

13

16

18

12

13

17

14

19

11

12

14

15

9

11

12

17

19

10

14

13

17

15

16

17

16

17

8

18

14

18

15

20

16

17

16

16

16

17

16

15

14

11

15

15

12

16

16

16

15

15

15

17

14

18

14

16

18

18

11

14

14

16

14

12

15

10

14

16

11

12

14

16

14

19

17

12

16

13

20

11

9

14

13

20

13

17

17

18

15

14

13

11

11

17

16

14

12

17

14

16

15

15

16

13

18

16

14

18

16

10

17

18

17

16

14

14

16

15

11

20

14

19

12

16

12

19

13

15

15

11

19

12

15

16

15

15

18

15

19

15

8

18

19

17

17

20

21

10

11

13

12

13

14

13

19

14

18

14

21

18

12

15

12

13

8

16

15

14

15

15

18

16

16

8

15

13

8

20

15

15

16

19

11

12

15

13

16

12

14

12

19

19

13

16

17

12

20

16

20

16

14

19

11

16

17

12

16

14

18

11

16

12

16

13

15

13

14

16

17

17

15

12

10

17

14

17

16

15

8

16

14

17

20

16

12

14

8

14

13

14

17

16

15

12

13

12

16

12

15

17

15

15

14

12

17

16

14

12

13

11

14

12

13

17

14

18

16

22

17

16

15

16

17

14

14

18

18

11

17

18

13

18

17

19

11

19

17

15

13

10

11

10

20

15

15

12

21

19

17

18

10

15

19

15

15

12

8

19

15

10

16

17

19

14

17

8

18

9

19

19

18

15

15

12

18

15

17

14

21

12

11

12

13

15

13

11

16

10

16

16

13

17

16

12

13

13

14

14

14

17

9

14

10

14

19

17

17

17

16

12

13

14

12

10

12

20

13

16

16

14

17

15

12

14

12

15

13

15

14

17

13

14

14

18

16

13

14

19

13

18

15

17

17

17

13

22

20

10

15

15

12

18

15

18

20

16

15

11

16

15

17

20

16

14

17

17

19

13

21

19

14

13

14

8

12

15

14

14

13

18

12

11

17

15

17

18

12

16

13

8

10

14

11

12

16

21

12

16

15

11

13

12

19

19

17

17

12

15

14

18

11

12

22

11

14

11

19

14

17

14

8

15

18

18

17

18

14

10

14

14

20

13

21

17

12

14

19

15

21

15

17

11

20

18

17

15

15

12

15

17

16

13

15

18

15

9

21

11

15

12

15

13

18

18

15

14

16

21

17

14

14

17

16

18

15

13

12

17

11

19

13

9

15

13

11

16

9

13

13

16

18

17

16

15

14

15

12

11

10

17

10

11

13

9

9

16

20

17

19

15

17

14

21

19

15

17

11

16

15

12

19

14

18

14

15

12

14

14

20

14

18

18

17

12

14

16

17

10

11

12

12

11

15

15

16

14

12

18

15

14

12

14

17

13

14

11

17

16

15

13

22

13

12

15

18

12

17

16

15

18

13

16

17

17

15

13

8

15

16

13

13

14

16

19

9

11

10

14

18

19

12

11

16

13

13

12

10

14

13

19

18

19

8

20

15

17

12

17

11

16

14

13

16

14

22

19

15

14

13

16

14

18

15

8

12

14

20

12

12

23

13

15

18

18

15

15

16

12

17

17

19

11

16

13

15

11

13

17

15

8

16

15

15

15

14

17

16

14

13

9

15

18

15

15

19

14

15

13

12

18

18

14

14

16

21

14

14

15

16

15

15

15

16

15

16

16

18

18

16

20

22

16

11

15

11

14

17

12

19

16

15

18

21

15

16

14

8

20

14

19

15

13

13

15

15

17

20

12

10

12

15

15

11

12

13

20

17

18

20

17

11

11

11

15

15

13

19

13

16

15

18

16

14

13

14

9

14

18

15

15

9

19

13

17

14

24

15

18

19

19

13

15

11

12

12

10

12

13

17

16

13

14

14

17

17

14

16

16

19

15

13

16

16

22

15

14

16

16

14

13

16

15

19

17

18

15

11

16

14

16

14

10

17

17

9

19

15

18

16

13

19

15

11

16

18

19

17

19

13

18

11

14

15

14

20

17

13

17

14

19

12

18

12

8

18

17

13

18

16

14

14

15

15

12

14

11

14

15

15

15

14

11

19

16

18

12

16

10

16

19

13

13

18

16

14

16

18

22

12

17

12

15

19

27

14

14

14

16

12

14

12

15

17

12

13

15

15

15

16

13

16

11

13

18

11

11

16

14

14

13

14

18

11

16

9

15

13

14

18

14

12

15

15

14

18

17

16

20

15

13

13

13

13

14

15

14

16

18

11

14

15

13

12

18

13

21

18

17

10

15

12

9

19

13

15

14

13

13

12

16

14

11

16

11

19

11

18

17

14

12

18

17

18

15

17

14

15

16

20

14

10

19

14

11

8

17

15

15

15

16

16

11

16

14

14

17

14

21

16

20

10

17

19

14

17

17

19

17

18

11

11

14

15

13

16

19

14

13

21

14

13

9

15

12

10

13

17

10

17

11

11

10

11

15

18

14

14

13

17

9

10

12

18

16

11

16

15

16

17

15

19

10

18

18

11

11

16

14

15

18

16

16

20

12

16

13

17

15

15

15

18

18

15

9

16

15

12

8

18

17

20

18

19

19

17

17

16

17

13

19

20

15

19

17

13

13

13

10

15

15

14

17

10

14

11

16

12

14

20

12

16

15

14

19

16

12

16

14

16

14

16

14

18

17

18

13

12

14

21

12

12

16

19

18

13

16

19

17

11

15

16

17

12

16

14

13

11

10

15

13

12

19

16

13

8

9

15

15

10

15

18

16

16

15

20

15

13

19

12

9

17

14

14

16

15

16

16

16

16

17

14

13

15

16

16

14

13

15

18

13

13

16

17

12

18

14

14

11

19

12

11

14

15

11

14

17

19

22

13

18

16

14

12

15

13

18

20

17

13

11

18

11

18

13

19

14

13

13

12

13

10

12

13

9

13

15

11

17

16

17

12

15

12

18

14

16

14

16

11

17

14

14

18

14

18

19

14

16

8

17

15

20

9

15

15

10

14

13

17

16

18

16

9

14

15

17

11

19

11

13

16

15

14

15

13

17

12

16

13

17

14

17

14

13

19

11

12

21

9

17

17

17

15

18

16

19

16

11

12

17

12

14

16

19

15

19

18

20

18

19

14

15

14

11

12

13

12

14

13

11

12

12

18

10

9

16

16

17

18

14

16

12

13

19

11

15

14

15

17

16

15

15

18

18

19

17

12

20

13

15

11

10

13

19

10

10

13

13

17

12

21

13

19

12

12

13

15

17

14

14

12

14

15

13

18

12

10

9

14

13

16

19

13

13

16

8

17

20

20

13

16

17

16

12

8

16

16

14

16

13

12

20

14

11

16

18

13

16

14

13

12

20

18

13

17

14

15

12

19

16

19

14

17

12

18

15

17

14

17

11

17

23

19

20

11

14

17

17

16

14

17

25

20

18

16

13

15

16

13

13

15

17

14

21

9

15

19

10

11

10

18

17

10

12

18

14

14

15

12

17

14

20

17

14

17

10

15

12

13

18

14

16

13

17

19

16

10

14

14

14

11

15

18

15

22

20

16

15

16

13

14

16

21

22

18

14

10

19

15

10

14

14

14

12

16

16

18

17

17

13

18

17

20

12

11

16

13

16

9

12

16

13

9

23

9

18

19

12

13

13

21

16

11

11

17

16

10

20

18

14

13

22

18

9

19

14

17

13

10

15

14

17

13

10

19

18

15

18

14

17

17

17

13

11

17

15

17

17

19

16

13

15

15

22

14

15

13

17

13

18

19

13

15

21

22

19

20

16

14

13

9

12

15

19

14

12

13

10

16

16

18

17

15

17

17

12

11

14

17

18

15

14

16

16

14

14

14

14

11

13

18

13

10

9

12

16

13

16

16

13

12

18

11

19

14

14

16

11

18

13

14

11

17

14

18

15

18

16

14

16

16

17

22

15

11

17

16

17

16

17

16

10

14

10

14

14

13

17

13

16

13

18

17

14

17

14

18

13

16

16

16

17

15

11

14

11

14

16

14

17

14

14

19

16

11

13

18

16

15

17

11

13

18

21

18

14

19

15

12

12

16

16

14

19

17

17

20

20

14

27

12

17

8

15

17

15

19

18

19

18

17

12

14

19

18

8

10

12

14

14

20

13

14

11

10

18

19

9

9

19

16

18

12

14

14

9

10

10

18

19

13

9

11

13

15

16

19

18

11

10

16

22

18

12

18

15

14

14

10

13

18

15

13

11

12

9

16

16

10

15

19

13

14

14

17

19

16

14

17

14

14

18

20

18

15

15

13

22

16

10

16

9

11

16

15

18

15

16

19

15

20

15

18

15

17

13

16

13

13

12

16

13

11

11

14

15

16

20

17

13

10

18

15

16

17

15

18

18

16

15

12

19

13

16

17

12

12

14

16

12

16

15

21

10

11

16

17

14

12

11

14

15

17

16

16

17

17

15

20

13

14

20

14

13

16

17

20

14

17

12

11

17

14

20

12

16

16

13

15

16

11

10

9

10

15

13

18

18

12

11

13

22

11

10

15

15

13

10

14

14

12

15

12

15

16

18

20

13

13

19

9

17

13

13

15

15

15

12

16

17

14

21

15

15

17

17

14

16

12

19

14

14

14

15

9

16

23

19

11

17

21

16

21

19

13

15

17

13

14

13

11

17

15

13

12

13

16

14

12

22

16

11

26

11

12

15

14

20

14

14

22

11

16

15

18

15

16

19

9

13

12

18

19

12

18

12

14

15

22

13

15

9

21

16

16

13

13

13

17

9

9

12

12

18

16

13

18

13

19

18

14

24

16

17

17

12

18

11

14

12

18

18

16

21

15

13

19

21

10

12

14

10

16

12

13

15

13

13

14

15

10

13

16

9

14

19

10

11

12

16

15

15

16

18

16

14

13

13

13

12

16

15

18

13

14

17

16

14

15

11

12

16

11

19

11

15

11

11

11

16

13

20

15

11

16

14

18

10

14

12

11

15

12

17

17

14

10

12

14

17

18

18

20

17

16

19

13

17

8

13

12

17

14

15

11

10

19

13

15

15

24

10

16

19

15

15

10

13

14

14

12

12

14

14

19

16

16

15

13

20

12

21

16

13

21

19

20

10

19

15

13

11

16

12

16

16

20

14

17

14

16

18

15

14

18

13

18

14

14

18

15

15

16

13

19

15

22

17

18

18

11

14

15

19

17

9

15

15

15

18

18

20

15

13

16

14

19

14

12

22

11

13

18

18

16

13

20

15

13

15

19

12

15

12

17

11

17

15

17

13

11

16

11

13

13

15

15

8

8

15

16

11

13

14

22

17

23

10

17

19

20

11

15

9

20

9

16

16

12

24

21

15

19

18

18

11

16

17

17

10

12

17

13

10

16

14

13

16

16

13

16

11

18

14

14

14

16

18

17

15

12

15

15

10

20

19

16

15

18

17

17

19

18

17

16

12

15

15

15

14

14

10

13

22

16

13

16

16

12

17

12

21

18

14

11

25

15

19

19

15

9

9

17

14

10

14

14

14

13

14

12

16

10

16

18

13

11

17

14

12

11

18

12

12

20

16

19

13

15

13

17

14

13

10

16

17

16

13

14

15

12

12

18

9

15

16

17

13

13

13

12

10

18

10

18

22

16

11

11

15

13

15

13

15

16

13

16

11

22

15

13

11

18

16

17

15

9

13

9

9

11

15

19

13

18

16

16

12

10

15

10

17

16

21

21

15

12

14

14

17

13

13

22

16

15

17

17

18

19

10

16

13

17

12

14

20

10

9

17

12

13

13

15

18

18

13

15

20

14

15

15

10

19

9

14

13

15

17

17

10

17

15

13

14

21

13

13

10

11

15

9

22

15

11

14

19

18

11

16

10

10

15

13

23

16

24

15

12

16

12

11

14

16

21

15

15

16

15

12

23

14

15

18

18

12

10

15

15

11

9

19

16

10

11

14

21

16

16

16

17

18

15

31

9

16

21

13

23

11

18

18

17

16

17

16

19

12

22

16

21

10

15

16

14

11

14

19

9

13

26

13

11

15

10

14

14

13

12

11

13

13

15

16

16

11

17

16

13

11

16

14

13

17

21

13

17

14

15

15

13

12

14

15

18

14

11

9

8

15

22

17

15

14

18

12

13

17

11

17

12

15

14

15

20

13

13

11

18

16

16

14

18

8

16

11

19

16

12

15

13

17

18

17

15

17

16

18

14

15

9

10

12

16

14

14

15

14

16

15

16

20

11

15

17

18

18

15

21

16

17

17

14

18

18

17

11

15

18

16

13

15

12

13

10

16

14

10

16

12

14

14

14

11

17

13

17

14

22

16

11

11

22

15

22

20

15

17

19

16

16

16

16

15

16

11

18

8

16

20

14

13

11

15

15

16

12

14

13

15

17

18

14

15

13

11

21

20

10

14

17

15

12

19

19

12

13

16

10

16

20

14

13

14

17

10

10

16

19

18

16

13

9

17

14

16

17

13

16

16

16

19

13

18

18

14

14

17

15

15

10

9

12

18

13

11

16

15

13

13

13

17

14

14

10

16

19

19

19

14

13

14

24

10

9

10

14

19

12

16

13

18

13

15

10

14

20

14

15

15

14

14

14

13

15

19

14

12

10

12

16

12

12

13

12

14

13

20

14

9

16

16

10

17

16

14

14

14

12

15

11

11

11

18

11

12

18

18

17

17

10

16

11

11

20

20

12

13

12

15

13

17

14

22

17

23

14

16

15

17

19

12

17

11

16

23

12

18

16

17

10

11

11

14

14

15

12

27

14

11

14

15

11

21

12

17

19

12

16

16

15

13

15

15

13

16

14

13

12

14

17

18

15

15

13

12

18

14

18

16

14

15

21

13

13

21

11

11

13

21

11

16

19

23

15

17

15

18

14

14

19

10

17

13

15

20

9

14

20

14

19

14

17

17

19

18

14

14

15

20

16

16

16

14

10

16

17

14

16

10

17

15

19

15

17

17

15

17

15

13

15

21

17

13

16

14

17

17

18

16

12

17

18

17

14

21

11

20

19

10

17

13

16

12

14

13

13

15

18

18

14

14

17

16

14

15

19

14

19

14

15

15

16

12

17

16

19

15

22

16

17

14

19

17

10

20

10

18

21

18

16

17

16

20

15

18

12

16

19

13

10

13

13

14

12

15

18

12

17

13

13

14

15

11

16

11

17

15

18

16

15

9

15

18

24

15

15

17

18

11

11

11

10

19

9

17

19

10

17

17

20

21

20

18

16

18

15

15

12

14

16

14

13

13

15

12

21

10

15

15

20

11

11

19

11

15

16

10

19

20

14

15

12

9

9

9

23

14

16

10

17

15

17

10

9

17

25

13

20

26

19

14

19

19

19

14

16

13

16

15

15

15

14

18

18

11

20

14

14

16

15

16

18

15

14

17

18

12

16

22

19

12

17

12

13

10

12

12

8

8

12

14

14

17

11

13

13

12

16

14

9

26

19

15

16

14

22

14

13

17

17

22

10

14

20

16

11

15

16

16

10

15

15

16

17

14

17

13

24

18

16

12

12

12

18

14

17

14

14

19

15

13

13

16

15

13

15

15

12

13

19

15

23

16

17

15

19

22

13

16

18

27

16

12

18

23

17

15

13

13

13

13

15

12

16

16

16

17

12

19

17

13

11

14

18

17

13

10

16

18

21

16

14

15

12

12

19

18

12

15

11

13

17

18

14

16

16

12

22

15

14

15

18

13

15

12

18

17

15

16

16

14

15

16

16

19

20

13

14

15

12

21

20

16

13

16

10

16

21

11

21

26

18

9

20

13

15

14

20

16

17

17

19

11

12

16

11

18

11

15

15

12

17

16

13

15

16

12

11

12

18

12

13

10

12

16

16

20

18

25

18

17

14

12

19

18

17

19

17

18

20

17

14

14

16

16

15

19

15

27

22

17

16

18

16

15

18

15

13

16

13

12

18

20

18

18

15

12

10

15

16

41

18

14

16

12

17

17

14

13

16

12

11

13

11

9

15

10

15

14

16

18

15

15

22

21

16

15

14

13

11

14

17

17

12

18

18

24

15

14

13

17

15

18

17

17

13

15

19

18

14

16

15

14

18

11

14

11

20

13

18

16

12

15

10

10

16

16

22

15

18

17

17

11

12

14

15

16

13

19

22

20

19

16

16

13

18

17

16

17

16

10

13

16

18

15

16

15

14

15

16

15

16

15

15

11

15

17

14

8

15

16

14

18

15

14

16

17

19

16

16

13

13

12

17

10

19

18

10

15

14

13

21

17

16

10

17

13

11

20

15

18

15

11

22

15

16

14

21

13

12

25

24

10

14

18

19

16

12

14

10

20

20

18

15

11

17

15

15

13

10

13

15

15

16

19

19

20

12

13

18

16

22

14

10

10

20

14

16

12

14

19

13

13

22

18

16

11

14

9

19

19

11

17

19

16

16

22

15

16

18

21

16

15

16

21

13

18

18

18

20

18

14

12

10

9

19

12

10

15

10

15

14

16

10

10

15

16

16

14

17

17

12

10

16

18

11

17

17

11

15

20

11

13

11

11

15

17

18

27

16

16

12

13

19

13

17

16

17

10

14

15

19

14

20

12

18

13

16

28

18

18

12

14

20

9

10

18

13

17

13

20

20

13

15

13

15

18

19

17

19

21

16

10

15

15

17

13

16

13

14

16

14

21

23

17

13

17

16

14

21

20

13

16

15

16

10

9

15

12

13

11

12

14

16

13

14

12

16

23

15

10

18

23

13

14

11

9

11

10

20

17

17

13

18

13

19

12

10

12

13

13

18

18

16

15

16

22

17

10

20

17

18

17

18

24

21

16

16

18

10

17

12

14

12

21

15

14

23

18

11

14

13

17

15

16

15

13

17

23

18

19

14

13

18

10

10

17

17

15

18

19

16

17

12

15

11

12

10

14

9

10

12

12

21

13

16

9

22

17

13

15

23

9

15

17

16

11

12

19

12

10

16

18

13

16

12

15

17

10

25

18

22

17

17

12

16

10

17

8

11

13

17

10

10

26

12

16

15

13

16

15

19

15

12

9

17

17

10

15

14

11

10

19

17

18

12

15

14

20

15

14

19

17

16

18

14

19

16

18

17

18

22

9

17

16

20

15

9

14

14

14

10

13

16

15

16

19

21

11

15

21

11

17

19

23

12

12

17

18

15

14

12

19

19

9

17

25

13

13

11

13

14

11

17

15

19

18

26

19

19

16

18

15

18

12

16

11

17

16

23

14

29

16

16

14

14

12

10

16

20

9

11

14

12

18

16

13

10

9

19

14

13

14

13

15

21

13

18

15

9

9

9

16

19

16

17

20

16

11

14

11

19

18

17

11

20

11

13

17

16

12

18

15

11

17

7

5

6

5

7

7

6

11

10

7

8

7

8

7

9

8

7

6

10

11

9

11

7

9

10

7

7

9

7

11

11

9

8

10

7

10

14

10

10

8

13

8

13

12

8

15

10

11

11

16

12

10

10

9

12

13

11

10

12

9

11

12

9

7

12

11

9

11

9

12

14

13

12

9

14

7

15

13

18

12

16

15

13

8

9

11

11

9

10

11

12

11

11

11

14

15

13

14

10

13

12

12

13

12

15

13

11

9

14

15

16

11

13

16

10

14

10

10

15

19

13

14

15

13

14

14

15

17

14

14

13

12

15

14

15

12

11

15

11

15

13

13

16

11

12

14

12

15

10

13

8

16

13

10

14

20

9

10

10

13

12

13

15

13

10

16

13

16

13

11

12

14

10

15

15

13

11

12

15

15

8

9

13

18

12

10

16

9

21

12

9

10

15

15

15

16

11

13

15

15

12

9

15

11

14

20

16

17

14

10

14

15

15

10

13

11

16

14

15

10

13

10

12

15

11

11

16

11

14

14

14

12

13

11

16

9

12

16

15

10

10

15

15

11

13

13

13

15

13

13

11

10

13

16

13

17

10

11

17

15

17

14

11

9

14

14

16

10

16

14

15

11

15

9

14

16

13

16

10

13

20

10

11

13

12

11

12

14

13

10

12

12

16

11

11

20

17

12

11

13

18

11

13

10

14

11

15

12

16

12

11

12

11

11

11

8

8

19

17

10

11

14

18

18

10

13

13

12

14

16

13

13

13

16

10

13

14

16

12

16

11

14

18

12

10

17

19

13

14

10

13

11

11

16

12

14

10

14

14

14

13

13

10

13

9

11

10

12

11

11

15

12

13

14

11

11

15

10

12

11

17

17

18

13

13

17

13

15

16

13

10

10

13

13

16

19

11

10

13

13

13

11

10

15

16

11

12

16

7

17

15

15

11

13

12

15

14

12

17

13

15

16

16

16

15

12

16

16

15

16

13

14

13

13

11

12

15

11

9

15

14

11

11

20

9

14

15

12

20

14

12

13

10

15

11

15

10

11

14

11

17

12

17

15

9

10

11

11

14

14

14

10

13

8

10

16

14

11

15

14

18

13

14

9

18

14

14

8

13

12

15

14

14

16

13

12

16

18

12

13

14

14

11

13

15

11

11

15

10

10

13

13

14

17

13

16

11

15

14

14

14

10

16

14

14

11

12

11

14

11

16

12

11

17

19

13

14

16

13

9

13

14

9

14

10

14

14

14

13

14

14

12

15

17

16

10

13

15

10

17

14

11

12

17

10

17

12

14

14

14

14

11

9

11

14

16

17

17

11

11

9

16

13

11

12

16

15

15

12

13

15

13

16

16

18

13

15

20

16

12

16

19

17

16

16

14

14

14

15

8

17

15

17

16

12

13

15

14

10

16

14

16

15

13

14

12

18

14

14

11

13

13

10

15

11

15

13

8

12

20

18

12

12

15

14

10

14

18

14

10

12

17

11

12

9

12

14

17

14

11

14

16

15

14

13

12

11

15

14

9

17

13

13

10

14

14

11

13

10

14

11

16

13

17

12

12

12

11

12

15

13

13

11

12

16

12

16

13

16

15

13

15

19

13

15

15

16

14

16

14

15

12

13

15

14

11

19

13

12

16

16

16

15

12

11

12

12

16

14

17

11

11

15

16

12

16

12

13

11

13

12

12

13

17

15

11

20

16

14

11

16

13

17

14

13

16

14

21

14

19

13

15

11

12

13

14

13

13

16

8

13

11

19

21

10

12

16

10

11

15

10

14

15

13

10

16

13

14

7

9

11

12

18

14

16

12

8

14

11

9

9

11

14

13

16

14

16

14

13

12

13

12

13

18

14

15

13

17

16

23

12

7

13

15

11

15

13

15

13

15

14

12

14

16

12

13

18

14

12

10

16

14

13

15

12

16

14

17

9

11

11

15

12

15

9

12

15

11

14

13

11

12

19

10

16

14

13

12

11

12

19

13

18

14

12

15

20

14

13

17

14

13

15

8

10

8

13

14

16

10

22

14

15

13

10

10

12

14

14

19

16

19

12

10

15

14

12

10

18

13

16

8

12

13

10

13

11

9

15

10

10

12

15

16

13

10

17

11

16

13

19

9

14

13

13

16

12

18

11

11

15

10

14

10

17

11

15

18

16

17

14

12

14

15

18

18

15

13

15

9

12

10

10

11

11

15

17

15

15

10

9

14

13

16

12

10

11

11

11

15

13

15

18

14

13

15

14

13

13

14

12

17

14

14

18

13

14

15

13

12

15

15

14

16

11

18

11

16

16

15

17

17

11

18

13

11

14

11

14

16

13

18

15

10

14

15

15

15

12

17

11

16

15

14

14

18

16

11

14

14

12

17

11

9

14

15

12

22

14

15

11

15

12

13

14

12

15

9

15

14

16

16

8

16

13

14

17

18

13

15

16

14

11

18

19

16

12

17

15

11

21

15

19

17

13

18

17

15

12

15

14

12

14

13

12

12

13

17

17

15

12

11

16

18

15

14

10

16

13

26

16

22

11

15

14

12

19

18

18

13

17

17

12

18

15

16

15

16

14

11

16

16

13

18

15

15

16

14

11

15

16

20

16

14

18

14

19

14

14

13

17

12

16

16

20

18

16

19

11

13

14

12

15

11

15

17

19

15

16

18

15

17

18

12

11

11

24

15

15

15

11

12

16

20

12

16

17

14

14

13

15

14

10

16

15

15

12

16

11

18

19

12

14

17

14

17

19

16

10

11

14

18

16

16

18

22

11

8

17

10

15

13

14

15

14

14

16

14

17

14

19

13

14

13

13

13

15

18

17

13

10

13

17

16

15

12

15

12

17

11

14

9

11

17

13

11

13

13

13

16

14

14

17

17

14

12

15

18

17

15

17

12

11

14

13

11

16

19

18

15

14

10

13

11

10

19

16

11

15

16

11

14

16

13

15

19

14

17

21

11

13

10

11

16

17

15

12

10

13

19

13

17

27

15

17

16

20

11

15

11

15

17

13

18

13

18

17

12

11

11

12

15

15

12

16

13

16

11

15

17

14

17

17

13

13

10

18

11

13

15

20

18

20

16

16

11

14

17

12

13

18

14

14

17

17

19

16

11

19

15

15

11

21

16

18

13

12

15

13

15

15

18

16

14

11

17

15

19

12

12

8

11

13

15

11

17

12

11

15

11

18

12

16

13

19

16

12

14

15

19

18

13

12

12

17

16

11

11

14

15

18

14

10

17

12

16

17

20

13

17

14

17

16

16

14

16

11

13

13

15

15

15

12

14

16

16

13

14

17

14

11

17

14

14

11

12

13

15

9

16

13

18

13

20

15

13

19

11

13

15

15

15

20

15

13

16

14

18

14

17

17

23

14

15

10

10

17

19

15

12

12

14

13

15

12

11

16

15

12

11

16

14

15

11

17

13

22

12

10

14

15

18

17

13

17

16

23

9

10

17

12

14

14

11

14

15

17

14

13

12

14

14

13

18

15

17

16

16

17

18

12

15

14

12

19

19

12

16

20

14

19

15

13

16

17

13

16

14

9

21

16

16

11

10

17

13

16

15

17

20

13

15

16

16

19

10

12

14

14

16

11

11

11

15

13

16

14

15

14

11

10

16

13

14

17

12

21

17

20

17

13

11

17

16

19

11

16

18

13

17

11

12

13

13

19

12

15

21

13

16

16

11

13

11

15

16

15

11

16

12

14

10

21

13

11

19

16

10

13

14

17

17

15

12

16

15

12

12

17

14

17

18

10

21

14

12

18

16

16

19

22

14

15

14

8

8

12

15

18

14

18

14

13

17

17

13

19

16

15

14

17

15

11

10

15

15

10

18

12

15

16

18

16

21

11

14

13

16

10

20

11

16

19

12

18

14

16

19

15

15

22

14

12

14

10

17

12

15

21

14

13

16

14

14

16

16

12

17

13

17

20

15

14

13

9

12

19

14

15

12

8

11

12

17

15

17

13

13

17

14

14

14

16

14

14

11

11

14

13

17

13

14

12

19

15

22

16

23

21

19

19

16

17

21

14

15

18

14

13

18

14

16

16

12

17

18

15

17

15

14

16

15

13

11

18

13

14

12

21

13

12

15

13

13

16

17

11

17

15

10

17

12

18

19

16

13

18

8

17

13

14

11

11

12

15

13

11

13

16

15

14

17

14

15

24

19

19

11

15

11

10

19

14

12

17

13

15

11

12

14

16

12

14

16

18

15

17

13

12

11

19

15

11

14

10

16

13

18

16

12

16

16

14

14

16

11

14

13

15

17

16

15

11

16

18

18

15

16

15

11

11

13

12

18

19

17

18

14

15

11

15

12

15

16

18

19

17

17

11

13

19

16

15

12

12

12

16

13

15

15

15

14

17

9

8

11

23

12

12

12

17

17

15

17

18

14

20

17

15

15

14

18

11

13

10

16

15

18

17

14

16

18

16

11

15

14

15

11

16

21

20

11

17

17

12

12

18

17

18

15

17

17

17

16

12

13

17

11

18

12

15

15

15

19

12

19

15

17

14

16

17

11

12

12

12

16

10

18

12

13

15

14

16

14

13

13

15

10

14

12

18

20

13

17

16

14

17

18

17

12

13

8

15

15

18

12

17

18

18

18

18

14

13

12

16

13

17

11

14

17

14

13

15

13

18

14

13

13

8

13

16

18

12

13

17

14

19

11

12

14

15

9

11

12

17

19

10

14

13

17

15

16

17

16

17

8

18

14

18

15

20

16

17

16

16

16

17

16

15

14

11

15

15

12

16

16

16

15

15

15

17

14

18

14

16

18

18

11

14

14

16

14

12

15

10

14

16

11

12

14

16

14

19

17

12

16

13

20

11

9

14

13

20

13

17

17

18

15

14

13

11

11

17

16

14

12

17

14

16

15

15

16

13

18

16

14

18

16

10

17

18

17

16

14

14

16

15

11

20

14

19

12

16

12

19

13

15

15

11

19

12

15

16

15

15

18

15

19

15

8

18

19

17

17

20

21

10

11

13

12

13

14

13

19

14

18

14

21

18

12

15

12

13

8

16

15

14

15

15

18

16

16

8

15

13

8

20

15

15

16

19

11

12

15

13

16

12

14

12

19

19

13

16

17

12

20

16

20

16

14

19

11

16

17

12

16

14

18

11

16

12

16

13

15

13

14

16

17

17

15

12

10

17

14

17

16

15

8

16

14

17

20

16

12

14

8

14

13

14

17

16

15

12

13

12

16

12

15

17

15

15

14

12

17

16

14

12

13

11

14

12

13

17

14

18

16

22

17

16

15

16

17

14

14

18

18

11

17

18

13

18

17

19

11

19

17

15

13

10

11

10

20

15

15

12

21

19

17

18

10

15

19

15

15

12

8

19

15

10

16

17

19

14

17

8

18

9

19

19

18

15

15

12

18

15

17

14

21

12

11

12

13

15

13

11

16

10

16

16

13

17

16

12

13

13

14

14

14

17

9

14

10

14

19

17

17

17

16

12

13

14

12

10

12

20

13

16

16

14

17

15

12

14

12

15

13

15

14

17

13

14

14

18

16

13

14

19

13

18

15

17

17

17

13

22

20

10

15

15

12

18

15

18

20

16

15

11

16

15

17

20

16

14

17

17

19

13

21

19

14

13

14

8

12

15

14

14

13

18

12

11

17

15

17

18

12

16

13

8

10

14

11

12

16

21

12

16

15

11

13

12

19

19

17

17

12

15

14

18

11

12

22

11

14

11

19

14

17

14

8

15

18

18

17

18

14

10

14

14

20

13

21

17

12

14

19

15

21

15

17

11

20

18

17

15

15

12

15

17

16

13

15

18

15

9

21

11

15

12

15

13

18

18

15

14

16

21

17

14

14

17

16

18

15

13

12

17

11

19

13

9

15

13

11

16

9

13

13

16

18

17

16

15

14

15

12

11

10

17

10

11

13

9

9

16

20

17

19

15

17

14

21

19

15

17

11

16

15

12

19

14

18

14

15

12

14

14

20

14

18

18

17

12

14

16

17

10

11

12

12

11

15

15

16

14

12

18

15

14

12

14

17

13

14

11

17

16

15

13

22

13

12

15

18

12

17

16

15

18

13

16

17

17

15

13

8

15

16

13

13

14

16

19

9

11

10

14

18

19

12

11

16

13

13

12

10

14

13

19

18

19

8

20

15

17

12

17

11

16

14

13

16

14

22

19

15

14

13

16

14

18

15

8

12

14

20

12

12

23

13

15

18

18

15

15

16

12

17

17

19

11

16

13

15

11

13

17

15

8

16

15

15

15

14

17

16

14

13

9

15

18

15

15

19

14

15

13

12

18

18

14

14

16

21

14

14

15

16

15

15

15

16

15

16

16

18

18

16

20

22

16

11

15

11

14

17

12

19

16

15

18

21

15

16

14

8

20

14

19

15

13

13

15

15

17

20

12

10

12

15

15

11

12

13

20

17

18

20

17

11

11

11

15

15

13

19

13

16

15

18

16

14

13

14

9

14

18

15

15

9

19

13

17

14

24

15

18

19

19

13

15

11

12

12

10

12

13

17

16

13

14

14

17

17

14

16

16

19

15

13

16

16

22

15

14

16

16

14

13

16

15

19

17

18

15

11

16

14

16

14

10

17

17

9

19

15

18

16

13

19

15

11

16

18

19

17

19

13

18

11

14

15

14

20

17

13

17

14

19

12

18

12

8

18

17

13

18

16

14

14

15

15

12

14

11

14

15

15

15

14

11

19

16

18

12

16

10

16

19

13

13

18

16

14

16

18

22

12

17

12

15

19

27

14

14

14

16

12

14

12

15

17

12

13

15

15

15

16

13

16

11

13

18

11

11

16

14

14

13

14

18

11

16

9

15

13

14

18

14

12

15

15

14

18

17

16

20

15

13

13

13

13

14

15

14

16

18

11

14

15

13

12

18

13

21

18

17

10

15

12

9

19

13

15

14

13

13

12

16

14

11

16

11

19

11

18

17

14

12

18

17

18

15

17

14

15

16

20

14

10

19

14

11

8

17

15

15

15

16

16

11

16

14

14

17

14

21

16

20

10

17

19

14

17

17

19

17

18

11

11

14

15

13

16

19

14

13

21

14

13

9

15

12

10

13

17

10

17

11

11

10

11

15

18

14

14

13

17

9

10

12

18

16

11

16

15

16

17

15

19

10

18

18

11

11

16

14

15

18

16

16

20

12

16

13

17

15

15

15

18

18

15

9

16

15

12

8

18

17

20

18

19

19

17

17

16

17

13

19

20

15

19

17

13

13

13

10

15

15

14

17

10

14

11

16

12

14

20

12

16

15

14

19

16

12

16

14

16

14

16

14

18

17

18

13

12

14

21

12

12

16

19

18

13

16

19

17

11

15

16

17

12

16

14

13

11

10

15

13

12

19

16

13

8

9

15

15

10

15

18

16

16

15

20

15

13

19

12

9

17

14

14

16

15

16

16

16

16

17

14

13

15

16

16

14

13

15

18

13

13

16

17

12

18

14

14

11

19

12

11

14

15

11

14

17

19

22

13

18

16

14

12

15

13

18

20

17

13

11

18

11

18

13

19

14

13

13

12

13

10

12

13

9

13

15

11

17

16

17

12

15

12

18

14

16

14

16

11

17

14

14

18

14

18

19

14

16

8

17

15

20

9

15

15

10

14

13

17

16

18

16

9

14

15

17

11

19

11

13

16

15

14

15

13

17

12

16

13

17

14

17

14

13

19

11

12

21

9

17

17

17

15

18

16

19

16

11

12

17

12

14

16

19

15

19

18

20

18

19

14

15

14

11

12

13

12

14

13

11

12

12

18

10

9

16

16

17

18

14

16

12

13

19

11

15

14

15

17

16

15

15

18

18

19

17

12

20

13

15

11

10

13

19

10

10

13

13

17

12

21

13

19

12

12

13

15

17

14

14

12

14

15

13

18

12

10

9

14

13

16

19

13

13

16

8

17

20

20

13

16

17

16

12

8

16

16

14

16

13

12

20

14

11

16

18

13

16

14

13

12

20

18

13

17

14

15

12

19

16

19

14

17

12

18

15

17

14

17

11

17

23

19

20

11

14

17

17

16

14

17

25

20

18

16

13

15

16

13

13

15

17

14

21

9

15

19

10

11

10

18

17

10

12

18

14

14

15

12

17

14

20

17

14

17

10

15

12

13

18

14

16

13

17

19

16

10

14

14

14

11

15

18

15

22

20

16

15

16

13

14

16

21

22

18

14

10

19

15

10

14

14

14

12

16

16

18

17

17

13

18

17

20

12

11

16

13

16

9

12

16

13

9

23

9

18

19

12

13

13

21

16

11

11

17

16

10

20

18

14

13

22

18

9

19

14

17

13

10

15

14

17

13

10

19

18

15

18

14

17

17

17

13

11

17

15

17

17

19

16

13

15

15

22

14

15

13

17

13

18

19

13

15

21

22

19

20

16

14

13

9

12

15

19

14

12

13

10

16

16

18

17

15

17

17

12

11

14

17

18

15

14

16

16

14

14

14

14

11

13

18

13

10

9

12

16

13

16

16

13

12

18

11

19

14

14

16

11

18

13

14

11

17

14

18

15

18

16

14

16

16

17

22

15

11

17

16

17

16

17

16

10

14

10

14

14

13

17

13

16

13

18

17

14

17

14

18

13

16

16

16

17

15

11

14

11

14

16

14

17

14

14

19

16

11

13

18

16

15

17

11

13

18

21

18

14

19

15

12

12

16

16

14

19

17

17

20

20

14

27

12

17

8

15

17

15

19

18

19

18

17

12

14

19

18

8

10

12

14

14

20

13

14

11

10

18

19

9

9

19

16

18

12

14

14

9

10

10

18

19

13

9

11

13

15

16

19

18

11

10

16

22

18

12

18

15

14

14

10

13

18

15

13

11

12

9

16

16

10

15

19

13

14

14

17

19

16

14

17

14

14

18

20

18

15

15

13

22

16

10

16

9

11

16

15

18

15

16

19

15

20

15

18

15

17

13

16

13

13

12

16

13

11

11

14

15

16

20

17

13

10

18

15

16

17

15

18

18

16

15

12

19

13

16

17

12

12

14

16

12

16

15

21

10

11

16

17

14

12

11

14

15

17

16

16

17

17

15

20

13

14

20

14

13

16

17

20

14

17

12

11

17

14

20

12

16

16

13

15

16

11

10

9

10

15

13

18

18

12

11

13

22

11

10

15

15

13

10

14

14

12

15

12

15

16

18

20

13

13

19

9

17

13

13

15

15

15

12

16

17

14

21

15

15

17

17

14

16

12

19

14

14

14

15

9

16

23

19

11

17

21

16

21

19

13

15

17

13

14

13

11

17

15

13

12

13

16

14

12

22

16

11

26

11

12

15

14

20

14

14

22

11

16

15

18

15

16

19

9

13

12

18

19

12

18

12

14

15

22

13

15

9

21

16

16

13

13

13

17

9

9

12

12

18

16

13

18

13

19

18

14

24

16

17

17

12

18

11

14

12

18

18

16

21

15

13

19

21

10

12

14

10

16

12

13

15

13

13

14

15

10

13

16

9

14

19

10

11

12

16

15

15

16

18

16

14

13

13

13

12

16

15

18

13

14

17

16

14

15

11

12

16

11

19

11

15

11

11

11

16

13

20

15

11

16

14

18

10

14

12

11

15

12

17

17

14

10

12

14

17

18

18

20

17

16

19

13

17

8

13

12

17

14

15

11

10

19

13

15

15

24

10

16

19

15

15

10

13

14

14

12

12

14

14

19

16

16

15

13

20

12

21

16

13

21

19

20

10

19

15

13

11

16

12

16

16

20

14

17

14

16

18

15

14

18

13

18

14

14

18

15

15

16

13

19

15

22

17

18

18

11

14

15

19

17

9

15

15

15

18

18

20

15

13

16

14

19

14

12

22

11

13

18

18

16

13

20

15

13

15

19

12

15

12

17

11

17

15

17

13

11

16

11

13

13

15

15

8

8

15

16

11

13

14

22

17

23

10

17

19

20

11

15

9

20

9

16

16

12

24

21

15

19

18

18

11

16

17

17

10

12

17

13

10

16

14

13

16

16

13

16

11

18

14

14

14

16

18

17

15

12

15

15

10

20

19

16

15

18

17

17

19

18

17

16

12

15

15

15

14

14

10

13

22

16

13

16

16

12

17

12

21

18

14

11

25

15

19

19

15

9

9

17

14

10

14

14

14

13

14

12

16

10

16

18

13

11

17

14

12

11

18

12

12

20

16

19

13

15

13

17

14

13

10

16

17

16

13

14

15

12

12

18

9

15

16

17

13

13

13

12

10

18

10

18

22

16

11

11

15

13

15

13

15

16

13

16

11

22

15

13

11

18

16

17

15

9

13

9

9

11

15

19

13

18

16

16

12

10

15

10

17

16

21

21

15

12

14

14

17

13

13

22

16

15

17

17

18

19

10

16

13

17

12

14

20

10

9

17

12

13

13

15

18

18

13

15

20

14

15

15

10

19

9

14

13

15

17

17

10

17

15

13

14

21

13

13

10

11

15

9

22

15

11

14

19

18

11

16

10

10

15

13

23

16

24

15

12

16

12

11

14

16

21

15

15

16

15

12

23

14

15

18

18

12

10

15

15

11

9

19

16

10

11

14

21

16

16

16

17

18

15

31

9

16

21

13

23

11

18

18

17

16

17

16

19

12

22

16

21

10

15

16

14

11

14

19

9

13

26

13

11

15

10

14

14

13

12

11

13

13

15

16

16

11

17

16

13

11

16

14

13

17

21

13

17

14

15

15

13

12

14

15

18

14

11

9

8

15

22

17

15

14

18

12

13

17

11

17

12

15

14

15

20

13

13

11

18

16

16

14

18

8

16

11

19

16

12

15

13

17

18

17

15

17

16

18

14

15

9

10

12

16

14

14

15

14

16

15

16

20

11

15

17

18

18

15

21

16

17

17

14

18

18

17

11

15

18

16

13

15

12

13

10

16

14

10

16

12

14

14

14

11

17

13

17

14

22

16

11

11

22

15

22

20

15

17

19

16

16

16

16

15

16

11

18

8

16

20

14

13

11

15

15

16

12

14

13

15

17

18

14

15

13

11

21

20

10

14

17

15

12

19

19

12

13

16

10

16

20

14

13

14

17

10

10

16

19

18

16

13

9

17

14

16

17

13

16

16

16

19

13

18

18

14

14

17

15

15

10

9

12

18

13

11

16

15

13

13

13

17

14

14

10

16

19

19

19

14

13

14

24

10

9

10

14

19

12

16

13

18

13

15

10

14

20

14

15

15

14

14

14

13

15

19

14

12

10

12

16

12

12

13

12

14

13

20

14

9

16

16

10

17

16

14

14

14

12

15

11

11

11

18

11

12

18

18

17

17

10

16

11

11

20

20

12

13

12

15

13

17

14

22

17

23

14

16

15

17

19

12

17

11

16

23

12

18

16

17

10

11

11

14

14

15

12

27

14

11

14

15

11

21

12

17

19

12

16

16

15

13

15

15

13

16

14

13

12

14

17

18

15

15

13

12

18

14

18

16

14

15

21

13

13

21

11

11

13

21

11

16

19

23

15

17

15

18

14

14

19

10

17

13

15

20

9

14

20

14

19

14

17

17

19

18

14

14

15

20

16

16

16

14

10

16

17

14

16

10

17

15

19

15

17

17

15

17

15

13

15

21

17

13

16

14

17

17

18

16

12

17

18

17

14

21

11

20

19

10

17

13

16

12

14

13

13

15

18

18

14

14

17

16

14

15

19

14

19

14

15

15

16

12

17

16

19

15

22

16

17

14

19

17

10

20

10

18

21

18

16

17

16

20

15

18

12

16

19

13

10

13

13

14

12

15

18

12

17

13

13

14

15

11

16

11

17

15

18

16

15

9

15

18

24

15

15

17

18

11

11

11

10

19

9

17

19

10

17

17

20

21

20

18

16

18

15

15

12

14

16

14

13

13

15

12

21

10

15

15

20

11

11

19

11

15

16

10

19

20

14

15

12

9

9

9

23

14

16

10

17

15

17

10

9

17

25

13

20

26

19

14

19

19

19

14

16

13

16

15

15

15

14

18

18

11

20

14

14

16

15

16

18

15

14

17

18

12

16

22

19

12

17

12

13

10

12

12

8

8

12

14

14

17

11

13

13

12

16

14

9

26

19

15

16

14

22

14

13

17

17

22

10

14

20

16

11

15

16

16

10

15

15

16

17

14

17

13

24

18

16

12

12

12

18

14

17

14

14

19

15

13

13

16

15

13

15

15

12

13

19

15

23

16

17

15

19

22

13

16

18

27

16

12

18

23

17

15

13

13

13

13

15

12

16

16

16

17

12

19

17

13

11

14

18

17

13

10

16

18

21

16

14

15

12

12

19

18

12

15

11

13

17

18

14

16

16

12

22

15

14

15

18

13

15

12

18

17

15

16

16

14

15

16

16

19

20

13

14

15

12

21

20

16

13

16

10

16

21

11

21

26

18

9

20

13

15

14

20

16

17

17

19

11

12

16

11

18

11

15

15

12

17

16

13

15

16

12

11

12

18

12

13

10

12

16

16

20

18

25

18

17

14

12

19

18

17

19

17

18

20

17

14

14

16

16

15

19

15

27

22

17

16

18

16

15

18

15

13

16

13

12

18

20

18

18

15

12

10

15

16

41

18

14

16

12

17

17

14

13

16

12

11

13

11

9

15

10

15

14

16

18

15

15

22

21

16

15

14

13

11

14

17

17

12

18

18

24

15

14

13

17

15

18

17

17

13

15

19

18

14

16

15

14

18

11

14

11

20

13

18

16

12

15

10

10

16

16

22

15

18

17

17

11

12

14

15

16

13

19

22

20

19

16

16

13

18

17

16

17

16

10

13

16

18

15

16

15

14

15

16

15

16

15

15

11

15

17

14

8

15

16

14

18

15

14

16

17

19

16

16

13

13

12

17

10

19

18

10

15

14

13

21

17

16

10

17

13

11

20

15

18

15

11

22

15

16

14

21

13

12

25

24

10

14

18

19

16

12

14

10

20

20

18

15

11

17

15

15

13

10

13

15

15

16

19

19

20

12

13

18

16

22

14

10

10

20

14

16

12

14

19

13

13

22

18

16

11

14

9

19

19

11

17

19

16

16

22

15

16

18

21

16

15

16

21

13

18

18

18

20

18

14

12

10

9

19

12

10

15

10

15

14

16

10

10

15

16

16

14

17

17

12

10

16

18

11

17

17

11

15

20

11

13

11

11

15

17

18

27

16

16

12

13

19

13

17

16

17

10

14

15

19

14

20

12

18

13

16

28

18

18

12

14

20

9

10

18

13

17

13

20

20

13

15

13

15

18

19

17

19

21

16

10

15

15

17

13

16

13

14

16

14

21

23

17

13

17

16

14

21

20

13

16

15

16

10

9

15

12

13

11

12

14

16

13

14

12

16

23

15

10

18

23

13

14

11

9

11

10

20

17

17

13

18

13

19

12

10

12

13

13

18

18

16

15

16

22

17

10

20

17

18

17

18

24

21

16

16

18

10

17

12

14

12

21

15

14

23

18

11

14

13

17

15

16

15

13

17

23

18

19

14

13

18

10

10

17

17

15

18

19

16

17

12

15

11

12

10

14

9

10

12

12

21

13

16

9

22

17

13

15

23

9

15

17

16

11

12

19

12

10

16

18

13

16

12

15

17

10

25

18

22

17

17

12

16

10

17

8

11

13

17

10

10

26

12

16

15

13

16

15

19

15

12

9

17

17

10

15

14

11

10

19

17

18

12

15

14

20

15

14

19

17

16

18

14

19

16

18

17

18

22

9

17

16

20

15

9

14

14

14

10

13

16

15

16

19

21

11

15

21

11

17

19

23

12

12

17

18

15

14

12

19

19

9

17

25

13

13

11

13

14

11

17

15

19

18

26

19

19

16

18

15

18

12

16

11

17

16

23

14

29

16

16

14

14

12

10

16

20

9

11

14

12

18

16

13

10

9

19

14

13

14

13

15

21

13

18

15

9

9

9

16

19

16

17

20

16

11

14

11

19

18

17

11

20

11

13

17

16

12

18

15

11

17

In [38]:
from sklearn.utils import shuffle


def create_train_valid(features,
                       labels,
                       num_words,
                       train_fraction=TRAIN_FRACTION):
    """Create training and validation features and labels."""

    # Randomly shuffle features and labels
    features, labels = shuffle(features, labels, random_state=RANDOM_STATE)

    # Decide on number of samples for training
    train_end = int(train_fraction * len(labels))

    train_features = np.array(features[:train_end])
    valid_features = np.array(features[train_end:])

    train_labels = labels[:train_end]
    valid_labels = labels[train_end:]

    # Convert to arrays
    X_train, X_valid = np.array(train_features), np.array(valid_features)

    # Using int8 for memory savings
    y_train = np.zeros((len(train_labels), num_words), dtype=np.int8)
    y_valid = np.zeros((len(valid_labels), num_words), dtype=np.int8)

    # One hot encoding of labels
    for example_index, word_index in enumerate(train_labels):
        y_train[example_index, word_index] = 1

    for example_index, word_index in enumerate(valid_labels):
        y_valid[example_index, word_index] = 1

    # Memory management
    import gc
    gc.enable()
    del features, labels, train_features, valid_features, train_labels, valid_labels
    gc.collect()

    return X_train, X_valid, y_train, y_valid

In [32]:
import os
from keras.utils import get_file

# Vectors to use
glove_vectors = '/home/jwq/.keras/datasets/glove.6B.zip'

# Download word embeddings if they are not present
if not os.path.exists(glove_vectors):
    glove_vectors = get_file('glove.6B.zip',
                             'http://nlp.stanford.edu/data/glove.6B.zip')
    os.system(f'unzip {glove_vectors}')

# Load in unzipped file
glove_vectors = '/home/jwq/.keras/datasets/glove.6B.100d.txt'
glove = np.loadtxt(glove_vectors, dtype='str', comments=None)
glove.shape
glove[0]

(400000, 101)

array(['the', '-0.038194', '-0.24487', '0.72812', '-0.39961', '0.083172',
       '0.043953', '-0.39141', '0.3344', '-0.57545', '0.087459',
       '0.28787', '-0.06731', '0.30906', '-0.26384', '-0.13231',
       '-0.20757', '0.33395', '-0.33848', '-0.31743', '-0.48336',
       '0.1464', '-0.37304', '0.34577', '0.052041', '0.44946', '-0.46971',
       '0.02628', '-0.54155', '-0.15518', '-0.14107', '-0.039722',
       '0.28277', '0.14393', '0.23464', '-0.31021', '0.086173', '0.20397',
       '0.52624', '0.17164', '-0.082378', '-0.71787', '-0.41531',
       '0.20335', '-0.12763', '0.41367', '0.55187', '0.57908', '-0.33477',
       '-0.36559', '-0.54857', '-0.062892', '0.26584', '0.30205',
       '0.99775', '-0.80481', '-3.0243', '0.01254', '-0.36942', '2.2167',
       '0.72201', '-0.24978', '0.92136', '0.034514', '0.46745', '1.1079',
       '-0.19358', '-0.074575', '0.23353', '-0.052062', '-0.22044',
       '0.057162', '-0.15806', '-0.30798', '-0.41625', '0.37972',
       '0.15006', '-0.53

In [33]:
vectors = glove[:, 1:].astype('float')
words = glove[:, 0]

del glove

vectors[100], words[100]

(array([-3.9551e-01,  5.4660e-01,  5.0315e-01, -6.3682e-01, -4.5470e-01,
         3.0889e-01, -4.9240e-02,  2.7191e-01,  3.1562e-01, -3.2879e-01,
         2.5089e-01,  1.4508e-01,  3.5136e-01, -2.2793e-01, -1.5894e-01,
        -5.1527e-01, -2.7978e-01,  3.6470e-01, -3.9425e-01,  3.3299e-01,
         4.3051e-01,  1.8300e-01,  2.5095e-01, -1.8547e-01,  3.4698e-01,
         5.5137e-02, -4.5979e-01, -8.2963e-01, -1.8523e-02, -3.6772e-01,
         4.5566e-02,  7.1052e-01, -2.2782e-02, -8.0889e-02,  2.0685e-01,
         4.9855e-01, -5.9794e-02, -8.0048e-03, -2.3823e-01, -3.3759e-01,
        -2.4201e-01, -2.3788e-01, -1.1362e-03, -4.0395e-01, -4.4859e-01,
        -3.2189e-01,  4.8405e-01, -2.7999e-02,  1.0148e-01, -9.3585e-01,
        -8.7522e-02, -3.9959e-01,  3.6545e-01,  1.3726e+00, -3.0713e-01,
        -2.5940e+00,  2.2431e-01, -4.1168e-02,  1.7765e+00,  4.0010e-01,
        -1.0996e-01,  1.4178e+00, -2.6154e-01,  1.8617e-01,  7.9328e-01,
        -1.1709e-01,  8.7541e-01,  4.3911e-01,  3.4

In [69]:
for i in range(1,10):
    print(idx_word[i])

the
a
of
.
,
and
to
network
neural


In [73]:
print(word_idx.keys())



In [74]:
word_lookup = {word: vector for word, vector in zip(words, vectors)}

embedding_matrix = np.zeros((dict_size, vectors.shape[1]))

not_found = 0


for i, word in enumerate(word_idx.keys()):
    # Look up the word embedding
    if i < dict_size:
        vector = word_lookup.get(word, None)

        # Record in matrix
        if vector is not None:
            embedding_matrix[i + 1, :] = vector
        else:
            not_found += 1

print(f'There were {not_found} words without pre-trained embeddings.')

There were 936 words without pre-trained embeddings.


In [78]:
embedding_matrix = np.zeros((dict_size, len(word_lookup['the'])))

not_found = 0

for i, word in enumerate(word_idx.keys()):
    
    if i < dict_size:
        # Look up the word embedding
        vector = word_lookup.get(word, None)

        # Record in matrix
        if vector is not None:
            embedding_matrix[i + 1, :] = vector
        else:
            not_found += 1

print(f'There were {not_found} words without pre-trained embeddings.')
embedding_matrix.shape

print(embedding_matrix[0])
print(embedding_matrix[1])
print(embedding_matrix[-1])
print(embedding_matrix[-100])


There were 936 words without pre-trained embeddings.


(6144, 100)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
[-0.038194 -0.24487   0.72812  -0.39961   0.083172  0.043953 -0.39141
  0.3344   -0.57545   0.087459  0.28787  -0.06731   0.30906  -0.26384
 -0.13231  -0.20757   0.33395  -0.33848  -0.31743  -0.48336   0.1464
 -0.37304   0.34577   0.052041  0.44946  -0.46971   0.02628  -0.54155
 -0.15518  -0.14107  -0.039722  0.28277   0.14393   0.23464  -0.31021
  0.086173  0.20397   0.52624   0.17164  -0.082378 -0.71787  -0.41531
  0.20335  -0.12763   0.41367   0.55187   0.57908  -0.33477  -0.36559
 -0.54857  -0.062892  0.26584   0.30205   0.99775  -0.80481  -3.0243
  0.01254  -0.36942   2.2167    0.72201  -0.24978   0.92136   0.034514
  0.46745   1.1079   -0.19358  -0.074575  0.23353  -0.052062 -0.2

In [81]:
# Split into training and validation
X_train, X_valid, y_train, y_valid = create_train_valid(
    features, labels, dict_size)
X_train.shape, y_train.shape

((212181, 50), (212181, 6144))

In [83]:
import sys
sys.getsizeof(y_train) / 1e9

def check_sizes(gb_min=1):
    for x in globals():
        size = sys.getsizeof(eval(x)) / 1e9
        if size > gb_min:
            print(f'Object: {x:10}\tSize: {size} GB.')

check_sizes(gb_min=1)

1.303640176

Object: y_train   	Size: 1.303640176 GB.


In [85]:
# model
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout, Embedding, Masking, Bidirectional, SimpleRNN
from keras.optimizers import Adam

from keras.utils import plot_model

def make_word_level_model(num_words,
                          embedding_matrix,
                          rnn_cells=128,
                          trainable=True,
                          rnn_layers=1,
                          bi_direc=False):
    """Make a word level recurrent neural network with option for pretrained embeddings
       and varying numbers of RNN cell layers."""

    model = Sequential()

    # Map words to an embedding
    if not trainable:
        model.add(
            Embedding(
                input_dim=num_words,
                output_dim=embedding_matrix.shape[1],
                weights=[embedding_matrix],
                trainable=False,
                mask_zero=True))
        model.add(Masking())
    else:
        model.add(
            Embedding(
                input_dim=num_words,
                output_dim=embedding_matrix.shape[1],
                weights=[embedding_matrix],
                trainable=True))

    # If want to add multiple RNN layers
    if rnn_layers > 1:
        for i in range(rnn_layers - 1):
            model.add(
                SimpleRNN(
                    rnn_cells,
                    return_sequences=True,
                    dropout=0.1,
                    recurrent_dropout=0.1))

    # Add final RNN cell layer
    if bi_direc:
        model.add(
            Bidirectional(
                SimpleRNN(
                    rnn_cells,
                    return_sequences=False,
                    dropout=0.1,
                    recurrent_dropout=0.1)))
    else:
        model.add(
            SimpleRNN(
                rnn_cells,
                return_sequences=False,
                dropout=0.1,
                recurrent_dropout=0.1))
#     model.add(Dense(128, activation='relu'))
#     # Dropout for regularization
#     model.add(Dropout(0.5))

    # Output layer
    model.add(Dense(num_words, activation='softmax'))

    # Compile the model
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'])
    return model


model = make_word_level_model(
    dict_size,
    embedding_matrix=embedding_matrix,
    rnn_cells=RNN_CELLS,
    trainable=True,
    rnn_layers=1)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 100)         614400    
_________________________________________________________________
simple_rnn_3 (SimpleRNN)     (None, 128)               29312     
_________________________________________________________________
dense_3 (Dense)              (None, 6144)              792576    
Total params: 1,436,288
Trainable params: 1,436,288
Non-trainable params: 0
_________________________________________________________________


In [90]:
!pip install pydot



In [91]:
from IPython.display import Image
model_name = 'train-embeddings-rnn-50'
model_dir = '../my_models/'

plot_model(model, to_file=f'{model_dir}{model_name}.png', show_shapes=True)

Image(f'{model_dir}{model_name}.png')

ImportError: Failed to import `pydot`. Please install `pydot`. For example with `pip install pydot`.

In [92]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

BATCH_SIZE = 2048


def make_callbacks(model_name, save=SAVE_MODEL):
    """Make list of callbacks for training"""
    callbacks = [EarlyStopping(monitor='val_loss', patience=5)]

    if save:
        callbacks.append(
            ModelCheckpoint(
                f'{model_dir}{model_name}.h5',
                save_best_only=True,
                save_weights_only=False))
    return callbacks


callbacks = make_callbacks(model_name)

In [93]:
model_name = 'train-embeddings-rnn-50-6144'
callbacks = make_callbacks(model_name)

In [94]:
model.compile(
    optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

In [96]:
!mkdir ../my_models/

In [98]:
history = model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    verbose=VERBOSE,
    epochs=EPOCHS,
    callbacks=callbacks,
    validation_data=(X_valid, y_valid))

# Evaluate

In [102]:
model_name = 'train-embeddings-rnn-50-6144'
model_dir = './my_models/'
model = load_model(f'{model_dir}{model_name}.h5')

In [103]:
def load_and_evaluate(model_name, return_model=False):
    """Load in a trained model and evaluate with log loss and accuracy"""

    model = load_model(f'{model_dir}{model_name}.h5')
    r = model.evaluate(X_valid, y_valid, batch_size=2048, verbose=1)

    valid_crossentropy = r[0]
    valid_accuracy = r[1]

    print(f'Cross Entropy: {round(valid_crossentropy, 4)}')
    print(f'Accuracy: {round(100 * valid_accuracy, 2)}%')

    if return_model:
        return model

In [104]:
model = load_and_evaluate(model_name, return_model=True)

Cross Entropy: 4.3279
Accuracy: 28.99%


In [None]:
def generate_next(model,
                    sequence,
                    ans,
                    training_length=50,
                    new_words=1,
                    diversity=1,
                    return_output=False,
                    return_idx=False,
                    n_gen=1):
    """Generate `new_words` words of output from a trained model and format into HTML."""
    """
    sequence: a tokenized sentence, e.g.[102,3,2314,3,...], with a length of training_length
    ans: the real next word
    """
    # Choose a random sequence
    seq = list(sequence)
    seed_idx = 0
    end_idx = training_length
    gen_list = []

    for n in range(n_gen):
        # Extract the seed sequence
        seed = seq[seed_idx:end_idx]
        original_sequence = [idx_word[i] for i in seed]
#         print(seed.shape, type(seed), type(['#']))
        generated = seed[:] + ['#']

        # Find the actual entire sequence
#         actual = generated[:] + seq[end_idx:end_idx + new_words]
        actual = generated[:] + [ans]
    
        # Keep adding new words
        for i in range(new_words):

            # Make a prediction from the seed
            preds = model.predict(np.array(seed).reshape(1, -1))[0].astype(
                np.float64)
#             print(preds.shape, "\n", preds)
            # Diversify
#             preds = np.log(preds) / diversity
#             exp_preds = np.exp(preds)
#             print(exp_preds.shape, exp_preds)

            # Softmax
#             preds = exp_preds / sum(exp_preds)

            # Choose the next word
#             probas = np.random.multinomial(1, preds, 1)[0]

#             next_idx = np.argmax(probas)
            next_idx = np.argmax(preds)

            # New seed adds on old word
            seed = seed[1:] + [next_idx]
            generated.append(next_idx)

        # Showing generated and actual abstract
        n = []

        for i in generated:
            n.append(idx_word.get(i, '< --- >'))

        gen_list.append(n)

    a = []

    for i in actual:
        a.append(idx_word.get(i, '< --- >'))

    a = a[training_length:]

    gen_list = [
        gen[training_length:training_length + len(a)] for gen in gen_list
    ]

    if return_output:
        if return_idx == False:
            return original_sequence, gen_list, a
        else:
            return [word_idx[ele] for ele in original_sequence], word_idx[gen_list[0][1]], word_idx[a[1]]
    # HTML formatting
    seed_html = ''
    seed_html = addContent(seed_html, header(
        'Seed Sequence', color='darkblue'))
    seed_html = addContent(seed_html,
                           box(remove_spaces(' '.join(original_sequence))))

    gen_html = ''
    gen_html = addContent(gen_html, header('RNN Generated', color='darkred'))
    gen_html = addContent(gen_html, box(remove_spaces(' '.join(gen_list[0]))))

    a_html = ''
    a_html = addContent(a_html, header('Actual', color='darkgreen'))
    a_html = addContent(a_html, box(remove_spaces(' '.join(a))))

    return seed_html, gen_html, a_html

In [None]:
def generate_text(model, text_len=100, new_words=1, return_output=True, return_idx=True):
    
    # Choose a random sequence
    seq = list()
    start_idx = 0
    gen_list = []

    # Extract the seed sequence
    seed = seq[seed_idx:end_idx]
    original_sequence = [idx_word[i] for i in seed]
    generated = seed[:] + ['#']

    actual = generated[:] + [ans]

    # Keep adding new words
    for i in range(new_words):

        # Make a prediction from the seed
        preds = model.predict(np.array(seed).reshape(1, -1))[0].astype(
            np.float64)
#             next_idx = np.argmax(probas)
        next_idx = np.argmax(preds)

        # New seed adds on old word
        seed = seed[1:] + [next_idx]
        generated.append(next_idx)

    # Showing generated and actual abstract
    n = []

    for i in generated:
        n.append(idx_word.get(i, '< --- >'))

    gen_list.append(n)

    a = []

    for i in actual:
        a.append(idx_word.get(i, '< --- >'))

    a = a[training_length:]

    gen_list = [
        gen[training_length:training_length + len(a)] for gen in gen_list
    ]

    if return_output:
        if return_idx == False:
            return original_sequence, gen_list, a
        else:
            return [word_idx[ele] for ele in original_sequence], word_idx[gen_list[0][1]], word_idx[a[1]]
    # HTML formatting
    seed_html = ''
    seed_html = addContent(seed_html, header(
        'Seed Sequence', color='darkblue'))
    seed_html = addContent(seed_html,
                           box(remove_spaces(' '.join(original_sequence))))

    gen_html = ''
    gen_html = addContent(gen_html, header('RNN Generated', color='darkred'))
    gen_html = addContent(gen_html, box(remove_spaces(' '.join(gen_list[0]))))

    a_html = ''
    a_html = addContent(a_html, header('Actual', color='darkgreen'))
    a_html = addContent(a_html, box(remove_spaces(' '.join(a))))

    return seed_html, gen_html, a_html