https://towardsdatascience.com/recurrent-neural-networks-by-example-in-python-ffd204f99470

# Using a Recurrent Neural Network to write Patent Abstracts

In [1]:
from google.colab import drive
drive.mount('/gdrive', force_remount=True)
drive.mount('/content/drive')

Mounted at /gdrive
Mounted at /content/drive


In [3]:
%load_ext autoreload
%autoreload 2

In [2]:
patent = '/content/drive/MyDrive/Data/RNN_example/neural_network_patent_query.csv'
pretrain = '/content/drive/MyDrive/Data/RNN_example/pre-trained-rnn.h5'
embedding1 = '/content/drive/MyDrive/Data/RNN_example/train-embeddings-rnn.h5'
embedding2 = '/content/drive/MyDrive/Data/RNN_example/train-embeddings-rnn-2-layers.h5'

- 활용 함수 생성 

In [24]:
%%writefile utils.py
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, Masking
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import Sequence
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.utils import shuffle

from IPython.display import HTML

from itertools import chain
from tensorflow.keras.utils import plot_model
import numpy as np
import pandas as pd
import random
import json
import re

RANDOM_STATE = 50
TRAIN_FRACTION = 0.7

def get_data(file, filters='!"%;[\\]^_`{|}~\t\n', training_len=50,
             lower=False):
    """Retrieve formatted training and validation data from a file"""
    
    data = pd.read_csv(file, parse_dates=['patent_date']).dropna(subset = ['patent_abstract'])
    abstracts = [format_sequence(a) for a in list(data['patent_abstract'])]
    word_idx, idx_word, num_words, word_counts, texts, sequences, features, labels = make_sequences(
        abstracts, training_len, lower, filters)
    X_train, X_valid, y_train, y_valid = create_train_valid(features, labels, num_words)
    training_dict = {'X_train': X_train, 'X_valid': X_valid, 
                     'y_train': y_train, 'y_valid': y_valid}
    return training_dict, word_idx, idx_word, sequences

# Data Preparation- Tokenizer 
def make_sequences(texts, training_length = 50,
                   lower = True, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'):
    """Turn a set of texts into sequences of integers"""
    
    # Create the tokenizer object 
    tokenizer = Tokenizer(lower=lower, filters=filters)
    # Train on texts
    tokenizer.fit_on_texts(texts)
    
    # Create look-up dictionaries and reverse look-ups (int -> string)
    word_idx = tokenizer.word_index
    idx_word = tokenizer.index_word
    num_words = len(word_idx) + 1
    word_counts = tokenizer.word_counts
    
    print(f'There are {num_words} unique words.')
    
    # Convert text to sequences of integers
    sequences = tokenizer.texts_to_sequences(texts)
    
    # Limit to sequences with more than training length tokens
    seq_lengths = [len(x) for x in sequences]
    over_idx = [i for i, l in enumerate(seq_lengths) if l > (training_length + 20)]
    
    new_texts = []
    new_sequences = []
    
    # Only keep sequences with more than training length tokens
    for i in over_idx:
        new_texts.append(texts[i])
        new_sequences.append(sequences[i])
        
    features = []
    labels = []
    
    # Iterate through the sequences of tokens
    for seq in new_sequences:
        
        # Create multiple training examples from each sequence
        for i in range(training_length, len(seq)):
            # Extract the features and label
            extract = seq[i - training_length: i + 1]
            
            # Set the features and label
            features.append(extract[:-1])
            labels.append(extract[-1])
    
    print(f'There are {len(features)} sequences.')
    
    # Return everything needed for setting up the model
    return word_idx, idx_word, num_words, word_counts, new_texts, new_sequences, features, labels


# Features and Labels 
def create_train_valid(features,
                       labels,
                       num_words,
                       train_fraction=0.7):
    """Create training and validation features and labels."""
    
    # Randomly shuffle features and labels
    features, labels = shuffle(features, labels, random_state=RANDOM_STATE)

    # Decide on number of samples for training
    train_end = int(train_fraction * len(labels))

    train_features = np.array(features[:train_end])
    valid_features = np.array(features[train_end:])

    train_labels = labels[:train_end]
    valid_labels = labels[train_end:]

    # Convert to arrays
    X_train, X_valid = np.array(train_features), np.array(valid_features)

    # Using int8 for memory savings
    y_train = np.zeros((len(train_labels), num_words), dtype=np.int8)
    y_valid = np.zeros((len(valid_labels), num_words), dtype=np.int8)

    # One hot encoding of labels
    # Neural Network is able to train most effectively when the labels are one-hot encoded
    for example_index, word_index in enumerate(train_labels):
        y_train[example_index, word_index] = 1

    for example_index, word_index in enumerate(valid_labels):
        y_valid[example_index, word_index] = 1

    # Memory management
    import gc
    gc.enable()
    del features, labels, train_features, valid_features, train_labels, valid_labels
    gc.collect()

    return X_train, X_valid, y_train, y_valid

# Modeling 
def get_model(model_name):
    """Retrieve a Keras model and embeddings"""
    model = load_model(f'../models/{model_name}.h5')
    embeddings = model.get_layer(index = 0)
    embeddings = embeddings.get_weights()[0]
    embeddings = embeddings / np.linalg.norm(embeddings, axis = 1).reshape((-1, 1))
    embeddings = np.nan_to_num(embeddings)
    word_idx = []
    with open(f'../data/training-rnn.json', 'rb') as f:
        for l in f:
            word_idx.append(json.loads(l))
        
    word_idx = word_idx[0]
    word_idx['UNK'] = 0
    idx_word = {index: word for word, index in word_idx.items()}
    return model, embeddings, word_idx, idx_word

def get_embeddings(model):
    """Retrieve the embeddings in a model"""
    embeddings = model.get_layer(index = 0)
    embeddings = embeddings.get_weights()[0]
    embeddings = embeddings / np.linalg.norm(embeddings, axis = 1).reshape((-1, 1))
    embeddings = np.nan_to_num(embeddings)
    return embeddings
    
def find_closest(query, embedding_matrix, word_idx, idx_word, n = 10):
    """Find closest words to a query word in embeddings"""
    
    idx = word_idx.get(query, None)
    # Handle case where query is not in vocab
    if idx is None:
        print(f'{query} not found in vocab.')
        return
    else:
        vec = embedding_matrix[idx]
        # Handle case where word doesn't have an embedding
        if np.all(vec == 0):
            print(f'{query} has no pre-trained embedding.')
            return
        else:
            # Calculate distance between vector and all others
            dists = np.dot(embedding_matrix, vec)
            
            # Sort indexes in reverse order
            idxs = np.argsort(dists)[::-1][:n]
            sorted_dists = dists[idxs]
            closest = [idx_word[i] for i in idxs]
            
    print(f'Query: {query}\n')
    # Print out the word and cosine distances
    for word, dist in zip(closest, sorted_dists):
        print(f'Word: {word:15} Cosine Similarity: {round(dist, 4)}')
        
def format_sequence(s):
    """Add spaces around punctuation and remove references to images/citations."""
    
    # Add spaces around punctuation
    s =  re.sub(r'(?<=[^\s0-9])(?=[.,;?])', r' ', s)
    
    # Remove references to figures
    s = re.sub(r'\((\d+)\)', r'', s)
    
    # Remove double spaces
    s = re.sub(r'\s\s', ' ', s)
    return s

def remove_spaces(s):
    """Remove spaces around punctuation"""
    s = re.sub(r'\s+([.,;?])', r'\1', s)
    
    return s


def generate_output(model,
                    sequences,
                    idx_word,
                    seed_length=50,
                    new_words=50,
                    diversity=1,
                    return_output=False,
                    n_gen=1):
    """Generate `new_words` words of output from a trained model and format into HTML."""

    # Choose a random sequence
    seq = random.choice(sequences)

    # Choose a random starting point
    seed_idx = random.randint(0, len(seq) - seed_length - 10)
    # Ending index for seed
    end_idx = seed_idx + seed_length

    gen_list = []

    for n in range(n_gen):
        # Extract the seed sequence
        seed = seq[seed_idx:end_idx]
        original_sequence = [idx_word[i] for i in seed]
        generated = seed[:] + ['#']

        # Find the actual entire sequence
        actual = generated[:] + seq[end_idx:end_idx + new_words]

        # Keep adding new words
        for i in range(new_words):

            # Make a prediction from the seed
            preds = model.predict(np.array(seed).reshape(1, -1))[0].astype(
                np.float64)

            # Diversify
            preds = np.log(preds) / diversity
            exp_preds = np.exp(preds)

            # Softmax
            preds = exp_preds / sum(exp_preds)

            # Choose the next word
            probas = np.random.multinomial(1, preds, 1)[0]

            next_idx = np.argmax(probas)

            # New seed adds on old word
            #             seed = seed[1:] + [next_idx]
            seed += [next_idx]
            generated.append(next_idx)

        # Showing generated and actual abstract
        n = []

        for i in generated:
            n.append(idx_word.get(i, '< --- >'))

        gen_list.append(n)

    a = []

    for i in actual:
        a.append(idx_word.get(i, '< --- >'))

    a = a[seed_length:]

    gen_list = [gen[seed_length:seed_length + len(a)] for gen in gen_list]

    if return_output:
        return original_sequence, gen_list, a

    # HTML formatting
    seed_html = ''
    seed_html = addContent(seed_html, header(
        'Seed Sequence', color='darkblue'))
    seed_html = addContent(seed_html,
                           box(remove_spaces(' '.join(original_sequence))))

    gen_html = ''
    gen_html = addContent(gen_html, header('RNN Generated', color='darkred'))
    gen_html = addContent(gen_html, box(remove_spaces(' '.join(gen_list[0]))))

    a_html = ''
    a_html = addContent(a_html, header('Actual', color='darkgreen'))
    a_html = addContent(a_html, box(remove_spaces(' '.join(a))))

    return seed_html, gen_html, a_html



def header(text, color = 'black', gen_text = None):
    if gen_text:
        raw_html = f'<h1 style="color: {color};"><p><center>' + str(
        text) + '<span style="color: red">' + str(gen_text) + '</center></p></h1>'
    else:
        raw_html = f'<h1 style="color: {color};"><center>' + str(
            text) + '</center></h1>'
    return raw_html


def box(text, gen_text=None):
    if gen_text:
        raw_html = '<div style="border:1px inset black;padding:1em;font-size: 20px;"> <p>' + str(
            text) +'<span style="color: red">' + str(gen_text) + '</p></div>'

    else:
        raw_html = '<div style="border:1px inset black;padding:1em;font-size: 20px;">' + str(
            text) + '</div>'
    return raw_html


def addContent(old_html, raw_html):
    old_html += raw_html
    return old_html

def seed_sequence(model, s, word_idx, idx_word, 
                  diversity = 0.75, num_words = 50):
    """Generate output starting from a seed sequence."""
    # Original formated text
    start = format_sequence(s).split()
    gen = []
    s = start[:]
    # Generate output
    for _ in range(num_words):
        # Conver to arry
        x = np.array([word_idx.get(word, 0) for word in s]).reshape((1, -1))

        # Make predictions
        preds = model.predict(x)[0].astype(float)

        # Diversify
        preds = np.log(preds) / diversity
        exp_preds = np.exp(preds)
        # Softmax
        preds = exp_preds / np.sum(exp_preds)
        # Pick next index
        next_idx = np.argmax(np.random.multinomial(1, preds, size = 1))
        s.append(idx_word[next_idx])
        gen.append(idx_word[next_idx])
    
    # Formatting in html
    start = remove_spaces(' '.join(start)) + ' '
    gen = remove_spaces(' '.join(gen)) 
    html = ''
    html = addContent(html, header('Input Seed ', color = 'black', gen_text = 'Network Output'))
    html = addContent(html, box(start, gen))
    return html

def guess_human(model, sequences, idx_word, seed_length=50):
    """Produce 2 RNN sequences and play game to compare to actaul.
       Diversity is randomly set between 0.5 and 1.25"""
    
    new_words = np.random.randint(10, 50)
    diversity = np.random.uniform(0.5, 1.25)
    sequence, gen_list, actual = generate_output(model, sequences, idx_word, seed_length, new_words,
                                                 diversity=diversity, return_output=True, n_gen = 2)
    gen_0, gen_1 = gen_list
    
    output = {'sequence': remove_spaces(' '.join(sequence)),
              'computer0': remove_spaces(' '.join(gen_0)),
              'computer1': remove_spaces(' '.join(gen_1)),
              'human': remove_spaces(' '.join(actual))}
    
    print(f"Seed Sequence: {output['sequence']}\n")
    
    choices = ['human', 'computer0', 'computer1']
          
    selected = []
    i = 0
    while len(selected) < 3:
        choice = random.choice(choices)
        selected.append(choice)
        print(f'\nOption {i + 1} {output[choice]}')
        choices.remove(selected[-1])
        i += 1
    
    print('\n')
    guess = int(input('Enter option you think is human (1-3): ')) - 1
    print('\n')
    
    if guess == np.where(np.array(selected) == 'human')[0][0]:
        print('*' * 3 + 'Correct' + '*' * 3 + '\n')
        print('-' * 60)
        print('Ordering: ', selected)
    else:
        print('*' * 3 + 'Incorrect' + '*' * 3 + '\n')
        print('-' * 60)
        print('Correct Ordering: ', selected)
          
    print('Diversity', round(diversity, 2))

Overwriting utils.py


In [19]:
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import HTML

InteractiveShell.ast_node_interactivity = 'all'

import warnings
warnings.filterwarnings('ignore', category = RuntimeWarning)
warnings.filterwarnings('ignore', category = UserWarning)

import pandas as pd
import numpy as np
from utils import get_data, generate_output, guess_human, seed_sequence, get_embeddings, find_closest

In [12]:
df = pd.read_csv(patent)
df.head()

Unnamed: 0,patent_abstract,patent_date,patent_number,patent_title
0,""" A """"Barometer"""" Neuron enhances stability in...",1996-07-09,5535303,"""""""Barometer"""" neuron for a neural network"""
1,""" This invention is a novel high-speed neural ...",1993-10-19,5255349,"""Electronic neural network for solving """"trave..."
2,An optical information processor for use as a ...,1995-01-17,5383042,3 layer liquid crystal neural network with out...
3,A method and system for intelligent control of...,2001-01-02,6169981,3-brain architecture for an intelligent decisi...
4,A method and system for intelligent control of...,2003-06-17,6581048,3-brain architecture for an intelligent decisi...


### 1) Data Preparation
1. Remove punctuation and split strings into lists of individual words
2. Convert the individual words into integers

In [13]:
training_dict, word_idx, idx_word, sequences = get_data(patent, training_len = 50)

There are 16192 unique words.
There are 318563 sequences.


In [17]:
training_dict['X_train'][:2]
training_dict['y_train'][:2]

array([[  117,     7,   141,   277,     4,    18,    81,   110,    10,
          219,    29,     1,   952,  2453,    19,     5,     6,     1,
          117,    10,   182,  2166,    21,     1,    81,   178,     4,
           13,   117,   894,    14,  6163,     7,   302,     1,     9,
            8,    29,    33,    23,    74,   428,     7,   692,     1,
           81,   183,     4,    13,   117],
       [    6,    41,     2,    87,     3,  1340,    79,     7,     1,
          409,   543,    22,   484,     6,     2,  2113,   728,    24,
            1,   178,     3,     1,  1820,    55,    14, 13942,  7240,
          244,     5,    14, 13943,  7240,   244,     5,     2,  2113,
         7240,   244,     5,     2,    38,  9292,   244,     2,    49,
         9292,   244,    14,    22, 13944]])

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int8)

### 2) Features and Labels 
1. Use the first 50 words as Features and 51th as the Label (반복) 
2. Features shape (296866, 50)
  - 30000 sequences each with 50 tokens 
  - 50 timesteps each with 1 feature 
3. Label One-hot encoding 
  - to train most effectively 

- idx_word : attribute of the trained tokenizer to figure out what each of these integers means 
  - 정수를 문자열로 변환하여 내용을 확인 할 수 있다 

In [23]:
for i, sequence in enumerate(training_dict['X_train'][:2]):
    text = []
    for idx in sequence:
        text.append(idx_word[idx])

# features: 단어 50개 
# labels : 51번째 단어  
      
    print('Features: ' + ' '.join(text) + '\n')
    print('Label: ' + idx_word[np.argmax(training_dict['y_train'][i])] + '\n')

Features: user to provide samples . A recognition operation is performed on the user's handwritten input , and the user is not satisfied with the recognition result . The user selects an option to train the neural network on one or more characters to improve the recognition results . The user

Label: is

Features: and includes a number of amplifiers corresponding to the N bit output sum and a carry generation from the result of the adding process an augend input-synapse group , an addend input-synapse group , a carry input-synapse group , a first bias-synapse group a second bias-synapse group an output feedback-synapse

Label: group



### 3) Modeling
1. Build LSTM model with Embedding, LSTM, and Dense layers
3. Train model to predict next work in sequence
4. Make predictions by passing in starting sequence

In [25]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, Masking, Bidirectional
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.utils import plot_model

In [26]:
model = Sequential()

# Embedding layer
model.add(Embedding(input_dim=len(word_idx) + 1, output_dim=100, weights=None, trainable=True)) 
# Recurrent layer : dropout will prevent overfitting 
model.add(LSTM(64, return_sequences=False, dropout=0.1, recurrent_dropout=0.1))  
# Fully connected layer 
model.add(Dense(64, activation='relu'))  
# Dropout for regularization 
model.add(Dropout(0.5))  
# Output layer : produce a probability for using softmax activation 
model.add(Dense(len(word_idx) + 1, activation='softmax'))   

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         1619200   
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16192)             1052480   
Total params: 2,718,080
Trainable params: 2,718,080
Non-trainable params: 0
_________________________________________________________________


In [27]:
from keras.models import load_model

# Load in model and demonstrate training
model = load_model(embedding1)
h = model.fit(training_dict['X_train'], training_dict['y_train'], epochs = 5, batch_size = 2048, 
          validation_data = (training_dict['X_valid'], training_dict['y_valid']), 
          verbose = 1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


- Evaliating Model

In [28]:
model = load_model(embedding1)
print('Model Performance: Log Loss and Accuracy on training data')
model.evaluate(training_dict['X_train'], training_dict['y_train'], batch_size = 2048)

print('\nModel Performance: Log Loss and Accuracy on validation data')
model.evaluate(training_dict['X_valid'], training_dict['y_valid'], batch_size = 2048)

Model Performance: Log Loss and Accuracy on training data


[3.289726495742798, 0.3384440839290619]


Model Performance: Log Loss and Accuracy on validation data


[5.132135391235352, 0.2671891450881958]

### Patent Abstract Generation

In [29]:
for i in generate_output(model, sequences, idx_word, seed_length = 50, new_words = 30, diversity = 0.75):
    HTML(i)

In [30]:
for i in generate_output(model, sequences, idx_word, seed_length = 30, new_words = 30, diversity = 1.5):
    HTML(i)

In [31]:
s = 'This patent provides a basis for using a recurrent neural network to '
HTML(seed_sequence(model, s, word_idx, idx_word, diversity = 0.75, num_words = 20))

In [32]:
s = 'The cell state is passed along from one time step to another allowing the '
HTML(seed_sequence(model, s, word_idx, idx_word, diversity = 0.75, num_words = 20))

### Guess Game: Human or Machine? 🕹

In [33]:
guess_human(model, sequences, idx_word)

Seed Sequence: a problem data set representing items associated with the problem is obtained. Evaluation data indicating a state of learning obtained during the learning on the current problem is sequentially stored and displayed. When there is a high possibility of learning protraction during the learning, a message informing


Option 1 < --- > the result of the second metadata are assigned to the phonetic sets of nodes. An activation vector is classified as the linear layer. The network can also be integrated

Option 2 < --- > the user is displayed. When the learning is stopped by the user in this case, the problem data set and evaluation data set are stored. Then, a

Option 3 < --- > the linear process, a new input layer of a certain set can also as a neural network for drawing the shot component of the extracted convolutional neural network. The


Enter option you think is human (1-3): 2


***Correct***

------------------------------------------------------------
Ordering:  ['comp

In [34]:
guess_human(model, sequences, idx_word)

Seed Sequence: between the applied energization control and the motor's response may be employed by a neural network to estimate the regions of operation of the motor. And a system for controlling the operation of motor may employ this information, the neural network, or both to regulate the energization


Option 1 < --- > of a motor's phase winding during a phase cycle.

Option 2 < --- > or transparent refractor and the presence of incident photons.

Option 3 < --- > or to eliminate the optical system. The method also


Enter option you think is human (1-3): 4


***Incorrect***

------------------------------------------------------------
Correct Ordering:  ['human', 'computer0', 'computer1']
Diversity 0.64


In [35]:
guess_human(model, sequences, idx_word)

Seed Sequence: to pass through a variable conductance. The conductance is a function of the states of the one or more interconnecting neighboring cells. Proper interconnection of the cells on a layer can produce a neural network which is sensitive to predetermined patterns or the passage of such patterns across


Option 1 < --- > the coupling pairs in a neural network which further provide the information into such matching an low amplitude that between the cells characterized. The positive distributions of the time by the distance in the same weighted function may be employed and so

Option 2 < --- > a sensor array whose signals are input into the network. The layers in the network can be made sensitive to distinct sensory parameters, so that networks which are sensitive to different wavelengths or polarizations of light energy can be produced.

Option 3 < --- > the charge based upon processing the internal state. Accordingly, or additionally includes: one mode for a geophysical amoun

### Building Modeling: Pre-trained Embeddings
  - Use Glove algorithm and trained on Wikipedia 
  - Notincluded in word embeddings ➡️ **100-d vectors of all zeros**
    - Train own embeddings or Set the Embedding layer's trainable= True

In [36]:
embeddings = get_embeddings(model)
embeddings.shape

(16192, 100)

- Using Cosine Similarity 
   - find the words closests to a given query word in the embedding space

In [37]:
find_closest('network', embeddings, word_idx, idx_word)

Query: network

Word: network         Cosine Similarity: 1.0
Word: channel         Cosine Similarity: 0.7754999995231628
Word: networks        Cosine Similarity: 0.7745000123977661
Word: system          Cosine Similarity: 0.7559999823570251
Word: program         Cosine Similarity: 0.7541999816894531
Word: cable           Cosine Similarity: 0.7419999837875366
Word: now             Cosine Similarity: 0.7297999858856201
Word: programming     Cosine Similarity: 0.7179999947547913
Word: web             Cosine Similarity: 0.7138000130653381
Word: line            Cosine Similarity: 0.6915000081062317


In [38]:
find_closest('data', embeddings, word_idx, idx_word)

Query: data

Word: data            Cosine Similarity: 1.0
Word: information     Cosine Similarity: 0.8185999989509583
Word: numbers         Cosine Similarity: 0.683899998664856
Word: database        Cosine Similarity: 0.6776000261306763
Word: account         Cosine Similarity: 0.6575999855995178
Word: report          Cosine Similarity: 0.6575999855995178
Word: signals         Cosine Similarity: 0.6399999856948853
Word: system          Cosine Similarity: 0.6377000212669373
Word: statistics      Cosine Similarity: 0.6371999979019165
Word: web             Cosine Similarity: 0.6359000205993652


In [39]:
import requests 

response = requests.get('https://google.com/') 
print(response) 

<Response [200]>


In [None]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
max_features = 10000
max_len = 500
print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features) 
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Loading data...
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 500)
x_test shape: (25000, 500)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.optimizers import RMSprop

model = Sequential()
model.add(layers.Embedding(max_features, 128, input_length=max_len))
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.MaxPooling1D(5))
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(1))
model.summary()
model.compile(optimizer=RMSprop(lr=1e-4),
              loss='binary_crossentropy',
              metrics=['acc'])
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=128,
                    validation_split=0.2)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 128)          1280000   
_________________________________________________________________
conv1d (Conv1D)              (None, 494, 32)           28704     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 98, 32)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 92, 32)            7200      
_________________________________________________________________
global_max_pooling1d (Global (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 1,315,937
Trainable params: 1,315,937
Non-trainable params: 0
____________________________________________