In [0]:
#!unzip train.csv.zip

In [3]:
import numpy as np  
import pandas as pd 
import re           
from bs4 import BeautifulSoup 
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords   
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
import warnings
pd.set_option("display.max_colwidth", 200)
warnings.filterwarnings("ignore")

Using TensorFlow backend.


In [4]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
train_df.fillna("", inplace=True)
test_df.fillna("", inplace=True)

train_df.isnull().values.any()

False

In [0]:
train_df["selected_text"] = train_df["selected_text"].apply(lambda x: "_START_ " + x + " _END_")

In [0]:
df = train_df["text"]+test_df["text"] + train_df["selected_text"]
df = df.fillna("")

x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(df)

#convert text sequences into integer sequences
train_seq    =   x_tokenizer.texts_to_sequences(train_df["text"])
test_seq   =   x_tokenizer.texts_to_sequences(test_df["text"])

X_VOCAB_SIZE = len(x_tokenizer.word_index) + 1

In [7]:
SEQ_LEN = max(map(len, train_seq))
print(SEQ_LEN)

35


In [0]:
y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(train_df["selected_text"])

#convert text sequences into integer sequences
y_seq    =   x_tokenizer.texts_to_sequences(train_df["selected_text"])

Y_VOCAB_SIZE = len(y_tokenizer.word_index) + 1

In [0]:
train_seq = pad_sequences(train_seq, maxlen=SEQ_LEN)
test_seq = pad_sequences(test_seq, maxlen=SEQ_LEN)
y_seq = pad_sequences(y_seq, maxlen=SEQ_LEN)

In [10]:
from keras import backend as K 
from attention import AttentionLayer
K.clear_session() 
latent_dim = 128 

# Encoder 
encoder_inputs = Input(shape=(SEQ_LEN,)) 
enc_emb = Embedding(X_VOCAB_SIZE, latent_dim, trainable=True)(encoder_inputs) 

#LSTM 1 
encoder_lstm1 = LSTM(latent_dim, return_sequences=True,return_state=True) 
encoder_outputs, state_h, state_c = encoder_lstm1(enc_emb) 

# Set up the decoder. 
decoder_inputs = Input(shape=(SEQ_LEN,)) 
dec_emb_layer = Embedding(Y_VOCAB_SIZE, latent_dim, trainable=True) 
dec_emb = dec_emb_layer(decoder_inputs) 

#LSTM using encoder_states as initial state
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True) 
decoder_outputs,decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb,initial_state=[state_h, state_c]) 

#Attention Layer
attn_layer = AttentionLayer(name='attention_layer') 
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs]) 

# Concat attention output and decoder LSTM output 
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])

#Dense layer
decoder_dense = TimeDistributed(Dense(Y_VOCAB_SIZE, activation='softmax')) 
decoder_outputs = decoder_dense(decoder_concat_input) 

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs) 
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 35)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 35)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 35, 128)      1570048     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 35, 128)      2282368     input_2[0][0]                    
______________________________________________________________________________________________

In [11]:
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
history=model.fit([train_seq,y_seq[:,:-1]], y_seq.reshape(y_seq.shape[0],y_seq.shape[1], 1)[:,1:] ,epochs=10,callbacks=[es],batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
reverse_target_word_index=y_tokenizer.index_word 
reverse_source_word_index=x_tokenizer.index_word 
target_word_index=y_tokenizer.word_index

In [0]:
encoder_model = Model(inputs=encoder_inputs,outputs=[encoder_outputs, state_h, state_c])

# decoder inference
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_hidden_state_input = Input(shape=(SEQ_LEN,latent_dim))

# Get the embeddings of the decoder sequence
dec_emb2= dec_emb_layer(decoder_inputs)

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])

#attention inference
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_inf_concat = Concatenate(axis=-1, name='concat')([decoder_outputs2, attn_out_inf])

# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_inf_concat)

# Final decoder model
decoder_model = Model(
[decoder_inputs] + [decoder_hidden_state_input,decoder_state_input_h, decoder_state_input_c],
[decoder_outputs2] + [state_h2, state_c2])

In [0]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    e_out, e_h, e_c = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))

    # Chose the 'start' word as the first word of the target sequence
    target_seq[0, 0] = target_word_index['start']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = reverse_target_word_index[sampled_token_index]

        if(sampled_token!='end'):
            decoded_sentence += ' '+sampled_token

            # Exit condition: either hit max length or find stop word.
            if (sampled_token == 'end' or len(decoded_sentence.split()) >= (SEQ_LEN-1)):
                stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update internal states
        e_h, e_c = h, c

    return decoded_sentence

In [0]:
def seq2summary(input_seq):
    newString=''
    for i in input_seq:
      if((i!=0 and i!=target_word_index['start']) and i!=target_word_index['end']):
        newString=newString+reverse_target_word_index[i]+' '
    return newString

def seq2text(input_seq):
    newString=''
    for i in input_seq:
      if(i!=0):
        newString=newString+reverse_source_word_index[i]+' '
    return newString

In [20]:
for i in range(len(train_seq)):
  print("Review:",seq2text(train_seq[i]))
  print("Original summary:",seq2summary(y_seq[i]))
  print("Predicted summary:",decode_sequence(train_seq[i].reshape(1,SEQ_LEN)))
  print("\n")

Review: i d have responded if i were going 
Original summary: dont so foods great people work i 
Predicted summary:  m have start m have about to good t the hear that the way to the how day that the let start m have start m have another start like i i i i


Review: sooo sad i will miss you here in san diego 
Original summary: tried need i 
Predicted summary:  m have start m have about to good t the hear that the way to the how day that the let start m have start m have another start like i i i i


Review: my boss is bullying me 
Original summary: carry of i 
Predicted summary:  m have start m have about to good t the hear that the way to the how day that the let my start m have about to good a was i i i i


Review: what interview leave me alone 
Original summary: anyone of drink i 
Predicted summary:  m have start m have about to good t the hear that the way to the how day that the let my start m have about to good a was i i i i


Review: sons of why couldn t they put them on the relea

KeyboardInterrupt: ignored