<a href="https://colab.research.google.com/github/SWLee1212/TensorFlow-in-Practice/blob/master/NLP_in_TensorFlow_2_week_sarcasm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
print(tf.__version__)

1.14.0


In [0]:
import json

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [0]:
vocab_size = 10000        # vocab size 
embedding_dim = 16       # embedding vector dimension 16
max_length = 32           # max input sentence length 32
trunc_type = 'post'       # if input sentence is larger than 32, truncate post
padding_type = 'post'     # if input sentence is shorter than 32, padding post
oov_tok = '<oov>'         # out of vocab 
training_size = 20000     # training data length 

In [4]:
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    -O /tmp/sarcasm.json
  

--2019-10-07 23:18:18--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.141.128, 2607:f8b0:400c:c06::80
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.141.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘/tmp/sarcasm.json’


2019-10-07 23:18:18 (149 MB/s) - ‘/tmp/sarcasm.json’ saved [5643545/5643545]



In [0]:
with open("/tmp/sarcasm.json",'r') as f:
  datastore = json.load(f)

In [0]:
sentences = []
labels  = []

for item in datastore:  
  sentences.append(item['headline'])
  labels.append(item['is_sarcastic'])

In [7]:
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

print(len(training_sentences), len(testing_sentences))


20000 6709


In [0]:
# making tokenizer, word to sequence number
# make sequence from training sentences

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen = max_length,
                                padding=padding_type, truncating = trunc_type)

# make sequneces from tokenizer which is formed by training sentences
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen = max_length, 
                               padding=padding_type, truncating = trunc_type)




In [14]:
model = tf.keras.Sequential([
              tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
              tf.keras.layers.GlobalAveragePooling1D(),
              tf.keras.layers.Dense(24, activation='relu'),
              tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.summary()  

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 32, 16)            160000    
_________________________________________________________________
global_average_pooling1d_2 ( (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 24)                408       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 25        
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


In [18]:
num_epochs = 30

history = model.fit(training_padded, training_labels, epochs=num_epochs,
                    validation_data = (testing_padded, testing_labels), verbose = 2)



Train on 20000 samples, validate on 6709 samples
Epoch 1/30
20000/20000 - 3s - loss: 0.5600 - acc: 0.7024 - val_loss: 0.3935 - val_acc: 0.8399
Epoch 2/30
20000/20000 - 2s - loss: 0.3100 - acc: 0.8760 - val_loss: 0.3476 - val_acc: 0.8490
Epoch 3/30
20000/20000 - 2s - loss: 0.2318 - acc: 0.9093 - val_loss: 0.3549 - val_acc: 0.8474
Epoch 4/30
20000/20000 - 2s - loss: 0.1859 - acc: 0.9288 - val_loss: 0.3658 - val_acc: 0.8556
Epoch 5/30
20000/20000 - 3s - loss: 0.1539 - acc: 0.9424 - val_loss: 0.4042 - val_acc: 0.8438
Epoch 6/30
20000/20000 - 3s - loss: 0.1291 - acc: 0.9545 - val_loss: 0.4349 - val_acc: 0.8459
Epoch 7/30
20000/20000 - 2s - loss: 0.1109 - acc: 0.9616 - val_loss: 0.4790 - val_acc: 0.8430
Epoch 8/30
20000/20000 - 3s - loss: 0.0958 - acc: 0.9668 - val_loss: 0.5210 - val_acc: 0.8405
Epoch 9/30
20000/20000 - 3s - loss: 0.0827 - acc: 0.9716 - val_loss: 0.5743 - val_acc: 0.8353
Epoch 10/30
20000/20000 - 3s - loss: 0.0706 - acc: 0.9776 - val_loss: 0.6308 - val_acc: 0.8301
Epoch 11/3

In [0]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [25]:
def plot_graphs(history):
  fig = make_subplots(rows = 1, cols=2, subplot_titles=('Acc','Loss'))

  fig.add_trace(
      go.Scatter(y=history.history['acc'], name='train acc'), row=1, col=1
  )
  fig.add_trace(
      go.Scatter(y=history.history['val_acc'], name='test acc'), row=1, col=1
  )
  fig.add_trace(
      go.Scatter(y=history.history['loss'], name='train loss'), row=1, col=2
  )
  fig.add_trace(
      go.Scatter(y=history.history['val_loss'], name='test loss'), row=1, col=2
  )
  
  fig.show()

plot_graphs(history)

In [0]:
vocab_size = 1000
embedding_dim = 32
max_length = 16


In [0]:
def gen_model(vocab_size, embedding_dim, max_length):

  tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
  tokenizer.fit_on_texts(training_sentences)

  word_index = tokenizer.word_index

  training_sequences = tokenizer.texts_to_sequences(training_sentences)
  training_padded = pad_sequences(training_sequences, maxlen = max_length,
                                  padding=padding_type, truncating = trunc_type)

  # make sequneces from tokenizer which is formed by training sentences
  testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
  testing_padded = pad_sequences(testing_sequences, maxlen = max_length, 
                                padding=padding_type, truncating = trunc_type)
  
  model = tf.keras.Sequential([
              tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
              tf.keras.layers.GlobalAveragePooling1D(),
              tf.keras.layers.Dense(24, activation='relu'),
              tf.keras.layers.Dense(1, activation='sigmoid')
  ])

  model.compile(loss='binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
  model.summary()  

  return model


In [28]:
model2 =  gen_model(vocab_size, embedding_dim, max_length)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 16, 32)            32000     
_________________________________________________________________
global_average_pooling1d_3 ( (None, 32)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 24)                792       
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 25        
Total params: 32,817
Trainable params: 32,817
Non-trainable params: 0
_________________________________________________________________


In [29]:
history = model.fit(training_padded, training_labels, epochs=num_epochs,
                    validation_data = (testing_padded, testing_labels), verbose = 2)

Train on 20000 samples, validate on 6709 samples
Epoch 1/30
20000/20000 - 2s - loss: 0.0080 - acc: 0.9977 - val_loss: 1.9608 - val_acc: 0.8037
Epoch 2/30
20000/20000 - 2s - loss: 0.0076 - acc: 0.9977 - val_loss: 1.9831 - val_acc: 0.8016
Epoch 3/30
20000/20000 - 2s - loss: 0.0071 - acc: 0.9981 - val_loss: 1.9576 - val_acc: 0.7976
Epoch 4/30
20000/20000 - 2s - loss: 0.0059 - acc: 0.9987 - val_loss: 2.0864 - val_acc: 0.8009
Epoch 5/30
20000/20000 - 2s - loss: 0.0052 - acc: 0.9985 - val_loss: 2.1862 - val_acc: 0.7989
Epoch 6/30
20000/20000 - 2s - loss: 0.0065 - acc: 0.9983 - val_loss: 2.2350 - val_acc: 0.7992
Epoch 7/30
20000/20000 - 2s - loss: 0.0048 - acc: 0.9985 - val_loss: 2.2341 - val_acc: 0.8010
Epoch 8/30
20000/20000 - 2s - loss: 0.0062 - acc: 0.9979 - val_loss: 2.1946 - val_acc: 0.7956
Epoch 9/30
20000/20000 - 2s - loss: 0.0057 - acc: 0.9980 - val_loss: 2.3531 - val_acc: 0.8001
Epoch 10/30
20000/20000 - 2s - loss: 0.0050 - acc: 0.9984 - val_loss: 2.3594 - val_acc: 0.7982
Epoch 11/3

In [30]:
plot_graphs(history)