In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import json
import urllib

In [2]:
corpus_data=[]
corpus_label=[]
with open('sarcasm.json', 'r', errors='ignore') as json_data:
    
    ##Converting TextWrapper to List
    json_lines=json_data.readlines()
    ## Removing first and last values as it containg '[' and ']'
    json_lines=json_lines[1:-1]
    
    for data in json_lines:
        #print(type(data))
        ## Each line consists of , and \n new space so we will split that and conside the first element which
        ##has the json data and process
        line_data=data.split(',\n')
        
        line_data_list=line_data[0]
        line_data_json=json.loads(line_data_list)
        
        corpus_data.append(line_data_json['headline'])
        corpus_label.append(line_data_json['is_sarcastic'])
        
        

In [3]:
corpus_data[:10]

["former versace store clerk sues over secret 'black code' for minority shoppers",
 "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
 "mom starting to fear son's web series closest thing she will have to grandchild",
 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
 'j.k. rowling wishes snape happy birthday in the most magical way',
 "advancing the world's women",
 'the fascinating case for eating lab-grown meat',
 'this ceo will send your kids to school, if you work for his company',
 'top snake handler leaves sinking huckabee campaign',
 "friday's morning email: inside trump's presser for the ages"]

In [4]:
corpus_label[:10]

[0, 0, 1, 1, 0, 0, 0, 0, 1, 0]

In [5]:
len(corpus_data)

26709

In [6]:
vocab_size=20000

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [8]:
token= Tokenizer(num_words=vocab_size, oov_token='OOV')
token.fit_on_texts(corpus_data)
vocab_size=len(token.word_index)


In [9]:
corpus_seq= token.texts_to_sequences(corpus_data)

In [10]:
corpus_seq[:10]

[[308, 15115, 679, 3337, 2298, 48, 382, 2576, 15116, 6, 2577, 8434],
 [4, 8435, 3338, 2746, 22, 2, 166, 8436, 416, 3112, 6, 258, 9, 1002],
 [145, 838, 2, 907, 1749, 2093, 582, 4719, 221, 143, 39, 46, 2, 10736],
 [1485, 36, 224, 400, 2, 1832, 29, 319, 22, 10, 2924, 1393, 6969, 968],
 [767, 719, 4720, 908, 10737, 623, 594, 5, 4, 95, 1309, 92],
 [10738, 4, 365, 73],
 [4, 6970, 351, 6, 461, 4274, 2195, 1486],
 [19, 479, 39, 1168, 31, 155, 2, 99, 83, 18, 158, 6, 32, 352],
 [249, 3623, 6971, 555, 5274, 1995, 141],
 [2094, 326, 347, 401, 60, 15117, 6, 4, 3896]]

In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [12]:
corpus_pad= pad_sequences(corpus_seq,maxlen=30, padding='post', truncating='post')

In [13]:
corpus_pad[1]

array([   4, 8435, 3338, 2746,   22,    2,  166, 8436,  416, 3112,    6,
        258,    9, 1002,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0], dtype=int32)

In [14]:
corpus_label=np.array(corpus_label)

In [15]:
split_data=25000
train_data= corpus_pad[:split_data]
train_label=corpus_label[:split_data]
test_data= corpus_pad[split_data:]
test_label= corpus_label[split_data:]

In [16]:
train_data.shape,test_data.shape

((25000, 30), (1709, 30))

In [17]:
test_label.shape

(1709,)

In [18]:
from tensorflow.keras.layers import Embedding,LSTM, Dropout, Dense, Bidirectional

In [29]:
keras.backend.clear_session()
model= keras.Sequential()
model.add(Embedding(input_dim=vocab_size+1, output_dim=100, input_length=30,))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(LSTM(64))
model.add(Dropout(rate=0.5))
model.add(Dense(1, activation=tf.nn.sigmoid))
loss= keras.losses.BinaryCrossentropy()
optimizer= keras.optimizers.Adam(lr= 5e-4)
model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 100)           2965800   
_________________________________________________________________
bidirectional (Bidirectional (None, 30, 128)           84480     
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 3,099,753
Trainable params: 3,099,753
Non-trainable params: 0
_________________________________________________________________


In [30]:
model.fit(train_data, train_label, batch_size=128, epochs=25, validation_data=(test_data, test_label))

Train on 25000 samples, validate on 1709 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f7cdc37d1d0>

In [32]:
import tensorflow_datasets as tfds

In [38]:
tfds.list_builders()

['abstract_reasoning',
 'aeslc',
 'aflw2k3d',
 'amazon_us_reviews',
 'arc',
 'bair_robot_pushing_small',
 'big_patent',
 'bigearthnet',
 'billsum',
 'binarized_mnist',
 'binary_alpha_digits',
 'c4',
 'caltech101',
 'caltech_birds2010',
 'caltech_birds2011',
 'cars196',
 'cassava',
 'cats_vs_dogs',
 'celeb_a',
 'celeb_a_hq',
 'chexpert',
 'cifar10',
 'cifar100',
 'cifar10_1',
 'cifar10_corrupted',
 'citrus_leaves',
 'cityscapes',
 'civil_comments',
 'clevr',
 'cmaterdb',
 'cnn_dailymail',
 'coco',
 'coil100',
 'colorectal_histology',
 'colorectal_histology_large',
 'cos_e',
 'curated_breast_imaging_ddsm',
 'cycle_gan',
 'deep_weeds',
 'definite_pronoun_resolution',
 'diabetic_retinopathy_detection',
 'dmlab',
 'downsampled_imagenet',
 'dsprites',
 'dtd',
 'duke_ultrasound',
 'dummy_dataset_shared_generator',
 'dummy_mnist',
 'emnist',
 'esnli',
 'eurosat',
 'fashion_mnist',
 'flic',
 'flores',
 'food101',
 'gap',
 'gigaword',
 'glue',
 'groove',
 'higgs',
 'horses_or_humans',
 'i_natura