In [1]:
import json
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.corpus import stopwords
import string

In [2]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
df = pd.read_json("Sarcasm_Headlines_Dataset.json",lines = True)
df.head(10)

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0
5,https://www.huffingtonpost.com/entry/advancing...,advancing the world's women,0
6,https://www.huffingtonpost.com/entry/how-meat-...,the fascinating case for eating lab-grown meat,0
7,https://www.huffingtonpost.com/entry/boxed-col...,"this ceo will send your kids to school, if you...",0
8,https://politics.theonion.com/top-snake-handle...,top snake handler leaves sinking huckabee camp...,1
9,https://www.huffingtonpost.com/entry/fridays-m...,friday's morning email: inside trump's presser...,0


In [4]:
df.head(-100) # last 100 datas will be deducted

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0
...,...,...,...
26604,https://www.huffingtonpost.com/entry/roger-ail...,roger ailes hires lawyer for possible lawsuit ...,0
26605,https://www.huffingtonpost.com/entry/laquan-mc...,police have no idea how laquan mcdonald footag...,0
26606,https://entertainment.theonion.com/tv-viewers-...,tv viewers outraged at timing of commercial break,1
26607,https://www.huffingtonpost.com/entry/apostolos...,greek and turkish cypriots find common ground ...,0


In [5]:
df.tail()

Unnamed: 0,article_link,headline,is_sarcastic
26704,https://www.huffingtonpost.com/entry/american-...,american politics in moral free-fall,0
26705,https://www.huffingtonpost.com/entry/americas-...,america's best 20 hikes,0
26706,https://www.huffingtonpost.com/entry/reparatio...,reparations and obama,0
26707,https://www.huffingtonpost.com/entry/israeli-b...,israeli ban targeting boycott supporters raise...,0
26708,https://www.huffingtonpost.com/entry/gourmet-g...,gourmet gifts for the foodie 2014,0


In [6]:
df.drop(['article_link'],axis = 1)

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0
...,...,...
26704,american politics in moral free-fall,0
26705,america's best 20 hikes,0
26706,reparations and obama,0
26707,israeli ban targeting boycott supporters raise...,0


In [8]:
label = df['is_sarcastic'].tolist()

In [9]:
sentence[0:10]

["former versace store clerk sues over secret 'black code' for minority shoppers",
 "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
 "mom starting to fear son's web series closest thing she will have to grandchild",
 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
 'j.k. rowling wishes snape happy birthday in the most magical way',
 "advancing the world's women",
 'the fascinating case for eating lab-grown meat',
 'this ceo will send your kids to school, if you work for his company',
 'top snake handler leaves sinking huckabee campaign',
 "friday's morning email: inside trump's presser for the ages"]

In [10]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
stop_words = set(stopwords.words("english"))

# Function to remove stopwords from a single sentence
def remove_stopwords(sentence):
    words = nltk.word_tokenize(sentence)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return " ".join(filtered_words)

# Remove stopwords from each sentence in the list
sentence = [remove_stopwords(sentence) for sentence in sentence]

# print the sentences without stopwords
sentence[:10]

["former versace store clerk sues secret 'black code ' minority shoppers",
 "'roseanne ' revival catches thorny political mood , better worse",
 "mom starting fear son 's web series closest thing grandchild",
 'boehner wants wife listen , come alternative debt-reduction ideas',
 'j.k. rowling wishes snape happy birthday magical way',
 "advancing world 's women",
 'fascinating case eating lab-grown meat',
 'ceo send kids school , work company',
 'top snake handler leaves sinking huckabee campaign',
 "friday 's morning email : inside trump 's presser ages"]

In [12]:
train_size = round(len(sentence)*0.75)
train_sen = sentence[0:train_size]
test_sen = sentence[train_size:]
train_label = label[0:train_size]
test_label = label[train_size:]

In [13]:
vocab_size = 10000
oov_tok = "oov"

In [14]:
tokenizer = Tokenizer(num_words = vocab_size,oov_token = oov_tok)
tokenizer.fit_on_texts(train_sen)
word_index = tokenizer.word_index

In [15]:
(word_index)

{'oov': 1,
 "'s": 2,
 "'": 3,
 'trump': 4,
 'new': 5,
 'man': 6,
 "n't": 7,
 'year': 8,
 'one': 9,
 'report': 10,
 'area': 11,
 'woman': 12,
 'u': 13,
 'day': 14,
 'donald': 15,
 'says': 16,
 'time': 17,
 's': 18,
 'first': 19,
 'obama': 20,
 'women': 21,
 'like': 22,
 'old': 23,
 'get': 24,
 'world': 25,
 'people': 26,
 'life': 27,
 'nation': 28,
 'clinton': 29,
 'house': 30,
 'back': 31,
 'white': 32,
 'could': 33,
 'still': 34,
 'make': 35,
 '5': 36,
 'americans': 37,
 'way': 38,
 'family': 39,
 'gop': 40,
 'study': 41,
 'president': 42,
 'black': 43,
 'show': 44,
 'would': 45,
 'best': 46,
 'school': 47,
 'bill': 48,
 'years': 49,
 '3': 50,
 'police': 51,
 'america': 52,
 'know': 53,
 'hillary': 54,
 'watch': 55,
 'last': 56,
 'really': 57,
 '10': 58,
 'things': 59,
 'video': 60,
 'ca': 61,
 'going': 62,
 'death': 63,
 'good': 64,
 'state': 65,
 'american': 66,
 'finds': 67,
 'mom': 68,
 'home': 69,
 'love': 70,
 'may': 71,
 'need': 72,
 'child': 73,
 'health': 74,
 'say': 75,
 '2'

## Padding

In [16]:
max_length = 100 #max length of a sentence can be 100, if not specified'
#then the length of the longest sentence is sent
trunc_type = 'post' #after a sentence,words are truncated
padding_type = 'post'

In [17]:
training_sequences = tokenizer.texts_to_sequences(train_sen)

training_padded = pad_sequences(training_sequences, maxlen = max_length, padding=padding_type, truncating = trunc_type)

testing_sequences = tokenizer.texts_to_sequences(test_sen)

testing_padded = pad_sequences(testing_sequences, maxlen=max_length,padding=padding_type,truncating = trunc_type)#padding 
#is done to fill the empty spaces.

In [18]:
embedding_dim = 16 # embedding determine the context of the text.
model = tf.keras.Sequential([tf.keras.layers.Embedding(vocab_size,embedding_dim, input_length = max_length),
# dim determines dimention.
                             
tf.keras.layers.GlobalAveragePooling1D(),
                            
tf.keras.layers.Dense(24, activation='relu'), 
tf.keras.layers.Dense(1, activation='sigmoid')]) 

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy']) 

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 16)           160000    
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 24)                408       
                                                                 
 dense_1 (Dense)             (None, 1)                 25        
                                                                 
Total params: 160433 (626.69 KB)
Trainable params: 160433 (626.69 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [19]:
training_padded = np.array(training_padded)
training_labels = np.array(train_label)
testing_padded = np.array(testing_padded)
testing_labels = np.array(test_label)
#converting the list to array for tensorflow

In [20]:
#training the model...finally ghar ja sakte hai 

num_epochs = 30
history = model.fit(training_padded,training_labels,epochs = num_epochs, validation_data = (testing_padded, testing_labels), verbose = 2)

Epoch 1/30
626/626 - 5s - loss: 0.6817 - accuracy: 0.5602 - val_loss: 0.6653 - val_accuracy: 0.5645 - 5s/epoch - 8ms/step
Epoch 2/30
626/626 - 3s - loss: 0.5887 - accuracy: 0.7051 - val_loss: 0.5186 - val_accuracy: 0.7673 - 3s/epoch - 5ms/step
Epoch 3/30
626/626 - 3s - loss: 0.4314 - accuracy: 0.8207 - val_loss: 0.4491 - val_accuracy: 0.7984 - 3s/epoch - 5ms/step
Epoch 4/30
626/626 - 3s - loss: 0.3476 - accuracy: 0.8564 - val_loss: 0.4286 - val_accuracy: 0.8049 - 3s/epoch - 5ms/step
Epoch 5/30
626/626 - 3s - loss: 0.2984 - accuracy: 0.8800 - val_loss: 0.4232 - val_accuracy: 0.8125 - 3s/epoch - 5ms/step
Epoch 6/30
626/626 - 3s - loss: 0.2661 - accuracy: 0.8916 - val_loss: 0.4293 - val_accuracy: 0.8117 - 3s/epoch - 5ms/step
Epoch 7/30
626/626 - 3s - loss: 0.2402 - accuracy: 0.9031 - val_loss: 0.4402 - val_accuracy: 0.8096 - 3s/epoch - 5ms/step
Epoch 8/30
626/626 - 3s - loss: 0.2207 - accuracy: 0.9113 - val_loss: 0.4743 - val_accuracy: 0.8041 - 3s/epoch - 5ms/step
Epoch 9/30
626/626 - 3s 

In [21]:
sen = ["Coworkers at bathroom sink locked in tense standoff over is going to wash hands longer",
      "The covid cases are rising"]
seq = tokenizer.texts_to_sequences(sen)
padded = pad_sequences(seq,maxlen=max_length,padding = padding_type,truncating=trunc_type)
print(model.predict(padded))

[[0.99998635]
 [0.01638081]]
