In [1]:
import numpy as np
import pandas as pd

In [2]:
import tensorflow as tf

In [3]:
from tensorflow.keras.layers import Dense,LSTM,Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

In [4]:
df = pd.read_json("Sarcasm_Headlines_Dataset.json", lines=True)

In [5]:
df.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [6]:
df = df.drop(['article_link'], axis=1)
df.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [7]:
df.shape

(26709, 2)

In [8]:
df.isnull().sum()

headline        0
is_sarcastic    0
dtype: int64

In [9]:
df.is_sarcastic.value_counts()

0    14985
1    11724
Name: is_sarcastic, dtype: int64

In [10]:
df.headline = df.headline.str.lower()  # headline in lower case

In [11]:
df.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [12]:
from sklearn.model_selection import train_test_split

In [13]:
df_train,df_test = train_test_split(df,test_size=.2)

In [14]:
df_train_x = df_train.iloc[:,0]
df_train_y = df_train.iloc[:,1]

In [15]:
df_test_x = df_test.iloc[:,0]
df_test_y = df_test.iloc[:,1]

In [16]:
df_train_y = to_categorical(df_train_y)

In [17]:
# hyperparameter

max_num_words = 15000

seq_len = 50

embedding_size = 100

In [18]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [19]:
tokenizer = Tokenizer(num_words=max_num_words)

In [20]:
tokenizer.fit_on_texts(df.headline)
df_train_x= tokenizer.texts_to_sequences(df_train_x)
df_train_x = pad_sequences(df_train_x,maxlen=seq_len)

In [21]:
tokenizer.fit_on_texts(df.headline)
df_test_x= tokenizer.texts_to_sequences(df_test_x)
df_test_x = pad_sequences(df_test_x,maxlen=seq_len) 

In [22]:
model = Sequential()
model.add(Embedding(input_dim=max_num_words,   # bcoz on those words we are building model
                   input_length=seq_len,
                   output_dim=embedding_size))
model.add(LSTM(4))
model.add(Dense(2,activation='softmax'))     # no of neurons in o/p layer = no of classes(2) 

adam = Adam(learning_rate=0.003)

model.compile(optimizer='adam',loss='mse',metrics=['accuracy'])

In [34]:
model.fit(df_train_x, df_train_y, epochs=8,validation_split=0.2)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.src.callbacks.History at 0x1a10748d910>

In [35]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 100)           1500000   
                                                                 
 lstm (LSTM)                 (None, 4)                 1680      
                                                                 
 dense (Dense)               (None, 2)                 10        
                                                                 
Total params: 1501690 (5.73 MB)
Trainable params: 1501690 (5.73 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [36]:
pred_values = model.predict(df_test_x)



In [37]:
pred_values

array([[9.9930847e-01, 6.9149339e-04],
       [9.4538304e-04, 9.9905461e-01],
       [9.7545642e-01, 2.4543606e-02],
       ...,
       [9.9991643e-01, 8.3550440e-05],
       [2.2244090e-03, 9.9777561e-01],
       [9.9971694e-01, 2.8308498e-04]], dtype=float32)

In [38]:
pred_classes = np.argmax(pred_values,axis=1)

In [39]:
pred_classes

array([0, 1, 0, ..., 0, 1, 0], dtype=int64)

In [40]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [41]:
conf_mat = confusion_matrix(df_test_y,pred_classes)
conf_mat

array([[2540,  421],
       [ 411, 1970]], dtype=int64)

In [42]:
accuracy_score(df_test_y,pred_classes)*100

84.42530887308124