In [1]:
import json

def parse_data(file):
    for l in open(file,'r'):
        yield json.loads(l)

data = list(parse_data('Sarcasm_Headlines_Dataset.json'))

import pandas as pd

df = pd.DataFrame(data)

df

df.drop(['article_link'], axis=1,inplace=True)

df

df.shape

df.isnull().sum()

df['headline'][4]

'j.k. rowling wishes snape happy birthday in the most magical way'

In [2]:
"""## Normalizing the Text data"""

import nltk
nltk.download("stopwords")

import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to C:\Users\The
[nltk_data]     ChainSmokers\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
ps = PorterStemmer()

col1 = df['headline']

corpus=[]
for i in range(len(df)):
    review= re.sub('[^a-zA-Z]', ' ', col1[i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in set(stopwords.words('english'))]
    review= ' '.join(review)
    corpus.append(review)

corpus

['former versac store clerk sue secret black code minor shopper',
 'roseann reviv catch thorni polit mood better wors',
 'mom start fear son web seri closest thing grandchild',
 'boehner want wife listen come altern debt reduct idea',
 'j k rowl wish snape happi birthday magic way',
 'advanc world women',
 'fascin case eat lab grown meat',
 'ceo send kid school work compani',
 'top snake handler leav sink huckabe campaign',
 'friday morn email insid trump presser age',
 'airlin passeng tackl man rush cockpit bomb threat',
 'facebook reportedli work healthcar featur app',
 'north korea prais trump urg us voter reject dull hillari',
 'actual cnn jeffrey lord indefens',
 'barcelona hold huge protest support refuge',
 'nuclear bomb deton rehears spider man music',
 'cosbi lawyer ask accus come forward smear legal team year ago',
 'stock analyst confus frighten boar market',
 'bloomberg program build better citi got bigger',
 'craig hick indict',
 'courtroom sketch artist clear manga influe

In [None]:
from tensorflow.keras.preprocessing.text import one_hot

voc_size=10000

one_hot_rep = [one_hot(word,voc_size) for word in corpus]

one_hot_rep

In [1]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Bidirectional


INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [4]:

sent_len=30
embedded_docs = pad_sequences(one_hot_rep, padding='pre', maxlen=sent_len)

embedded_docs

dimensions = 100

model=Sequential()
model.add(Embedding(voc_size,dimensions,input_length=sent_len))
model.add(SpatialDropout1D(0.4))
model.add(Bidirectional(LSTM(100,dropout=0.2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 100)           1000000   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 30, 100)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 200)               160800    
_________________________________________________________________
flatten (Flatten)            (None, 200)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               25728     
_______________________________________________

In [5]:
import numpy as np
X = np.array(embedded_docs)
y = np.array(df['is_sarcastic'])

y

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25, random_state=42)

y_train.shape



(20031,)

In [6]:

model.fit(X_train,y_train, epochs=5, batch_size=32, validation_data=(X_test,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x200a0874a90>

In [7]:
y_pred = model.predict_classes(X_test)

y_pred



array([[1],
       [0],
       [1],
       ...,
       [0],
       [1],
       [1]])

In [8]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[2931,  804],
       [ 793, 2150]], dtype=int64)

In [9]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.7608565438754118

In [None]:

from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

In [2]:
from tensorflow.keras.models import load_model

In [3]:
model = load_model('Sarcasm_model.h5')

In [5]:
from tensorflow.keras.preprocessing.text import one_hot

In [12]:
headline = 'Mom Warns Son to Watch Out for Idiots Rear‑Ends His Motorcycle'
# headline = 'Cows lose their jobs as milk prices drop'
# headline = 'Man Accused of Killing Lawyer Receives a New Attorney'
# headline = 'India to have over two billion vaccine doses during Aug-Dec'
# headline = '15 more patients die at Goa Medical College due to oxygen shortage'
# headline = 'City Union Bank donates Rs 1 crore to Relief Fund'
headline_len = len(headline)

onehot=[one_hot(headline, headline_len)]
result = pad_sequences(onehot, padding='pre', maxlen=headline_len)

final = model.predict(result)



In [13]:
final[0]

array([0.03790373], dtype=float32)

In [14]:
if final > 0.5:
  print("Sarcastic")
else:
  print("Not Sarcastic")

Not Sarcastic


In [11]:
model.save("Sarcasm_model.h5")