In [1]:
import numpy as np
import pandas as pd
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding,Flatten,Dropout
from tensorflow import keras
from keras.callbacks import EarlyStopping
import pickle
from keras.models import load_model

In [2]:
import spacy

In [3]:
nlp=spacy.load('en_core_web_lg')

In [4]:
df2 = pd.read_json("Sarcasm_Headlines_Dataset.json", 'r',lines=True)
dff2 = pd.read_json("Sarcasm_Headlines_Dataset_v2.json", 'r',lines=True)
df =  pd.concat([df2, dff2])

  """Entry point for launching an IPython kernel.
  


In [5]:
df[1998:2004]

Unnamed: 0,article_link,headline,is_sarcastic
1998,https://politics.theonion.com/precious-little-...,precious little voter needs to feel inspired b...,1
1999,https://politics.theonion.com/last-line-of-oba...,last line of obama's military force request br...,1
2000,https://www.huffingtonpost.com/entry/mh370-the...,"couple stole $35,000 from missing plane victim...",0
2001,https://www.huffingtonpost.com/entry/dodd-fran...,dodd-frank at four,0
2002,https://www.huffingtonpost.com/entry/what-bein...,what being a christian means to me: don't worr...,0
2003,https://www.huffingtonpost.com/entry/mom-kicks...,video shows mom kicking child out for voting f...,0


In [6]:
def clean_text(text):
    text = text.lower()
    text = " ".join(filter(lambda x:x[0]!='@', text.split()))
    text = text.lower()
    text = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-]", "", text)
    return text

In [7]:
# new_text=[]
# for text in df['data']:  
#       new_text.append(clean_text(text))
# df['clean_text']=new_text

In [8]:
df.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [9]:
X=df['headline'].values
y=df['is_sarcastic'].values

In [10]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [11]:
vocab_size=3000
embedding_dim=32
max_len=32
trunc_type='post'
padding_type='post'

In [12]:
tokenizer= Tokenizer(num_words=vocab_size, oov_token='OOV')
tokenizer.fit_on_texts(X_train)

training_sequences=tokenizer.texts_to_sequences(X_train)
training_padded=pad_sequences(training_sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)

testing_sequences=tokenizer.texts_to_sequences(X_test)
testing_padded=pad_sequences(testing_sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)

In [13]:
def create_model(vocabulary_size,embedding_dim,seq_len):
    model=Sequential()
    model.add(Embedding(vocabulary_size,embedding_dim,input_length=seq_len))
    model.add(LSTM(64,dropout=0.2,recurrent_dropout=0.25))
    model.add(Dense(1,activation='sigmoid'))
    opt = keras.optimizers.Adam(learning_rate=0.01)
    model.compile(loss='binary_crossentropy',optimizer=opt,metrics=['accuracy'])
    model.summary()
    return model

In [14]:
model=create_model(vocab_size+1,embedding_dim,max_len)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 32, 32)            96032     
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 120,929
Trainable params: 120,929
Non-trainable params: 0
_________________________________________________________________


In [15]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=2, patience=7)

In [16]:
model.fit(training_padded,y_train,batch_size=128,epochs=50,verbose=2,validation_data=(testing_padded,y_test),callbacks=[es])

Epoch 1/50
303/303 - 34s - loss: 0.6454 - accuracy: 0.6262 - val_loss: 0.6600 - val_accuracy: 0.5905
Epoch 2/50
303/303 - 25s - loss: 0.4517 - accuracy: 0.7884 - val_loss: 0.3676 - val_accuracy: 0.8408
Epoch 3/50
303/303 - 19s - loss: 0.3193 - accuracy: 0.8653 - val_loss: 0.3285 - val_accuracy: 0.8663
Epoch 4/50
303/303 - 18s - loss: 0.2748 - accuracy: 0.8884 - val_loss: 0.2918 - val_accuracy: 0.8826
Epoch 5/50
303/303 - 18s - loss: 0.2510 - accuracy: 0.9012 - val_loss: 0.2998 - val_accuracy: 0.8791
Epoch 6/50
303/303 - 19s - loss: 0.2373 - accuracy: 0.9074 - val_loss: 0.2782 - val_accuracy: 0.8896
Epoch 7/50
303/303 - 19s - loss: 0.2103 - accuracy: 0.9195 - val_loss: 0.2773 - val_accuracy: 0.8917
Epoch 8/50
303/303 - 19s - loss: 0.1922 - accuracy: 0.9289 - val_loss: 0.2640 - val_accuracy: 0.9013
Epoch 9/50
303/303 - 18s - loss: 0.1865 - accuracy: 0.9305 - val_loss: 0.2594 - val_accuracy: 0.9065
Epoch 10/50
303/303 - 17s - loss: 0.1729 - accuracy: 0.9362 - val_loss: 0.2695 - val_accura

<tensorflow.python.keras.callbacks.History at 0x1f8acd53ba8>

In [17]:
model.save("sarcasm_model1.h5")

In [18]:
def prediction_text(sent):
    sent=[sent]
    seq=tokenizer.texts_to_sequences(sent)
    padded=pad_sequences(seq,maxlen=max_len,padding=padding_type, truncating=trunc_type)
    return model.predict(padded)
    

In [19]:
sent="you broke my car,good job!"
print(prediction_text(sent)) 

[[0.11280417]]


In [20]:
# 2nd model
model2 = Sequential()
model2.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model2.add(Flatten())

model2.add(Dense(units=32,activation='relu'))
model2.add(Dropout(0.5))

model2.add(Dense(units=10,activation='relu'))
model2.add(Dropout(0.5))

model2.add(Dense(units=1,activation='sigmoid'))
opt = keras.optimizers.Adam(learning_rate=0.01)
model2.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 32, 32)            96000     
_________________________________________________________________
flatten (Flatten)            (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                32800     
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                330       
_________________________________________________________________
dropout_1 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                

In [21]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=2, patience=7)

In [22]:
model2.fit(training_padded,y_train,batch_size=128,epochs=10,verbose=2,validation_data=(testing_padded,y_test),callbacks=[es])

Epoch 1/10
303/303 - 2s - loss: 0.4210 - accuracy: 0.8035 - val_loss: 0.2792 - val_accuracy: 0.8851
Epoch 2/10
303/303 - 1s - loss: 0.2585 - accuracy: 0.9019 - val_loss: 0.2441 - val_accuracy: 0.9071
Epoch 3/10
303/303 - 1s - loss: 0.1816 - accuracy: 0.9351 - val_loss: 0.2244 - val_accuracy: 0.9225
Epoch 4/10
303/303 - 1s - loss: 0.1302 - accuracy: 0.9530 - val_loss: 0.2829 - val_accuracy: 0.9222
Epoch 5/10
303/303 - 1s - loss: 0.1106 - accuracy: 0.9603 - val_loss: 0.2951 - val_accuracy: 0.9322
Epoch 6/10
303/303 - 1s - loss: 0.0890 - accuracy: 0.9677 - val_loss: 0.2942 - val_accuracy: 0.9317
Epoch 7/10
303/303 - 1s - loss: 0.0816 - accuracy: 0.9696 - val_loss: 0.3619 - val_accuracy: 0.9349
Epoch 8/10
303/303 - 1s - loss: 0.0731 - accuracy: 0.9742 - val_loss: 0.3841 - val_accuracy: 0.9307
Epoch 9/10
303/303 - 1s - loss: 0.0628 - accuracy: 0.9769 - val_loss: 0.5461 - val_accuracy: 0.9379
Epoch 10/10
303/303 - 1s - loss: 0.0639 - accuracy: 0.9767 - val_loss: 0.5493 - val_accuracy: 0.9363

<tensorflow.python.keras.callbacks.History at 0x1f8b6f5bba8>

In [23]:
model2.save("sarcasm_model2.h5")

In [24]:
def prediction_text2(sent):
    sent=[sent]
    seq=tokenizer.texts_to_sequences(sent)
    padded=pad_sequences(seq,maxlen=max_len,padding=padding_type, truncating=trunc_type)
    return model2.predict(padded)

In [25]:
sent="you broke my car , good job"
print(prediction_text2(sent))

[[0.9998937]]
