In [2]:
import numpy as np
import pandas as pd
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding,Flatten,Dropout
from tensorflow import keras
from keras.callbacks import EarlyStopping

In [3]:
df2 = pd.read_json("Sarcasm_Headlines_Dataset.json", 'r',lines=True)
dff2 = pd.read_json("Sarcasm_Headlines_Dataset_v2.json", 'r',lines=True)
df =  pd.concat([df2, dff2])
df

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0
...,...,...,...
28614,https://www.theonion.com/jews-to-celebrate-ros...,jews to celebrate rosh hashasha or something,1
28615,https://local.theonion.com/internal-affairs-in...,internal affairs investigator disappointed con...,1
28616,https://www.huffingtonpost.com/entry/andrew-ah...,the most beautiful acceptance speech this week...,0
28617,https://www.theonion.com/mars-probe-destroyed-...,mars probe destroyed by orbiting spielberg-gat...,1


In [4]:
def clean_text(text):
    text = text.lower()
    text = " ".join(filter(lambda x:x[0]!='@', text.split()))
    text = text.lower()
    text = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-]", "", text)
    return text

In [5]:
X=df['headline'].values
y=df['is_sarcastic'].values
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [6]:
vocab_size=3000
embedding_dim=32
max_len=32
trunc_type='post'
padding_type='post'

In [8]:
tokenizer= Tokenizer(num_words=vocab_size, oov_token='OOV')
tokenizer

<keras_preprocessing.text.Tokenizer at 0x2368a8cf1c8>

In [12]:
tokenizer.fit_on_texts(X_train)

In [11]:
training_sequences=tokenizer.texts_to_sequences(X_train)
training_padded=pad_sequences(training_sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)

In [12]:
testing_sequences=tokenizer.texts_to_sequences(X_test)
testing_padded=pad_sequences(testing_sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)

In [13]:
def create_model(vocabulary_size,embedding_dim,seq_len):
    model=Sequential()
    model.add(Embedding(vocabulary_size,embedding_dim,input_length=seq_len))
    model.add(LSTM(64,dropout=0.2,recurrent_dropout=0.25))
    model.add(Dense(1,activation='sigmoid'))
    opt = keras.optimizers.Adam(learning_rate=0.01)
    model.compile(loss='binary_crossentropy',optimizer=opt,metrics=['accuracy'])
    model.summary()
    return model

In [14]:
model=create_model(vocab_size+1,embedding_dim,max_len)
es = EarlyStopping(monitor='val_loss', mode='min', verbose=2, patience=7)
model.fit(training_padded,y_train,batch_size=64,epochs=15,verbose=2,validation_data=(testing_padded,y_test),callbacks=[es])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 32, 32)            96032     
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 120,929
Trainable params: 120,929
Non-trainable params: 0
_________________________________________________________________
Epoch 1/15
606/606 - 22s - loss: 0.4403 - accuracy: 0.7812 - val_loss: 0.3086 - val_accuracy: 0.8716
Epoch 2/15
606/606 - 20s - loss: 0.2737 - accuracy: 0.8886 - val_loss: 0.2691 - val_accuracy: 0.8888
Epoch 3/15
606/606 - 22s - loss: 0.2280 - accuracy: 0.9108 - val_loss: 0.2576 - val_accuracy: 0.8949
Epoch 4/15
606/606 - 22s - loss: 0.2026 - accuracy: 0.9220 - val_loss: 0.2586

<tensorflow.python.keras.callbacks.History at 0x22a8645f048>

In [15]:
def prediction_text(sent):
    sent=[sent]
    seq=tokenizer.texts_to_sequences(sent)
    padded=pad_sequences(seq,maxlen=max_len,padding=padding_type, truncating=trunc_type)
    return model.predict(padded)

In [20]:
print(prediction_text('you broke my car,good job'))

[[0.03785777]]


In [21]:
model2 = Sequential()
model2.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model2.add(Flatten())

model2.add(Dense(units=32,activation='relu'))
model2.add(Dropout(0.5))

model2.add(Dense(units=10,activation='relu'))
model2.add(Dropout(0.5))

model2.add(Dense(units=1,activation='sigmoid'))
opt = keras.optimizers.Adam(learning_rate=0.01)
model2.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
model2.summary()

es = EarlyStopping(monitor='val_loss', mode='min', verbose=2, patience=7)
model2.fit(training_padded,y_train,batch_size=64,epochs=15,verbose=2,validation_data=(testing_padded,y_test),callbacks=[es])

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 32, 32)            96000     
_________________________________________________________________
flatten (Flatten)            (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                32800     
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                330       
_________________________________________________________________
dropout_1 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                

<tensorflow.python.keras.callbacks.History at 0x22a92ac3f88>

In [22]:
def prediction_text2(sent):
    sent=[sent]
    seq=tokenizer.texts_to_sequences(sent)
    padded=pad_sequences(seq,maxlen=max_len,padding=padding_type, truncating=trunc_type)
    return model2.predict(padded)

In [28]:
sent="you broke my car , good job"
print(prediction_text2(sent))

[[0.99999976]]
