In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import pandas as pd

In [2]:
dataset = pd.read_json("../Datasets/Sarcasm_Headlines_Dataset.json", lines=True)
dataset.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [12]:
column_name_to_remove = 'article_link'
dataset = dataset.drop(columns=[column_name_to_remove])

dataset.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dataset['headline'], dataset['is_sarcastic'], test_size=0.3, random_state=42)

X_train = X_train.astype(str)
X_test = X_test.astype(str)

In [16]:
import spacy
import pandas as pd
from keras_preprocessing.sequence import pad_sequences
import numpy as np

#use spacy
nlp = spacy.load("en_core_web_sm")

#for padding
max_length = 100

for text in X_train:
    doc = nlp(text)
    max_length = max(max_length, len(doc))

#iterate through each text column in the datset
pos_tag_dict = {'NOUN': 1, 'VERB': 2, 'ADJ': 3, 'ADV': 4, 'ADP': 5, 'PRON': 6, 'DET': 7, 'CONJ': 8, 'NUM': 9, 'PART': 10, 'INTJ': 11, 'SYM': 12, 'X': 13}
padded_pos_tags_train = []
padded_pos_tags_test = []

# POS tagging and padding for training set
for text in X_train:
    doc = nlp(text)
    pos_tags = [pos_tag_dict.get(token.pos_, 0) for token in doc]
    padded_pos_tags_train.append(pad_sequences([pos_tags], maxlen=max_length, padding='post')[0])

# POS tagging and padding for testing set
for text in X_test:
    doc = nlp(text)
    pos_tags = [pos_tag_dict.get(token.pos_, 0) for token in doc]
    padded_pos_tags_test.append(pad_sequences([pos_tags], maxlen=max_length, padding='post')[0])

X_train_pos = np.array(padded_pos_tags_train)
X_test_pos = np.array(padded_pos_tags_test)



In [23]:
from keras.layers import CuDNNLSTM
from keras.models import Sequential
from keras.layers import Embedding, Dense
from keras.optimizers import Adam
from keras_preprocessing.sequence import pad_sequences

embedding_dim = 100


pos_tag_vocab_size = len(pos_tag_dict) + 1

max_length = 100
optimizer = Adam(learning_rate=0.1)
m1 = Sequential()
m1.add(Embedding(input_dim=pos_tag_vocab_size, output_dim=embedding_dim, input_length=max_length))
m1.add(CuDNNLSTM(units=150))
m1.add(Dense(units=64, activation='relu'))
m1.add(Dense(units=64, activation='relu'))
m1.add(Dense(units=1, activation='sigmoid'))

m1.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
m1.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 100, 100)          1400      
                                                                 
 cu_dnnlstm_3 (CuDNNLSTM)    (None, 150)               151200    
                                                                 
 dense_9 (Dense)             (None, 64)                9664      
                                                                 
 dense_10 (Dense)            (None, 64)                4160      
                                                                 
 dense_11 (Dense)            (None, 1)                 65        
                                                                 
Total params: 166,489
Trainable params: 166,489
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Train the model
m1.fit(X_train_pos, y_train, epochs=20, batch_size=64, validation_data=(X_test_pos, y_test))

# Evaluate the model
loss, accuracy = m1.evaluate(X_test_pos, y_test)
print(f'Loss: {loss}, Accuracy: {accuracy * 100:.2f}%')



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Loss: 0.6877213716506958, Accuracy: 55.46%


In [None]:
from sklearn.metrics import precision_score, recall_score

#predict on val data
y_val_pred_prob_m1 = m1.predict(X_test)
y_val_pred_m1 = (y_val_pred_prob_m1 > 0.5).astype(int)

y_val_true_m1 = y_test

#calculate precision and recall for binary classification
precision_m1 = precision_score(y_val_true_m1, y_val_pred_m1)
recall_m1 = recall_score(y_val_true_m1, y_val_pred_m1)

# print the results
print(f'Precision: {precision_m1:.4f}')
print(f'Recall: {recall_m1:.4f}')