# Dataset Link
[Kaggle](https://www.kaggle.com/datasets/rmisra/news-headlines-dataset-for-sarcasm-detection?select=Sarcasm_Headlines_Dataset.json)

## Overview of Dataset
The sarcastic headlines are collected from **TheOnion**, which publishes satirical versions of current events. The real (non-sarcastic) news headlines were collected from **HuffPost**.

In [None]:
# Mount Google Drive so files under /content/drive/ are readable (Colab-only).
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
from pathlib import Path
import json
import re
import string
from string import punctuation
import unicodedata
import urllib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords

from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from keras.preprocessing import text, sequence
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, Dropout, Bidirectional, GRU
from keras.optimizers import Adam
from keras.initializers import Constant
from keras.callbacks import ModelCheckpoint

import gensim



In [None]:
# Fetch the NLTK stopword corpus read by stopwords.words('english') during cleaning.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Fetch the NLTK 'punkt' data.
# NOTE(review): presumably only needed by word_tokenize in the commented-out CleanTokenize cell — confirm before keeping.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Location of the v2 headlines dataset on the mounted Drive.
path_v2 = Path("/content/drive/MyDrive/Shared With Others/Sarcasm_Headlines_Dataset_v2.json")
# Earlier loading attempt, kept for reference; the file is read with pd.read_json(lines=True) instead.
# with open(path_v2) as f:
#   df = json.load(f)

In [None]:
# lines=True: parse the file as one JSON object per line.
df = pd.read_json(path_v2, lines = True)
# Show five random rows as a quick sanity check.
df.sample(5)

Unnamed: 0,is_sarcastic,headline,article_link
20067,0,here's who the obamas invited to the state of ...,https://www.huffingtonpost.com/entry/state-of-...
25120,0,the toy aisle is almost too much for this boy ...,https://www.huffingtonpost.com/entry/the-toy-a...
12815,1,report: it a miracle nothing has punctured you...,https://www.theonion.com/report-it-a-miracle-n...
3959,1,new report finds adult film star may have paid...,https://politics.theonion.com/new-report-finds...
26584,1,scientists claim solar energy will be capable ...,https://www.theonion.com/scientists-claim-sola...


In [None]:
# Drop the source URL column; only `headline` and `is_sarcastic` are used below.
# errors='ignore' keeps the cell re-runnable: `del df['article_link']` raised
# KeyError the second time it was executed on the same kernel.
df = df.drop(columns=['article_link'], errors='ignore')

# Data Preprocessing and Cleaning

In [None]:
# Tokens to discard: NLTK English stopwords plus every single punctuation character
punctuation = list(string.punctuation)
stop = set(stopwords.words('english')).union(punctuation)

In [None]:
def remove_stopwords(text):
    """Return `text` without the whitespace-separated tokens found in `stop`.

    Membership is tested on the lower-cased token; surviving tokens keep their
    original casing and are re-joined with single spaces.
    """
    # str.split() already yields tokens without surrounding whitespace
    kept = [token for token in text.split() if token.lower() not in stop]
    return " ".join(kept)

def denoise_text(text):
  """Clean one raw headline string.

  Steps, in order: strip HTML markup, drop square-bracketed spans, drop
  URLs starting with "http", then remove stopwords and stand-alone
  punctuation tokens via remove_stopwords.

  Args:
    text: the raw headline.

  Returns:
    The cleaned headline as a single space-joined string.
  """
  # Remove HTML elements, keeping only the visible text
  soup = BeautifulSoup(text, "html.parser")
  text = soup.get_text()

  # Remove bracketed spans such as "[video]". Raw string: '\[' inside a normal
  # string literal is an invalid escape sequence (a warning today, an error in
  # future Python versions); the compiled pattern is unchanged.
  text = re.sub(r'\[[^]]*\]', '', text)
  # Remove URLs: "http" followed by everything up to the next whitespace
  text = re.sub(r'http\S+', '', text)

  return remove_stopwords(text)


# Clean every headline; later cells read the cleaned df['headline'].
df['headline'] = df['headline'].apply(denoise_text)

  soup = BeautifulSoup(text, "html.parser")


# Word Embeddings
Word embeddings represent a document's vocabulary as dense vectors. They can capture the context of a word in a document, semantic and syntactic similarity, relations with other words, etc.

## [Word2Vec Model](https://towardsdatascience.com/introduction-to-word-embedding-and-word2vec-652d0c2060fa)

In [None]:
# Gensim expects a list of token lists, one per headline
words = [headline.split() for headline in df.headline.values]
words[:5]

[['thirtysomething',
  'scientists',
  'unveil',
  'doomsday',
  'clock',
  'hair',
  'loss'],
 ['dem',
  'rep.',
  'totally',
  'nails',
  'congress',
  'falling',
  'short',
  'gender,',
  'racial',
  'equality'],
 ['eat', 'veggies:', '9', 'deliciously', 'different', 'recipes'],
 ['inclement', 'weather', 'prevents', 'liar', 'getting', 'work'],
 ['mother',
  'comes',
  'pretty',
  'close',
  'using',
  'word',
  "'streaming'",
  'correctly']]

In [None]:
EMBEDDING_DIM = 200

# Train Word2Vec on the tokenized headlines; min_count=1 keeps every word,
# including those that occur only once.
w2v_model = gensim.models.Word2Vec(
    sentences=words,
    vector_size=EMBEDDING_DIM,
    window=5,
    min_count=1,
)

In [None]:
# Vocabulary size of the trained Word2Vec model
len(w2v_model.wv.key_to_index) # 38065 words, each with an EMBEDDING_DIM (200)-dimensional vector

38065

In [None]:
# Map each token to an integer id and build fixed-length model inputs.
# This cell must run: `tokenizer` and `x` are used by the cells below
# (vocab_size, get_weight_matrix(..., tokenizer.word_index), train_test_split(x, ...)),
# which otherwise fail with NameError on a fresh kernel.
tokenizer = text.Tokenizer(num_words = 35000)  # cap on how many of the most frequent words are kept in sequences
tokenizer.fit_on_texts(words)
tokenized_train = tokenizer.texts_to_sequences(words)
# Pad/truncate every headline to 20 token ids
x = sequence.pad_sequences(tokenized_train, maxlen = 20)

In [None]:
# The embedding layer needs one extra row for index 0 (padding / unknown words),
# so the vocabulary size increases by 1.

vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Create weight matrix from word2vec

def get_weight_matrix(model, vocab):
  """Build an embedding weight matrix from a trained Word2Vec model.

  Args:
    model: trained gensim Word2Vec model; vectors are read from `model.wv`.
    vocab: mapping word -> integer row index (e.g. `tokenizer.word_index`).

  Returns:
    Array of shape (len(vocab) + 1, model.wv.vector_size). Row 0 and the rows
    of words missing from the Word2Vec vocabulary stay all-zero.
  """
  # Take the width from the model itself so the matrix always matches its vectors
  embedding_dim = model.wv.vector_size
  # +1: row 0 is reserved for unknown / padded positions
  weight_matrix = np.zeros((len(vocab) + 1, embedding_dim))
  known_words = model.wv.key_to_index
  for word, i in vocab.items():
    # Skip words the Word2Vec model never saw instead of raising KeyError
    if word in known_words:
      weight_matrix[i] = model.wv[word]

  return weight_matrix

In [None]:
# Get the embedding vectors from word2vec to use as the initial weights of the Keras Embedding layer
embedding_vectors = get_weight_matrix(w2v_model, tokenizer.word_index)

### Training W2V Model

In [None]:
# model = Sequential()
# model.add(Embedding(vocab_size, output_dim = EMBEDDING_DIM, weights = [embedding_vectors], input_length = 20, trainable = True)) # Non trainable, for embedding

# # LSTM
# model.add(Bidirectional(LSTM(units = 128 , recurrent_dropout = 0.3 , dropout = 0.3, return_sequences = True)))
# model.add(Bidirectional(GRU(units = 32 , recurrent_dropout = 0.1 , dropout = 0.1)))
# model.add(Dense(1, activation='sigmoid'))
# model.compile(optimizer = Adam(learning_rate = 0.01), loss = 'binary_crossentropy', metrics = ['accuracy'])

In [None]:
# del embedding_vectors

In [None]:
# model.summary()

In [None]:
# Hold out 25% of the headlines for evaluation. A fixed random_state makes the
# split reproducible across kernel restarts (same seed as the BERT split).
X_train, X_test, y_train, y_test = train_test_split(x, df.is_sarcastic , test_size = 0.25, random_state = 42)

In [None]:
# epochs_val = 2

In [None]:
# history = model.fit(X_train, y_train, batch_size = 128 , validation_data = (X_test,y_test) , epochs = epochs_val)

In [None]:
# acc = model.evaluate(X_test,y_test)[1]

In [None]:
# print("Accuracy : " , acc * 100)

In [None]:
# epochs = [i for i in range(epochs_val)]
# fig , ax = plt.subplots(1,1)
# train_acc = history.history['accuracy']
# val_acc = history.history['val_accuracy']

# ax.plot(epochs , train_acc , 'go-' , label = 'Training Accuracy')
# ax.plot(epochs , val_acc , 'ro-' , label = 'Testing Accuracy')
# ax.set_title('Training & Testing Accuracy')
# ax.legend()
# ax.set_xlabel("Epochs")
# ax.set_ylabel("Accuracy")

# GloVe (Attempt 2)

In [None]:
# data_1 = pd.read_json("/content/drive/MyDrive/Shared With Others/Sarcasm_Headlines_Dataset.json", lines=True)
# data_2 = pd.read_json("/content/drive/MyDrive/Shared With Others/Sarcasm_Headlines_Dataset_v2.json", lines=True)
# data =  pd.concat([data_1, data_2])
# data.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [None]:
# def clean_text(text):
#     text = text.lower()

#     pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
#     text = pattern.sub('', text)
#     text = " ".join(filter(lambda x:x[0]!='@', text.split()))
#     emoji = re.compile("["
#                            u"\U0001F600-\U0001FFFF"  # emoticons
#                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
#                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#                            u"\U00002702-\U000027B0"
#                            u"\U000024C2-\U0001F251"
#                            "]+", flags=re.UNICODE)

#     text = emoji.sub(r'', text)
#     text = text.lower()
#     text = re.sub(r"i'm", "i am", text)
#     text = re.sub(r"he's", "he is", text)
#     text = re.sub(r"she's", "she is", text)
#     text = re.sub(r"that's", "that is", text)
#     text = re.sub(r"what's", "what is", text)
#     text = re.sub(r"where's", "where is", text)
#     text = re.sub(r"\'ll", " will", text)
#     text = re.sub(r"\'ve", " have", text)
#     text = re.sub(r"\'re", " are", text)
#     text = re.sub(r"\'d", " would", text)
#     text = re.sub(r"\'ve", " have", text)
#     text = re.sub(r"won't", "will not", text)
#     text = re.sub(r"don't", "do not", text)
#     text = re.sub(r"didn't", "did not", text)
#     text = re.sub(r"can't", "can not", text)
#     text = re.sub(r"it's", "it is", text)
#     text = re.sub(r"couldn't", "could not", text)
#     text = re.sub(r"haven't", "have not", text)
#     text = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-]", "", text)
#     return text

In [None]:
# import string
# from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords

# def CleanTokenize(df):
#     head_lines = list()
#     lines = df["headline"].values.tolist()

#     for line in lines:
#         line = clean_text(line)
#         # tokenize the text
#         tokens = word_tokenize(line)
#         # remove puntuations
#         table = str.maketrans('', '', string.punctuation)
#         stripped = [w.translate(table) for w in tokens]
#         # remove non alphabetic characters
#         words = [word for word in stripped if word.isalpha()]
#         stop_words = set(stopwords.words("english"))
#         # remove stop words
#         words = [w for w in words if not w in stop_words]
#         head_lines.append(words)
#     return head_lines

# head_lines = CleanTokenize(data)
# head_lines[0:10]

[['former',
  'versace',
  'store',
  'clerk',
  'sues',
  'secret',
  'black',
  'code',
  'minority',
  'shoppers'],
 ['roseanne',
  'revival',
  'catches',
  'thorny',
  'political',
  'mood',
  'better',
  'worse'],
 ['mom',
  'starting',
  'fear',
  'sons',
  'web',
  'series',
  'closest',
  'thing',
  'grandchild'],
 ['boehner',
  'wants',
  'wife',
  'listen',
  'come',
  'alternative',
  'debtreduction',
  'ideas'],
 ['jk', 'rowling', 'wishes', 'snape', 'happy', 'birthday', 'magical', 'way'],
 ['advancing', 'worlds', 'women'],
 ['fascinating', 'case', 'eating', 'labgrown', 'meat'],
 ['ceo', 'send', 'kids', 'school', 'work', 'company'],
 ['top', 'snake', 'handler', 'leaves', 'sinking', 'huckabee', 'campaign'],
 ['fridays', 'morning', 'email', 'inside', 'trumps', 'presser', 'ages']]

In [None]:
# validation_split = 0.2
# max_length = 25


# tokenizer_obj = text.Tokenizer()
# tokenizer_obj.fit_on_texts(head_lines)
# sequences = tokenizer_obj.texts_to_sequences(head_lines)

# word_index = tokenizer_obj.word_index
# print("unique tokens - ",len(word_index))
# vocab_size = len(tokenizer_obj.word_index) + 1
# print('vocab size -', vocab_size)

# lines_pad = sequence.pad_sequences(sequences, maxlen=max_length, padding='post')
# sentiment =  data['is_sarcastic'].values

# indices = np.arange(lines_pad.shape[0])
# np.random.shuffle(indices)
# lines_pad = lines_pad[indices]
# sentiment = sentiment[indices]

# num_validation_samples = int(validation_split * lines_pad.shape[0])

# X_train_pad = lines_pad[:-num_validation_samples]
# y_train = sentiment[:-num_validation_samples]
# X_test_pad = lines_pad[-num_validation_samples:]
# y_test = sentiment[-num_validation_samples:]

unique tokens -  28657
vocab size - 28658


In [None]:
# print('Shape of X_train_pad:', X_train_pad.shape)
# print('Shape of y_train:', y_train.shape)

# print('Shape of X_test_pad:', X_test_pad.shape)
# print('Shape of y_test:', y_test.shape)

In [None]:
# import os

# embeddings_index = {}
# embedding_dim = 200
# GLOVE_DIR = r'/content/drive/MyDrive/Shared With Others/glove.twitter.27B.200d.txt'
# f = open(GLOVE_DIR, encoding = "utf-8")
# for line in f:
#     values = line.split()
#     word = values[0]
#     coefs = np.asarray(values[1:], dtype='float32')
#     embeddings_index[word] = coefs
# f.close()

# print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
# c = 0
# for word, i in word_index.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         c+=1
#         embedding_matrix[i] = embedding_vector
# print(c)

In [None]:
# embedding_layer = Embedding(len(word_index) + 1,
#                             embedding_dim,
#                             weights=[embedding_matrix],
#                             input_length=max_length,
#                             trainable=False)

In [None]:
# model = Sequential()
# model.add(embedding_layer)
# model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.25))
# model.add(Dense(1, activation='sigmoid'))

# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

# print('Summary of the built model...')
# print(model.summary())

In [None]:
# history = model.fit(X_train_pad, y_train, batch_size=32, epochs=2, validation_data=(X_test_pad, y_test), verbose=2)

In [None]:
# # Plot results
# acc = history.history['acc']
# val_acc = history.history['val_acc']
# loss = history.history['loss']
# val_loss = history.history['val_loss']

# epochs = range(1, len(acc)+1)

# plt.plot(epochs, acc, 'g', label='Training accuracy')
# plt.plot(epochs, val_acc, 'r', label='Validation accuracy')
# plt.title('Training and validation accuracy')
# plt.legend()

# plt.figure()

# plt.plot(epochs, loss, 'g', label='Training loss')
# plt.plot(epochs, val_loss, 'r', label='Validation loss')
# plt.title('Training and validation loss')
# plt.legend()

# plt.show()

In [None]:
# def predict_sarcasm(s):
#     x_final = pd.DataFrame({"headline":[s]})
#     test_lines = CleanTokenize(x_final)
#     test_sequences = tokenizer_obj.texts_to_sequences(test_lines)
#     test_review_pad = sequence.pad_sequences(test_sequences, maxlen=max_length, padding='post')
#     pred = model.predict(test_review_pad)
#     pred*=100
#     if pred[0][0]>=50: return "It's a sarcasm!"
#     else: return "It's not a sarcasm."

In [None]:
# predict_sarcasm("I was depressed. He asked me to be happy. I am not depressed anymore.")

---
---
---
---

# BERT

In [None]:
# Use only the first 1000 headlines and their labels for the DistilBERT experiment
inputs = df['headline'].iloc[:1000]
targets = df['is_sarcastic'].iloc[:1000]

In [None]:
from transformers import DistilBertModel, DistilBertTokenizer
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras

import json
import random
import re
import os
import urllib.request

import spacy
from scipy import spatial

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.manifold import TSNE
import gensim

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

import torch



# Pretrained DistilBERT encoder and its matching tokenizer
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Let the tokenizer build the whole batch. Unlike tokenize() +
# convert_tokens_to_ids(), calling it directly adds the [CLS]/[SEP] special
# tokens, truncates over-long inputs, pads with the tokenizer's own pad id and
# returns the matching attention mask — all as torch tensors, so no TensorFlow
# padding helper is needed.
encoded = tokenizer(list(inputs), padding=True, truncation=True, return_tensors='pt')
input_tensor = encoded['input_ids']
attention_mask = encoded['attention_mask']

# Inference only: no gradients needed
with torch.no_grad():
    outputs = model(input_tensor, attention_mask=attention_mask)
    # Last hidden state: one vector per token, shape (batch, seq_len, hidden)
    embeddings = outputs[0]

In [None]:
# Collapse the token (2nd) dimension: average the token vectors of each
# headline, ignoring padding, to get one sentence vector per headline.
# The previous `embeddings[:, :, 0]` instead kept a single hidden unit per token
# position, which throws away almost all of the representation.
if embeddings.dim() == 3:  # guard so re-running the cell is a no-op
    token_mask = attention_mask.unsqueeze(-1).to(embeddings.dtype)
    # clamp avoids division by zero for a row that is entirely padding
    embeddings = (embeddings * token_mask).sum(dim=1) / token_mask.sum(dim=1).clamp(min=1)

In [None]:
# Hold out 20% of the embedded headlines for evaluation (fixed seed for a
# reproducible split). The classifier itself is created in the next cell, so
# the duplicate instantiation that used to live here was dropped.
X_train, X_test, y_train, y_test = train_test_split(embeddings, targets, test_size=0.2, random_state=42)


In [None]:
# Fixed seed so the forest, and therefore the reported scores, are reproducible
rf_distil = RandomForestClassifier(random_state=42)
rf_distil.fit(X_train, y_train)

# Evaluate on the held-out headlines
y_pred = rf_distil.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.46
              precision    recall  f1-score   support

           0       0.47      0.63      0.53        99
           1       0.45      0.30      0.36       101

    accuracy                           0.46       200
   macro avg       0.46      0.46      0.45       200
weighted avg       0.46      0.46      0.44       200

