### Import Packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import plotly.express as px
from wordcloud import WordCloud
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from transformers import BertTokenizer, BertModel
import torch
from torch import nn
from torch.optim import Adam
from tqdm import tqdm

## Loading the Dataset

In [None]:
# The file is read with ``lines=True``: one JSON record per line.
DATASET_PATH = "Sarcasm_Headlines_Dataset_v2.json"
df = pd.read_json(DATASET_PATH, lines=True)
df.head()

## Preprocessing the Dataset

In [None]:
# The source name is taken to be the third run of ASCII letters in the link;
# links with fewer than three runs raise IndexError.
_LETTER_RUNS = re.compile('[a-zA-Z]+')
df["source"] = df["article_link"].map(lambda link: _LETTER_RUNS.findall(link)[2])
# The raw link is dropped once the source has been extracted from it.
df = df.drop(columns=['article_link'])
df.head()

In [None]:
# Patterns are compiled once and reused for every headline (raw strings, so
# ``\s`` and ``\S`` are regex escapes rather than invalid string escapes).
_NOISE_RE = re.compile(r"[@\^&\*$]|#\S+|\S+[a-z0-9]\.(com|net|org)")
_NON_TEXT_RE = re.compile(r"[^a-zA-Z'\s]+")
_STRAY_APOSTROPHE_RE = re.compile(r"(?<![a-z])'|'(?![a-z])")

# Lower-case first because the patterns below only know lowercase letters,
# and fold the typographic apostrophe into the ASCII one.
df['headline'] = df['headline'].str.lower().str.replace("\u2019", "'", regex=False)
# Remove hashtags, bare domain names and stray symbols BEFORE the generic
# character filter: the filter deletes '#', '@' and '.', so running it first
# leaves this substitution with nothing to match.
df['headline'] = df['headline'].apply(lambda x: _NOISE_RE.sub(" ", x))
# Keep letters, whitespace and apostrophes; everything else (digits,
# punctuation, non-ASCII characters) is deleted.
df['headline'] = df['headline'].apply(lambda x: _NON_TEXT_RE.sub('', x))
# Keep only apostrophes that sit between two letters ("don't", "she's") so the
# contraction expansion that follows has something to match; quote marks
# around words are dropped.
df['headline'] = df['headline'].apply(lambda x: _STRAY_APOSTROPHE_RE.sub('', x))

In [None]:
# Contraction -> expansion table used by expand_contractions().
contractions_dict = {
    "ain't": "are not", "'s": " is", "aren't": "are not",
    "can't": "cannot", "can't've": "cannot have",
    "'cause": "because", "could've": "could have", "couldn't": "could not",
    "couldn't've": "could not have", "didn't": "did not", "doesn't": "does not",
    "don't": "do not", "hadn't": "had not", "hadn't've": "had not have",
    "hasn't": "has not", "haven't": "have not", "he'd": "he would",
    "he'd've": "he would have", "he'll": "he will", "he'll've": "he will have",
    "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
    "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
    "I'll've": "I will have", "I'm": "I am", "I've": "I have", "isn't": "is not",
    "it'd": "it would", "it'd've": "it would have", "it'll": "it will",
    "it'll've": "it will have", "let's": "let us", "ma'am": "madam",
    "mayn't": "may not", "might've": "might have", "mightn't": "might not",
    "mightn't've": "might not have", "must've": "must have", "mustn't": "must not",
    "mustn't've": "must not have", "needn't": "need not",
    "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not",
    "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
    "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have",
    "she'll": "she will", "she'll've": "she will have", "should've": "should have",
    "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
    "that'd": "that would", "that'd've": "that would have", "there'd": "there would",
    "there'd've": "there would have", "they'd": "they would",
    "they'd've": "they would have", "they'll": "they will",
    "they'll've": "they will have", "they're": "they are", "they've": "they have",
    "to've": "to have", "wasn't": "was not", "we'd": "we would",
    "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
    "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will",
    "what'll've": "what will have", "what're": "what are", "what've": "what have",
    "when've": "when have", "where'd": "where did", "where've": "where have",
    "who'll": "who will", "who'll've": "who will have", "who've": "who have",
    "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
    "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
    "y'all'd've": "you all would have", "y'all're": "you all are",
    "y'all've": "you all have", "you'd": "you would", "you'd've": "you would have",
    "you'll": "you will", "you'll've": "you will have", "you're": "you are",
    "you've": "you have",
}


def _compile_contractions(mapping):
    """Return ``(pattern, lookup)`` for a contraction -> expansion mapping.

    ``lookup`` is ``mapping`` with lower-cased keys (empty keys are skipped);
    ``pattern`` matches any of those keys case-insensitively, or is ``None``
    when there is nothing to match.
    """
    lookup = {key.lower(): value for key, value in mapping.items() if key}
    if not lookup:
        return None, lookup
    # Longest keys first: alternation takes the first alternative that matches,
    # so "can't" would otherwise win over "can't've" and leave "'ve" behind.
    ordered = sorted(lookup, key=len, reverse=True)
    # Keys that start with a letter must start at a word boundary; keys that
    # start with an apostrophe ("'s", "'cause") cannot take ``\b``.
    alternatives = '|'.join(
        (r'\b' if key[0].isalnum() else '') + re.escape(key) for key in ordered
    )
    # ``(?!\w)`` stops a key matching the start of a longer word, e.g. the
    # "'s" of a quoted "'star".
    return re.compile(r'(%s)(?!\w)' % alternatives, flags=re.IGNORECASE), lookup


# Pattern and lookup for the default table, built once at import time.
contractions_re, _contractions_lookup = _compile_contractions(contractions_dict)
_default_contractions = contractions_dict


def expand_contractions(text, contractions_dict = contractions_dict):
    """Replace every contraction found in ``text`` with its expansion.

    Args:
        text: The string to process.
        contractions_dict: Mapping of contraction -> expansion. Defaults to the
            module-level table; a different mapping gets its own pattern.

    Returns:
        ``text`` with contractions expanded. Matching ignores case; when the
        matched contraction is entirely lowercase the expansion is lower-cased
        too, so lowercased input stays lowercased ("i'm" -> "i am").
    """
    if contractions_dict is _default_contractions:
        pattern, lookup = contractions_re, _contractions_lookup
    else:
        pattern, lookup = _compile_contractions(contractions_dict)
    if pattern is None:
        return text

    def replace(match):
        found = match.group(0)
        expansion = lookup[found.lower()]
        return expansion.lower() if found.islower() else expansion

    return pattern.sub(replace, text)
# Expand contractions in every headline, then record each headline's length
# in characters and in whitespace-separated words.
df['headline'] = df['headline'].map(expand_contractions)
df['CharCount'] = df['headline'].map(len)
df['WordCount'] = df['headline'].str.split().map(len)

df.head(10)

## Visualization of the Data

In [None]:
# Histogram (80 bins, with a KDE curve) of the WordCount column.
length_fig, length_ax = plt.subplots(figsize=(10, 8))
length_ax.set_title('Length of sentence Distribution', fontsize=15)
length_ax.set_xlabel("Length", fontsize=12)
length_ax.set_ylabel("The Number of Sentence", fontsize=12)

sns.histplot(data=df, x='WordCount', bins=80, kde=True, ax=length_ax)

In [None]:
# Overlaid histograms of WordCount, one per is_sarcastic value, with a
# marginal box plot.
fig = px.histogram(
    df['WordCount'],
    color = df['is_sarcastic'],
    marginal = 'box',
    labels = {'value': 'WordCount', 'color': 'is_sarcastic'},
    color_discrete_map = {0: "skyblue", 1: "gray"},
)
# Black outline around every marker.
fig.update_traces(marker = dict(line = dict(color = '#000000', width = 2)))
fig.update_layout(
    title_text = 'Distribution of the sentence length and sarcastic',
    title_x = 0.5,
    title_font = dict(size = 20),
    barmode = 'overlay',
)
fig.show()

In [None]:
# Bar chart of how many rows carry each is_sarcastic value.
label_fig, label_ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=df, x=df.is_sarcastic, saturation=0.5, ax=label_ax)
label_ax.set_title('Sarcastism Distribution', fontsize=12)
label_ax.set_xlabel("Label", fontsize=12)
label_ax.set_ylabel("The Number of label", fontsize=12)

In [None]:
# Bar chart of how many rows come from each source.
source_fig, source_ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=df, x=df.source, saturation=0.65, ax=source_ax)
source_ax.set_title('Source data Distribution', fontsize=12)
source_ax.set_xlabel("Source", fontsize=12)
source_ax.set_ylabel("Count", fontsize=12)

In [None]:
def get_top_nwords(x, n, i):
    """Return the ``n`` most frequent ``i``-grams in the corpus ``x``.

    Args:
        x: Iterable of text documents.
        n: Maximum number of (term, count) pairs to return.
        i: N-gram size; only n-grams of exactly this length are counted.

    Returns:
        A list of ``(term, count)`` tuples sorted by descending count, at most
        ``n`` long.
    """
    vec = CountVectorizer(ngram_range=(i, i))
    # fit_transform makes a single pass over the corpus: it avoids tokenising
    # everything twice and still works when ``x`` is a one-shot iterable,
    # which a separate fit() would exhaust before transform() could read it.
    bow = vec.fit_transform(x)
    # Column totals: one summed count per vocabulary entry.
    totals = np.asarray(bow.sum(axis=0)).ravel()
    words_freq = [(word, totals[idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

# The 20 most frequent bigrams across all headlines, tabulated for plotting.
words = get_top_nwords(df['headline'], n=20, i=2)
df_bi = pd.DataFrame(data=words, columns=['Bigram', 'Frequency'])
df_bi.head()

# Horizontal bar chart, one bar per bigram.
plt.figure(figsize=(15, 8))
sns.set_theme(style="whitegrid")
ax = sns.barplot(data=df_bi, x='Frequency', y='Bigram', orient="h")

In [None]:
# Stop words: wordcloud's STOPWORDS plus a few extra words to leave out.
stopwords = set(STOPWORDS)
stopwords.update(["one", "first", "will", "want", "give"])


# One long string containing every headline.
texts = " ".join(df.headline)
# len(texts) would be the number of characters; split on whitespace to get
# the number of words the message reports.
print("There are {} words in the combination of headlines.".format(len(texts.split())))
# Create and generate a word cloud image:
wordcloud = WordCloud(max_font_size = 50, max_words = 100, stopwords = stopwords, background_color = "white").generate(texts)

# Display the generated image:
plt.figure(figsize = (10, 8))
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis("off")
plt.show()

# Save the image (relative path, so it lands in the current working directory):
wordcloud.to_file("first_review.png")

## Setting the Model

In [None]:
# Tokenizer loaded from the 'prajjwal1/bert-tiny' checkpoint.
tokenizer_checkpoint = 'prajjwal1/bert-tiny'
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Dataset pairing tokenised texts with their labels.

    Every text in ``X`` is tokenised eagerly in ``__init__`` with the
    module-level ``tokenizer``; ``__getitem__`` returns ``(encoding, label)``.
    """

    def __init__(self, X, Y, max_length = 512):
        """Tokenise ``X`` and store ``Y``.

        Args:
            X: Iterable of text strings.
            Y: Labels, one per text (anything ``np.array`` accepts).
            max_length: Length every encoding is padded or truncated to.

        Raises:
            ValueError: If ``X`` and ``Y`` differ in length.
        """
        self.labels = np.array(Y)
        # truncation=True caps every encoding at max_length. Without it a
        # longer text yields a longer tensor than its neighbours, and the
        # batch can no longer be stacked.
        self.texts = [
            tokenizer(text, padding = 'max_length', max_length = max_length,
                      truncation = True, return_tensors = "pt")
            for text in X
        ]
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"X and Y must have the same length, got {len(self.texts)} and {len(self.labels)}"
            )

    def classes(self):
        """Return the stored label array."""
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch the label(s) at idx as a NumPy array
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch the tokenizer output(s) at idx
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [None]:
class BertClassifier(nn.Module):
    """Two-class classifier: BERT encoder -> dropout -> linear head.

    ``forward`` returns raw logits of shape ``(batch, 2)``, which is the input
    ``nn.CrossEntropyLoss`` expects.
    """

    def __init__(self, dropout = 0.5):
        """Load the 'prajjwal1/bert-tiny' encoder and build the head.

        Args:
            dropout: Dropout probability applied to the pooled BERT output.
        """
        super().__init__()

        self.bert = BertModel.from_pretrained('prajjwal1/bert-tiny')
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's config instead of hard-coding 128,
        # so the two cannot drift apart.
        self.linear = nn.Linear(self.bert.config.hidden_size, 2)
        # Kept only so code that refers to ``model.relu`` keeps working; it is
        # no longer applied in forward().
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return class logits for a batch.

        Args:
            input_id: Token ids, passed to BERT as ``input_ids``.
            mask: Attention mask, passed to BERT as ``attention_mask``.
        """
        _, pooled_output = self.bert(input_ids = input_id, attention_mask = mask, return_dict = False)
        dropout_output = self.dropout(pooled_output)
        # No ReLU on the output: clamping logits at zero stops the model from
        # scoring a class below zero and kills the gradient for every clamped
        # logit.
        return self.linear(dropout_output)

## Training and Evaluating the Model

In [None]:
def train(model, X, Y, learning_rate, epochs, batch_size):
    """Fine-tune ``model`` on texts ``X`` with labels ``Y``.

    Uses Adam with cross-entropy loss on CUDA when available (CPU otherwise)
    and prints the mean per-sample loss and the accuracy after every epoch.

    Args:
        model: Module called as ``model(input_id, mask)`` and returning logits.
        X: Iterable of training texts.
        Y: Labels, one per text.
        learning_rate: Learning rate for Adam.
        epochs: Number of passes over the data.
        batch_size: Mini-batch size.

    Raises:
        ValueError: If there are no training examples.
    """
    dataset = Dataset(X, Y)
    num_examples = len(dataset)
    if num_examples == 0:
        raise ValueError("train() needs at least one example")

    train_dataloader = torch.utils.data.DataLoader(dataset, batch_size, shuffle = True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr = learning_rate)

    # Make sure dropout is active even if the model was left in eval mode.
    model.train()

    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0.0

        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            # The dataset stores each encoding with a leading dimension of 1;
            # drop it from both tensors so they share the same shape.
            mask = train_input['attention_mask'].squeeze(1).to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            output = model(input_id, mask)
            batch_loss = criterion(output, train_label)
            batch_loss.backward()
            optimizer.step()

            # batch_loss is the mean over the batch; weight it by the batch
            # size so dividing by the example count gives a per-sample mean.
            total_loss_train += batch_loss.item() * train_label.size(0)
            total_acc_train += (output.argmax(dim=1) == train_label).sum().item()

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / num_examples: .3f} | Train Accuracy: {total_acc_train / num_examples: .3f}')

In [None]:
def evaluate(model, X, Y, batch_size):
    """Predict a class for every text in ``X``.

    Args:
        model: Module called as ``model(input_id, mask)`` and returning logits.
        X: Iterable of texts.
        Y: Labels, one per text; only used to build the dataset.
        batch_size: Mini-batch size.

    Returns:
        A list with one tensor of predicted class indices per batch, in the
        order of ``X``.
    """
    test = Dataset(X, Y)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size, shuffle = False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Dropout must be off during inference, otherwise predictions are random
    # from call to call. The previous mode is restored afterwards.
    was_training = model.training
    model.eval()

    y_pred = []
    try:
        with torch.no_grad():
            for test_input, _ in test_dataloader:
                # Drop the leading dimension of 1 from both tensors.
                mask = test_input['attention_mask'].squeeze(1).to(device)
                input_id = test_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                y_pred.append(output.argmax(dim=1))
    finally:
        model.train(was_training)

    return y_pred

## Initializing the Data

In [None]:
# Features are the headlines, targets the is_sarcastic column.
X = df['headline']
Y = df['is_sarcastic']

# Hold out 33% of the rows for testing; the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state = 42,
    test_size = 0.33,
)

In [None]:
# Hyper-parameters for the training run.
LR = 1e-4 # 0.0001
EPOCHS = 5
batch_size = 32
model = BertClassifier()

train(
    model,
    X_train,
    y_train,
    learning_rate = LR,
    epochs = EPOCHS,
    batch_size = batch_size,
)

## Results

In [None]:
y_pred = evaluate(model, X_test, y_test, batch_size)
# Join the per-batch prediction tensors into one NumPy array.
y_pred_ = torch.cat(y_pred, dim = 0).cpu().detach().numpy()
print(classification_report(y_true = y_test.to_numpy(), y_pred = y_pred_))