In [1]:
!pip install transformers --quiet
!pip install opendatasets --quiet
import opendatasets as od
od.download("https://www.kaggle.com/datasets/rmisra/news-headlines-dataset-for-sarcasm-detection")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: tanmay01bhatt
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/rmisra/news-headlines-dataset-for-sarcasm-detection
Downloading news-headlines-dataset-for-sarcasm-detection.zip to ./news-headlines-dataset-for-sarcasm-detection


100%|██████████| 3.30M/3.30M [00:00<00:00, 463MB/s]







In [2]:
import torch
import torch.nn as nn
from torch.optim import Adam
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print("Available Device is: ", device)

Available Device is:  cuda


In [4]:
data = pd.read_json("/content/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset.json", lines=True)
#Each line in the file is a separate JSON object
data.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [5]:
data.dropna(inplace=True)
print(data.shape)

(26709, 3)


In [6]:
X = data['headline'].values
y = data['is_sarcastic'].values

# Data split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5)

In [8]:
X_train.shape, X_val.shape, X_test.shape

((18696,), (4006,), (4007,))

# **Initialize the BERT model and tokenizer**

In [9]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") #each token’s embedding vector is of size 768
bert_model = AutoModel.from_pretrained("bert-base-uncased") #Loads the pretrained BERT model (without a classification head).

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

# Custom Dataset

In [10]:
class dataset(Dataset):

  def __init__(self,X,y):
    self.X = [tokenizer(x,
                        max_length=100,
                        truncation=True,
                        padding = 'max_length',
                        return_tensors='pt' ).to(device)
              for x in X

              ]
    self.y = torch.tensor(y,dtype=torch.float32).to(device)

  def __len__(self):
    return len(self.X)

  def __getitem__(self,idx):
    return  self.X[idx],self.y[idx]

In [11]:
training_data = dataset(X_train, y_train)
validation_data = dataset(X_val, y_val)
testing_data = dataset(X_test, y_test)

In [35]:
BATCH_SIZE = 32
EPOCHS = 20
LR = 1e-4

In [13]:
train_dataloader = DataLoader(training_data, batch_size=BATCH_SIZE, shuffle= True)
validation_dataloader = DataLoader(validation_data, batch_size=BATCH_SIZE, shuffle= True)
testing_dataloader = DataLoader(testing_data, batch_size=BATCH_SIZE, shuffle= True)

# **Model**

In [19]:
class Model(nn.Module):

  def __init__(self,bert):
    super().__init__()

    self.bert = bert
    self.dropout = nn.Dropout(0.25)
    self.linear1 = nn.Linear(768,384)
    self.linear2 = nn.Linear(384,1)
    self.sigmoid = nn.Sigmoid()

  def forward(self,input_ids,attention_mask):

    bert_output =  self.bert(input_ids,attention_mask,return_dict=False)[0][:,0] # to get the CLS token of each sample in the batch
    output = self.linear1(bert_output)
    output = self.dropout(output)
    output = self.linear2(output)
    output = self.sigmoid(output)

    return output

In [32]:
for param in bert_model.parameters():
    param.requires_grad = False

In [28]:
model = Model(bert_model).to(device)

In [29]:
criterion = nn.BCELoss()
optimizer = Adam(model.parameters(), lr= LR)

# Training

input['input_ids'] and input['attention_mask'] are the o/p from bert tokenizer of shape =  [batch_size, 1, seq_len]

first squeeze is on inputs before sending them to the model

second squueze is on the model o/p of shape : [batch_size,1]


since Flat 1D tensor for compatibility with loss functions

In [36]:
for epoch in range(EPOCHS):
    total_acc_train = 0
    total_loss_train = 0
    total_acc_val = 0
    total_loss_val = 0
    ## Training and Validation
    for indx, data in enumerate(train_dataloader):
        input, label = data

        input.to(device)
        label.to(device)

        prediction = model(input['input_ids'].squeeze(1),
                            input['attention_mask'].squeeze(1)).squeeze(1)


        batch_loss = criterion(prediction, label)

        total_loss_train += batch_loss.item()

        acc = ((prediction).round() == label).sum().item()
        total_acc_train += acc

        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    ## Validation
    with torch.no_grad():
        for indx, data in enumerate(validation_dataloader):
            input, label = data
            input.to(device)
            label.to(device)

            prediction = model(input['input_ids'].squeeze(1),
                            input['attention_mask'].squeeze(1)).squeeze(1)

            batch_loss_val = criterion(prediction, label)
            total_loss_val += batch_loss_val.item()


            acc = ((prediction).round() == label).sum().item()

            total_acc_val += acc

    print(f'''Epoch no. {epoch + 1} Train Loss: {total_loss_train/1000:.4f} Train Accuracy: {(total_acc_train/(training_data.__len__())*100):.4f} Validation Loss: {total_loss_val/100:.4f} Validation Accuracy: {(total_acc_val/(validation_data.__len__())*100):.4f}''')
    print("="*50)

Epoch no. 1 Train Loss: 0.0572 Train Accuracy: 96.5126 Validation Loss: 0.3065 Validation Accuracy: 90.9885
Epoch no. 2 Train Loss: 0.0532 Train Accuracy: 96.8015 Validation Loss: 0.2715 Validation Accuracy: 91.3130
Epoch no. 3 Train Loss: 0.0513 Train Accuracy: 96.9512 Validation Loss: 0.2953 Validation Accuracy: 91.2132
Epoch no. 4 Train Loss: 0.0498 Train Accuracy: 97.0208 Validation Loss: 0.3101 Validation Accuracy: 91.1882
Epoch no. 5 Train Loss: 0.0494 Train Accuracy: 97.0368 Validation Loss: 0.2646 Validation Accuracy: 91.2631
Epoch no. 6 Train Loss: 0.0485 Train Accuracy: 97.1438 Validation Loss: 0.3061 Validation Accuracy: 91.2881
Epoch no. 7 Train Loss: 0.0473 Train Accuracy: 97.2401 Validation Loss: 0.2724 Validation Accuracy: 91.5127
Epoch no. 8 Train Loss: 0.0468 Train Accuracy: 97.2561 Validation Loss: 0.2852 Validation Accuracy: 91.4878
Epoch no. 9 Train Loss: 0.0462 Train Accuracy: 97.3363 Validation Loss: 0.2867 Validation Accuracy: 91.6625
Epoch no. 10 Train Loss: 0.0

# Testing

In [37]:
with torch.no_grad():
  total_loss_test = 0
  total_acc_test = 0
  for indx, data in enumerate(testing_dataloader):
    input, label = data
    input.to(device)
    label.to(device)

    prediction = model(input['input_ids'].squeeze(1), input['attention_mask'].squeeze(1)).squeeze(1)

    batch_loss_val = criterion(prediction, label)
    total_loss_test += batch_loss_val.item()
    acc = ((prediction).round() == label).sum().item()
    total_acc_test += acc

print(f"Accuracy Score is: {round((total_acc_test/X_test.shape[0])*100, 2)}%")

Accuracy Score is: 92.56%
