In [39]:
import opendatasets as od
# Fetch the Kaggle sarcasm-headlines dataset into the working directory.
dataset_url = "https://www.kaggle.com/datasets/rmisra/news-headlines-dataset-for-sarcasm-detection"
od.download(dataset_url)

Skipping, found downloaded files in "./news-headlines-dataset-for-sarcasm-detection" (use force=True to force download)


In [40]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from torch.optim import Adam
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print("Available Device: ", device)

Available Device:  cuda


In [41]:
# Load the JSON-lines file, drop incomplete and duplicated rows, then discard
# the article URL column so only the headline and its label remain.
data_df = (
    pd.read_json(
        "news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset.json",
        lines=True,
    )
    .dropna()
    .drop_duplicates()
    .drop(columns=["article_link"])
)
print(data_df.shape)
data_df.head()

(26708, 2)


Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [42]:
# Seed for both splits so the 70/15/15 partition is identical on every
# Restart & Run All; without it each run trains and evaluates on different rows.
SPLIT_SEED = 42

headlines = np.array(data_df["headline"])
targets = np.array(data_df["is_sarcastic"])

# First split: 70% train, 30% held out. Stratifying on the label keeps the
# sarcastic / non-sarcastic ratio the same in every partition.
X_train, X_test, Y_train, Y_test = train_test_split(
    headlines,
    targets,
    train_size=0.7,
    random_state=SPLIT_SEED,
    stratify=targets,
)
# Second split: divide the held-out 30% evenly into validation and test.
X_val, X_test, Y_val, Y_test = train_test_split(
    X_test,
    Y_test,
    train_size=0.5,
    random_state=SPLIT_SEED,
    stratify=Y_test,
)

total_rows = len(data_df)
print("Training Size:", X_train.shape[0], "Which is : ", round(X_train.shape[0] / total_rows * 100, 2), "%")
print("val Size:", X_val.shape[0], "Which is : ", round(X_val.shape[0] / total_rows * 100, 2), "%")
print("test Size:", X_test.shape[0], "Which is : ", round(X_test.shape[0] / total_rows * 100, 2), "%")

Training Size: 18695 Which is :  70.0 %
val Size: 4006 Which is :  15.0 %
test Size: 4007 Which is :  15.0 %


In [43]:
# Tokenizer and encoder are loaded from the same checkpoint name.
checkpoint = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
bert_model = AutoModel.from_pretrained(checkpoint)

In [44]:
class dataset(Dataset):
    """Pre-tokenized headline dataset kept entirely on `device`.

    Every text in `X` is tokenized once at construction time with the
    module-level `tokenizer` (padded/truncated to 100 tokens) and moved to
    `device`; the labels `Y` are stored as a float32 tensor on `device`.
    Indexing returns the `(encoding, label)` pair for one example.
    """

    def __init__(self, X, Y):
        encodings = []
        for text in X:
            encoded = tokenizer(
                text,
                max_length=100,
                truncation=True,
                padding="max_length",
                return_tensors='pt',
            )
            encodings.append(encoded.to(device))
        self.X = encodings

        labels = torch.tensor(Y, dtype=torch.float32)
        self.Y = labels.to(device)

    def __len__(self):
        # One encoding was stored per input text.
        return len(self.X)

    def __getitem__(self, index):
        encoding = self.X[index]
        label = self.Y[index]
        return encoding, label
    
# Build one tokenized dataset per split, in train / val / test order.
train_dataset, val_dataset, test_dataset = (
    dataset(texts, targets)
    for texts, targets in ((X_train, Y_train), (X_val, Y_val), (X_test, Y_test))
)

In [45]:
# Training hyperparameters: mini-batch size, number of epochs, Adam learning rate.
batch_size, epochs, Lr = 32, 10, 1e-4

In [46]:
# Only the training loader shuffles: a fresh sample order each epoch helps SGD.
# Validation and test metrics are sums over all examples, so shuffling them adds
# nothing and only makes the evaluation batch order non-deterministic.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [47]:
class Model(nn.Module):
    """Binary classifier head on top of a supplied encoder.

    The encoder's first-position hidden state (768 features) is passed through
    Linear(768 -> 384), dropout (p=0.25), Linear(384 -> 1) and a sigmoid, so the
    output is one probability per example with shape (batch, 1).
    """

    def __init__(self, bert):
        super().__init__()

        self.bert = bert
        self.dropout = nn.Dropout(0.25)
        self.linear1 = nn.Linear(768, 384)
        self.linear2 = nn.Linear(384, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_masks):
        # With return_dict=False the encoder returns a tuple; element 0 is
        # indexed as (batch, position, feature) and position 0 is selected.
        encoder_outputs = self.bert(input_ids, attention_masks, return_dict=False)
        first_token = encoder_outputs[0][:, 0]

        hidden = self.dropout(self.linear1(first_token))
        logits = self.linear2(hidden)
        return self.sigmoid(logits)


In [54]:
# Freeze every encoder weight so only the classification head receives gradients.
bert_model.requires_grad_(False)
model = Model(bert_model).to(device)

In [55]:
# Print the module tree to inspect the encoder layers and the classifier head.
print(model)

Model(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=

In [56]:
# Binary cross-entropy on probabilities (the model already applies a sigmoid),
# optimized with Adam at the configured learning rate.
optimizer = Adam(params=model.parameters(), lr=Lr)
criteration = nn.BCELoss()

In [None]:
# Per-epoch history: mean loss per example and accuracy in percent.
total_loss_train_plot = []
total_loss_val_plot = []
total_acc_train_plot = []
total_acc_val_plot = []

pbar = tqdm(range(epochs), desc='Training', unit='epoch', leave=True)

for epoch in pbar:
    # Running sums over the epoch: loss summed per example, correct predictions counted.
    total_loss_train = 0
    total_acc_train = 0
    total_loss_val = 0
    total_acc_val = 0

    # --- Training phase ---
    # Turn the classifier head's dropout on, but keep the frozen encoder in
    # eval mode so its internal dropout does not perturb the fixed features.
    model.train()
    model.bert.eval()

    for inputs, labels in train_dataloader:
        # Tensor.to() returns a new tensor rather than moving in place, so the
        # result has to be assigned.
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Each stored encoding carries a leading singleton dimension; drop it
        # for the encoder, then drop the trailing (batch, 1) output dimension.
        outputs = model(inputs["input_ids"].squeeze(1), inputs["attention_mask"].squeeze(1)).squeeze(1)
        batch_loss = criteration(outputs, labels)

        # BCELoss returns the batch mean; weight it by the real batch size so a
        # smaller final batch is not over-counted in the epoch average.
        total_loss_train += batch_loss.item() * labels.size(0)
        total_acc_train += (outputs.round() == labels).sum().item()

        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # --- Validation phase ---
    # eval() disables the head's dropout; without it the validation metrics
    # would be computed on randomly masked activations.
    model.eval()
    with torch.no_grad():
        for inputs, labels in val_dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs["input_ids"].squeeze(1), inputs["attention_mask"].squeeze(1)).squeeze(1)
            batch_loss = criteration(outputs, labels)

            total_loss_val += batch_loss.item() * labels.size(0)
            total_acc_val += (outputs.round() == labels).sum().item()

    # Exact per-example means, since the sums above are weighted by batch size.
    total_loss_train_plot.append(round(total_loss_train / len(train_dataset), 4))
    total_loss_val_plot.append(round(total_loss_val / len(val_dataset), 4))
    total_acc_train_plot.append(round(total_acc_train/len(train_dataset)*100, 4))
    total_acc_val_plot.append(round(total_acc_val/len(val_dataset)*100, 4))

    # Update the tqdm description with this epoch's metrics (key modification point).
    pbar.set_description(
        f'Epoch {epoch+1}/{epochs} | '
        f'Train: L {total_loss_train_plot[epoch]:.4f} ● A {total_acc_train_plot[epoch]:.2f}% | '
        f'Val: L {total_loss_val_plot[epoch]:.4f} ● A {total_acc_val_plot[epoch]:.2f}%',
        refresh=True
    )

Epoch 10/10 | Train: L 0.0003 ● A 87.17% | Val: L 0.0003 ● A 86.12%: 100%|██████████| 10/10 [10:56<00:00, 65.62s/epoch]


In [62]:
# Final evaluation on the held-out test split.
# eval() disables the classifier head's dropout; without it the reported
# loss and accuracy vary from run to run and are biased downwards.
model.eval()
with torch.no_grad():
    total_loss_test = 0
    total_acc_test = 0
    for inputs, labels in test_dataloader:
        # Tensor.to() is not in-place, so the result has to be assigned.
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs["input_ids"].squeeze(1), inputs["attention_mask"].squeeze(1)).squeeze(1)
        batch_loss = criteration(outputs, labels)

        # Weight the batch-mean loss by the real batch size so the final
        # partial batch does not distort the per-example average.
        total_loss_test += batch_loss.item() * labels.size(0)
        total_acc_test += (outputs.round() == labels).sum().item()

    print(f"Loss: {round(total_loss_test / len(test_dataset), 4)}")
    print(f"Acc: {round(total_acc_test/len(test_dataset)*100, 4)}")

Loss: 0.3185
Acc: 86.0744
