<a href="https://colab.research.google.com/github/ShekinahP200/Sarcasm_detection/blob/main/nlp_Sarcasm_detection_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers torch keras pandas numpy scikit-learn




In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout


In [10]:
data_1 = pd.read_json("/content/Sarcasm_Headlines_Dataset.json", lines=True)
data_2 = pd.read_json("/content/Sarcasm_Headlines_Dataset_v2.json", lines=True)
data =  pd.concat([data_1, data_2])
data.head()

# Separate headlines and labels
head_lines = data['headline'].values
sentiment = data['is_sarcastic'].values

# Split data into train and test sets
X_train_bert, X_test_bert, y_train_bert, y_test_bert = train_test_split(head_lines, sentiment, test_size=0.2, random_state=42)
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(head_lines, sentiment, test_size=0.2, random_state=42)


In [11]:
# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and encode sequences for BERT
max_length = 128  # Adjust as per your requirements

def tokenize_sequences(headlines):
    input_ids = []
    attention_masks = []

    for headline in headlines:
        encoded_dict = tokenizer.encode_plus(
                            headline,                      # Sentence to encode.
                            add_special_tokens = True,     # Add '[CLS]' and '[SEP]'
                            max_length = max_length,       # Pad & truncate all sentences.
                            pad_to_max_length = True,
                            return_attention_mask = True,  # Construct attn. masks.
                            return_tensors = 'pt',         # Return pytorch tensors.
                       )

        # Add the encoded sentence to the list.
        input_ids.append(encoded_dict['input_ids'])

        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(encoded_dict['attention_mask'])

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

# Tokenize and prepare input tensors for BERT
X_train_ids, X_train_masks = tokenize_sequences(X_train_bert)
X_test_ids, X_test_masks = tokenize_sequences(X_test_bert)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [12]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import numpy as np

# Define BERT model
model_bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Set device (GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_bert.to(device)

# Create DataLoader for BERT
batch_size = 16  # Reduced from 32 for debugging

# Use a smaller subset of the data for quick debugging
sample_size = 1000
X_train_sample = X_train_ids[:sample_size]
X_train_masks_sample = X_train_masks[:sample_size]
y_train_sample = torch.tensor(y_train_bert[:sample_size])

train_data = TensorDataset(X_train_sample, X_train_masks_sample, y_train_sample)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

X_test_sample = X_test_ids[:sample_size]
X_test_masks_sample = X_test_masks[:sample_size]
y_test_sample = torch.tensor(y_test_bert[:sample_size])

test_data = TensorDataset(X_test_sample, X_test_masks_sample, y_test_sample)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Optimizer and learning rate scheduler
optimizer = AdamW(model_bert.parameters(), lr=2e-5, eps=1e-8)
epochs = 1  # Reduced from 3 for quick debugging
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop for debugging
for epoch in range(epochs):
    model_bert.train()
    total_train_loss = 0
    for step, batch in enumerate(train_dataloader):
        if step > 10:  # Limit the number of steps for debugging
            break
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model_bert(**inputs)
        loss = outputs[0]
        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model_bert.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}')

# Evaluation on test set
model_bert.eval()
total_eval_accuracy = 0
for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0],
              'attention_mask': batch[1],
              'labels': batch[2]}
    with torch.no_grad():
        outputs = model_bert(**inputs)
    logits = outputs[1].detach().cpu().numpy()
    predictions = np.argmax(logits, axis=1).flatten()
    total_eval_accuracy += accuracy_score(predictions, batch[2].cpu().numpy())

avg_val_accuracy = total_eval_accuracy / len(test_dataloader)
print(f'Accuracy on test set: {avg_val_accuracy}')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Average Training Loss: 0.11777066427563863
Accuracy on test set: 0.6170634920634921


In [16]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
# Tokenize and pad sequences for LSTM
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(X_train_lstm)
sequences = tokenizer_obj.texts_to_sequences(X_train_lstm)
word_index = tokenizer_obj.word_index
vocab_size = len(word_index) + 1
max_length_lstm = 100  # Adjust as per your requirements

lines_pad = pad_sequences(sequences, maxlen=max_length_lstm, padding='post')

# Prepare test set for LSTM
test_sequences = tokenizer_obj.texts_to_sequences(X_test_lstm)
test_review_pad = pad_sequences(test_sequences, maxlen=max_length_lstm, padding='post')

# Load pre-trained word embeddings if available
embedding_dim = 100  # Adjust as per your embeddings dimension
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Define LSTM model architecture
model_lstm = Sequential()
model_lstm.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length_lstm, trainable=False))
model_lstm.add(LSTM(64, dropout=0.2, recurrent_dropout=0.25))
model_lstm.add(Dense(1, activation='sigmoid'))

# Compile LSTM model
model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training LSTM model
history_lstm = model_lstm.fit(lines_pad, y_train_lstm, epochs=10, batch_size=32, validation_data=(test_review_pad, y_test_lstm), verbose=2)


Epoch 1/10
1384/1384 - 235s - loss: 0.6894 - accuracy: 0.5443 - val_loss: 0.6917 - val_accuracy: 0.5312 - 235s/epoch - 170ms/step
Epoch 2/10
1384/1384 - 226s - loss: 0.6894 - accuracy: 0.5443 - val_loss: 0.6915 - val_accuracy: 0.5312 - 226s/epoch - 163ms/step
Epoch 3/10
1384/1384 - 240s - loss: 0.6893 - accuracy: 0.5443 - val_loss: 0.6922 - val_accuracy: 0.5312 - 240s/epoch - 174ms/step
Epoch 4/10
1384/1384 - 232s - loss: 0.6894 - accuracy: 0.5443 - val_loss: 0.6914 - val_accuracy: 0.5312 - 232s/epoch - 168ms/step
Epoch 5/10
1384/1384 - 236s - loss: 0.6893 - accuracy: 0.5443 - val_loss: 0.6915 - val_accuracy: 0.5312 - 236s/epoch - 170ms/step
Epoch 6/10
1384/1384 - 233s - loss: 0.6893 - accuracy: 0.5443 - val_loss: 0.6917 - val_accuracy: 0.5312 - 233s/epoch - 169ms/step
Epoch 7/10
1384/1384 - 229s - loss: 0.6893 - accuracy: 0.5443 - val_loss: 0.6919 - val_accuracy: 0.5312 - 229s/epoch - 166ms/step
Epoch 8/10
1384/1384 - 228s - loss: 0.6893 - accuracy: 0.5443 - val_loss: 0.6915 - val_acc

In [24]:
def ensemble_predict(headline):
    # BERT Prediction
    inputs = tokenizer.encode_plus(headline, add_special_tokens=True, max_length=max_length, pad_to_max_length=True, return_tensors='pt')
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model_bert(input_ids, attention_mask=attention_mask)
    logits_bert = outputs[0].detach().cpu().numpy()
    pred_bert = np.argmax(logits_bert, axis=1).flatten()

    # LSTM Prediction
    sequence = tokenizer_obj.texts_to_sequences([headline])
    review_pad = pad_sequences(sequence, maxlen=max_length_lstm, padding='post')
    lstm_pred = model_lstm.predict(review_pad)[0]

    # Ensemble Prediction
    ensemble_pred = (logits_bert[0][1] + lstm_pred) / 2

    if ensemble_pred >= 0.5:
        return "It's a sarcasm!"
    else:
        return "It's not a sarcasm."


In [25]:
ensemble_predict("Fantastic, the printer broke down again just when I needed it most")



"It's not a sarcasm."

In [19]:
ensemble_predict("I enjoy going for a walk in the park on a sunny day.")



"It's not a sarcasm."

In [20]:
ensemble_predict("Sure, because I have nothing better to do than listen to your complaints.")



"It's not a sarcasm."

In [21]:
ensemble_predict("Wow, what a surprise, my favorite TV show got canceled.")



"It's not a sarcasm."

In [23]:
ensemble_predict("Oh great, another rainy day. Just what I needed!")



"It's not a sarcasm."