<a href="https://colab.research.google.com/github/Sckarge/Multimodal_sarcasm_detection/blob/main/BERT-base/notebooks/BERT_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install pandas
%pip install matplotlib
%pip install transformers
%pip install torch torchvision torchaudio
%pip install tqdm
%pip install -U scikit-learn

Collecting transformers
  Downloading transformers-4.33.3-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m67.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m62.6 MB/s[0m eta [36m0:00:0

In [2]:
import numpy as np
import pandas as pd
from io import StringIO
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertForSequenceClassification
import torch.nn as nn
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, BertTokenizer
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import torch.nn as nn
from tqdm import tqdm
import os

In [3]:
mustard_txt = pd.read_json("/content/drive/MyDrive/Research Papers/Datasets/mustard.json")
mustard_txt = mustard_txt.transpose()

### Preprocessing

In [4]:
sarcasm_label = pd.DataFrame(mustard_txt['sarcasm'],columns=['sarcasm'])
sarcasm_label['sarcasm'] = sarcasm_label['sarcasm'].apply(lambda x: 1 if x == True else 0)
txt_features = mustard_txt.drop(columns='sarcasm')

In [5]:
txt_features_list = list(txt_features['utterance'])
sarcasm_label_list = list(sarcasm_label['sarcasm'])
print(f"Text features: {txt_features}\nSarcasm label:{sarcasm_label}")

Text features:                                               utterance   speaker  \
160   It's just a privilege to watch your mind at work.   SHELDON   
170   I don't think I'll be able to stop thinking ab...     PENNY   
180   Since it's not bee season, you can have my epi...   SHELDON   
190   Lois Lane is falling, accelerating at an initi...   SHELDON   
1105  I'm just inferring this is a couch because the...   SHELDON   
...                                                 ...       ...   
2169  Hes not right for the part, and if I suggest h...  CHANDLER   
2235  Oh yeah he has a caretaker his older brother, ...  CHANDLER   
234   Is it me or the greetings gone downhill around...  CHANDLER   
2608  You are right, by saying nice, I am virtually ...  CHANDLER   
2524            Yes and we are "very" excited about it.  CHANDLER   

                                                context  \
160   [I never would have identified the fingerprint...   
170   [This is one of my favorite plac

In [6]:
X_train, X_temp, y_train, y_temp = train_test_split(txt_features_list, sarcasm_label_list, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

### Model setup

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
num_classes = 2  # Adjust this based on your task's number of classes
additional_layer = nn.Linear(in_features=model.config.hidden_size, out_features=num_classes)
model.classifier.add_module("additional_layer", additional_layer)

### Setting up a data loader

In [9]:
max_seq_length = 128  # Choose an appropriate sequence length
tokenized_texts = [tokenizer.encode(text, max_length=max_seq_length, pad_to_max_length=True, truncation=True) for text in X_train]



In [10]:
# Convert to tensors
input_ids = torch.tensor([text for text in tokenized_texts], dtype=torch.long)
attention_masks = (input_ids != 0).int()  # Create attention masks
labels = torch.tensor(y_train, dtype=torch.long)

In [11]:
# Create a DataLoader
dataset = TensorDataset(input_ids, attention_masks, labels)
batch_size = 32  # Choose an appropriate batch size
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

### Model training

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
# Define hyperparameters
learning_rate = os.environ.get("LEARNING_RATE") or 1e-5
num_epochs = os.environ.get("NUM_EPOCHS") or int(input("Epochs: "))
batch_size = os.environ.get("BATCH_SIZE") or int(input("Batch size: "))

Epochs: 30
Batch size: 16


In [14]:
# Set up optimizer and loss function
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [None]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in tqdm(data_loader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        input_ids, attention_mask, labels = batch

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Calculate loss
        loss = criterion(logits, labels)

        # Backpropagation and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(data_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

# Save the trained model
torch.save(model.state_dict(), "fine_tuned_bert_model.pth")

Epoch 1/30: 100%|██████████| 18/18 [14:03<00:00, 46.88s/it]


Epoch 1/30, Average Loss: 0.7124


Epoch 2/30: 100%|██████████| 18/18 [13:15<00:00, 44.17s/it]


Epoch 2/30, Average Loss: 0.6783


Epoch 3/30:  17%|█▋        | 3/18 [02:20<11:43, 46.92s/it]