In [1]:
import os
# Expose only GPU index 0 to this process. The variable is assigned before
# `import torch` below. NOTE(review): presumably so it is in place before CUDA
# is initialised — confirm before reordering these lines.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import warnings
import pandas as pd
import torch
# NOTE(review): a later cell runs `from torch.utils.data import Dataset`, which
# rebinds the name `Dataset` imported here.
from datasets import Dataset
# from peft import get_peft_model, LoraConfig, TaskType
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, default_data_collator
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
# NOTE(review): DataLoader is already imported a few lines above; this repeat is redundant.
from torch.utils.data import  DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
import numpy as np
from torch.optim import AdamW

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from torch.utils.data import Dataset

class TweetDataset(Dataset):
    """Map-style dataset pairing tokenised texts with float label vectors.

    Args:
        texts: Indexable, sized collection of texts. Each item is passed
            through ``str`` before tokenisation.
        labels: Indexable collection parallel to ``texts``; each item is a
            sequence of numeric values (one per class) convertible to a tensor.
        tokenizer: Callable tokenizer accepting the Hugging Face keyword
            arguments used below and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every example is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenise example ``idx`` and return it as a dict of tensors.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'`` (flattened
            to 1-D) and ``'labels'`` (float32 tensor built from the stored
            label sequence).
        """
        text = str(self.texts[idx])
        label = self.labels[idx]  # expected to already be a sequence of 0/1 values

        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which accepts the same keyword arguments for a single text.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # The encoded tensors are flattened to 1-D so that default batching
            # stacks them into (batch, length).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Float targets, as required by BCEWithLogitsLoss.
            'labels': torch.tensor(label, dtype=torch.float),
        }

In [5]:
# --- Load the training and evaluation CSVs ---
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs where they exist.
train_data_path = "/home/sohampoddar/HDD2/utsav/Dataset/train.csv"
val_data_path = "/home/sohampoddar/HDD2/utsav/Dataset/test.csv"
train_data = pd.read_csv(train_data_path)
val_data = pd.read_csv(val_data_path)

# Fixed class order: column i of every binarised label row corresponds to label_list[i].
label_list = ['rushed','side-effect','ineffective','mandatory','pharma','ingredients',
              'country','conspiracy','political','unnecessary','none']
mlb = MultiLabelBinarizer(classes=label_list)

# Convert the whitespace-separated label string into a list of label names.
# `split()` without an argument ignores repeated / leading / trailing whitespace,
# so no empty-string "labels" are produced.
train_data['labels'] = train_data['labels'].apply(lambda x: x.split())
val_data['labels'] = val_data['labels'].apply(lambda x: x.split())

# Fit the binarizer once on the training split and only *transform* the
# evaluation split, so the evaluation data never re-fits the encoder.
binary_labels = mlb.fit_transform(train_data['labels'])
binary_labels_val = mlb.transform(val_data['labels'])

# Store one list of len(label_list) binary values per row.
train_data['labels'] = binary_labels.tolist()
val_data['labels'] = binary_labels_val.tolist()

# Plain Python lists consumed by the dataset / training cells.
train_texts = train_data['text'].tolist()
val_texts = val_data['text'].tolist()
train_labels = train_data['labels'].tolist()
val_labels = val_data['labels'].tolist()




In [8]:
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification

# --- Tokenizer, datasets and data loaders ---
model_name = "digitalepidemiologylab/covid-twitter-bert-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_len = 512
train_dataset = TweetDataset(train_texts, train_labels, tokenizer, max_len)
test_dataset = TweetDataset(val_texts, val_labels, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# --- Model, optimizer and loss ---
# One output logit per entry of label_list.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    problem_type="multi_label_classification",
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)
# The loss is computed here on the raw logits rather than by the model,
# since no labels are passed into the forward call.
loss_fn = torch.nn.BCEWithLogitsLoss()
epochs = 6

# --- Training ---
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        # Move the three tensors of the batch onto the training device.
        input_ids, attention_mask, labels = (
            batch[name].to(device)
            for name in ('input_ids', 'attention_mask', 'labels')
        )

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask=attention_mask).logits
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} complete. Avg Loss: {avg_loss:.4f}")

# --- Evaluation on the held-out split ---
model.eval()
all_preds = []
all_labels = []

val_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = (
            batch[name].to(device)
            for name in ('input_ids', 'attention_mask', 'labels')
        )
        logits = model(input_ids, attention_mask=attention_mask).logits
        # Independent per-label decision: sigmoid probability above 0.5.
        probs = logits.sigmoid()
        preds = probs.gt(0.5).int()
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# --- Persist the fine-tuned weights and the tokenizer files ---
model.save_pretrained('covid_twitter_bert_v2_model')
tokenizer.save_pretrained('covid_twitter_bert_v2_tokenizer')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at digitalepidemiologylab/covid-twitter-bert-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 complete. Avg Loss: 0.3123
Epoch 2 complete. Avg Loss: 0.1781
Epoch 3 complete. Avg Loss: 0.1314
Epoch 4 complete. Avg Loss: 0.1010
Epoch 5 complete. Avg Loss: 0.0775
Epoch 6 complete. Avg Loss: 0.0591


('covid_twitter_bert_v2_tokenizer/tokenizer_config.json',
 'covid_twitter_bert_v2_tokenizer/special_tokens_map.json',
 'covid_twitter_bert_v2_tokenizer/vocab.txt',
 'covid_twitter_bert_v2_tokenizer/added_tokens.json',
 'covid_twitter_bert_v2_tokenizer/tokenizer.json')

In [10]:
# Restore the tokenizer and the fine-tuned classifier from the local save directories.
tokenizer = AutoTokenizer.from_pretrained('covid_twitter_bert_v2_tokenizer')
model = AutoModelForSequenceClassification.from_pretrained(
    'covid_twitter_bert_v2_model',
    num_labels=len(label_list),
    problem_type="multi_label_classification",
)
model.to(device)

# Rebuild the evaluation dataset / loader with the reloaded tokenizer.
max_len = 512
test_dataset = TweetDataset(val_texts, val_labels, tokenizer, max_len)
val_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Collect per-example predictions and gold labels as lists of NumPy rows.
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = (
            batch[name].to(device)
            for name in ('input_ids', 'attention_mask', 'labels')
        )
        logits = model(input_ids, attention_mask=attention_mask).logits
        # A label is predicted when its sigmoid probability exceeds 0.5.
        probs = logits.sigmoid()
        preds = probs.gt(0.5).int()
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())


In [11]:
# Import everything this cell uses before the first call, so the cell does not
# depend on an import made in an earlier cell.
from sklearn.metrics import classification_report, f1_score, accuracy_score

# Stack the per-example rows collected during evaluation into 2-D arrays
# (one row per example, one column per entry of label_list).
y_true = np.array(all_labels)
y_pred = np.array(all_preds)

# zero_division=0 scores an undefined precision/recall/F1 (e.g. a sample with
# no predicted label) as 0 without emitting a warning. The reported numbers are
# the same as with the default "warn" setting, which also substitutes 0.
print(classification_report(y_true, y_pred, target_names=label_list, digits=4, zero_division=0))

# F1 Scores
f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=0)
f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)

# Accuracy (strict = an example counts only if every label matches)
strict_accuracy = accuracy_score(y_true, y_pred)

print(f"F1 Micro: {f1_micro:.4f}")
print(f"F1 Macro: {f1_macro:.4f}")
print(f"Strict Accuracy: {strict_accuracy:.4f}")

              precision    recall  f1-score   support

      rushed     0.7329    0.8373    0.7816       295
 side-effect     0.8780    0.8215    0.8488       762
 ineffective     0.7238    0.8473    0.7807       334
   mandatory     0.7230    0.6815    0.7016       157
      pharma     0.7376    0.6392    0.6849       255
 ingredients     0.7727    0.5862    0.6667        87
     country     0.7647    0.6500    0.7027        40
  conspiracy     0.6833    0.4227    0.5223        97
   political     0.6961    0.5680    0.6256       125
 unnecessary     0.7130    0.5310    0.6087       145
        none     0.5492    0.5317    0.5403       126

   micro avg     0.7641    0.7260    0.7446      2423
   macro avg     0.7249    0.6470    0.6785      2423
weighted avg     0.7644    0.7260    0.7405      2423
 samples avg     0.7601    0.7473    0.7389      2423

F1 Micro: 0.7446
F1 Macro: 0.6785
Strict Accuracy: 0.6146


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
