## Data Ingestion

In [1]:
import pandas as pd

column_names = ['id', 'text', 'label']
weibo_train = pd.read_csv("/kaggle/input/weibo-dataset/weibo.train", sep="\t", 
                               header=None, names=column_names)
weibo_test = pd.read_csv("/kaggle/input/weibo-dataset/weibo.test", sep="\t", 
                              header=None, names=column_names)
weibo_dev = pd.read_csv("/kaggle/input/weibo-dataset/weibo.dev", sep="\t", 
                             header=None, names=column_names)

In [2]:
weibo_test.dropna(inplace=True)

In [3]:
weibo_train.head()

Unnamed: 0,id,text,label
0,3910552407639880,【教育部新规拟要求高校辅导员必须是中共党员】教育部日前在官网公布了《普通高等学校辅导员队伍建...,non-rumor
1,3536960792696376,【民生】特大好消息：继北京出台95岁老人看病不要钱后，山东泰安宣布100岁可免费登泰山，初步...,false
2,3921542276460163,【被疯传的投影舞 看哭了[泪]】这段不足4分钟的舞蹈由8位舞者共同完成，舞出一对情侣从相识到...,non-rumor
3,3922114622540868,#突发#【深圳一处工业园区被曝遭山体滑坡掩埋】据网友曝料，深圳光明新区红坳村柳溪工业园发生山...,non-rumor
4,3911293176451011,【“朝阳群众”注册人数已达13万！[赞]】他们大隐隐于市，不露声色却屡建奇功……目前登记在册...,non-rumor


In [4]:
weibo_train.shape, weibo_test.shape, weibo_dev.shape

((3147, 3), (1049, 3), (467, 3))

In [6]:
weibo_train['label'].value_counts()

label
non-rumor    1587
false        1560
Name: count, dtype: int64

In [7]:
weibo_test['label'].value_counts()

label
non-rumor    529
false        520
Name: count, dtype: int64

In [8]:
weibo_dev['label'].value_counts()

label
non-rumor    235
false        232
Name: count, dtype: int64

## Data Preparation

In [9]:
LABELS = weibo_train['label'].unique().tolist()
train_label = weibo_train['label'].replace(LABELS, [0, 1]).tolist()

  train_label = weibo_train['label'].replace(LABELS, [0, 1]).tolist()


In [10]:
LABELS

['non-rumor', 'false']

In [11]:
train_data = weibo_train['text'].tolist()

In [12]:
dev_data = weibo_dev['text'].tolist()
dev_label = weibo_dev['label'].replace(LABELS, [0, 1]).tolist()

  dev_label = weibo_dev['label'].replace(LABELS, [0, 1]).tolist()


In [13]:
test_data = weibo_test['text'].tolist()
test_label = weibo_test['label'].replace(LABELS, [0, 1]).tolist()

  test_label = weibo_test['label'].replace(LABELS, [0, 1]).tolist()


In [14]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

class ClassificationDataset(Dataset):
    def __init__(self, texts: list[str], labels: list[int], tokenizer, max_length: int):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer.encode_plus(
            text, 
            add_special_tokens=True, 
            max_length=self.max_length, 
            return_token_type_ids=False, 
            padding="max_length",
            truncation=True, 
            return_attention_mask=True, 
            return_tensors="pt"
        )
        
        return {
            "input_ids": encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [15]:
from transformers import BartTokenizer, BartModel, AdamW

max_length = 128

tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
train_dataset = ClassificationDataset(train_data, train_label, tokenizer, max_length)
dev_dataset = ClassificationDataset(dev_data, dev_label, tokenizer, max_length)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]



In [16]:
batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size)

In [17]:
test_dataset = ClassificationDataset(test_data, test_label, tokenizer, max_length)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

## Model Preparation

In [18]:
num_classes = 2

In [19]:
class BARTClassifier(nn.Module):
    def __init__(self, bart_model, num_classes, dropout=0.1):
        super(BARTClassifier, self).__init__()
        self.bart = bart_model
        self.dropout = nn.Dropout(dropout)
        self.ffn = nn.Sequential(
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.bart(input_ids=input_ids, attention_mask=attention_mask)
        
        last_hidden_state = outputs.last_hidden_state  
        
        attention_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_hidden_states = torch.sum(last_hidden_state * attention_mask_expanded, 1)
        sum_attention_mask = torch.clamp(attention_mask_expanded.sum(1), min=1e-9)
        pooled_output = sum_hidden_states / sum_attention_mask
        
        x = self.dropout(pooled_output)
        logits = self.ffn(x)
        return logits

In [20]:
bart_model = BartModel.from_pretrained('facebook/bart-base')

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [21]:
model = BARTClassifier(bart_model, num_classes)

In [22]:
learning_rate = 2e-5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BARTClassifier(
  (bart): BartModel(
    (shared): BartScaledWordEmbedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): Layer

In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np
from tqdm import tqdm

def calculate_metrics(y_true, y_pred, y_pred_proba, num_classes):
    if num_classes == 2:
        metrics = {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred),
            'recall': recall_score(y_true, y_pred),
            'f1': f1_score(y_true, y_pred)
        }
    else:
        metrics = {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, average='weighted'),
            'recall': recall_score(y_true, y_pred, average='weighted'),
            'f1': f1_score(y_true, y_pred, average='weighted')
        }
    
    # Calculate ROC-AUC score
    if num_classes == 2:
        metrics['roc_auc'] = roc_auc_score(y_true, y_pred_proba[:, 1])
    else:
        try:
            metrics['roc_auc'] = roc_auc_score(y_true, y_pred_proba, multi_class='ovr', average='weighted')
        except:
            metrics['roc_auc'] = None
            
    return metrics

In [24]:
def evaluate(model, data_loader, device, num_classes):
    model.eval()
    all_labels = []
    all_predictions = []
    all_predictions_proba = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            probabilities = torch.softmax(outputs, dim=1)
            _, predicted = torch.max(outputs, 1)
            
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())
            all_predictions_proba.extend(probabilities.cpu().numpy())

    all_labels = np.array(all_labels)
    all_predictions = np.array(all_predictions)
    all_predictions_proba = np.array(all_predictions_proba)

    return calculate_metrics(all_labels, all_predictions, all_predictions_proba, num_classes)

In [25]:
def train(model, train_loader, val_loader, criterion, optimizer, device, num_epochs):
    best_val_metrics = {'f1': 0.0}
    history = {'train': [], 'val': []}

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        all_train_labels = []
        all_train_predictions = []
        all_train_predictions_proba = []

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            probabilities = torch.softmax(outputs, dim=1)
            _, predicted = torch.max(outputs, 1)
            
            all_train_labels.extend(labels.cpu().numpy())
            all_train_predictions.extend(predicted.cpu().numpy())
            all_train_predictions_proba.extend(probabilities.cpu().detach().numpy())

        # Calculate training metrics
        all_train_labels = np.array(all_train_labels)
        all_train_predictions = np.array(all_train_predictions)
        all_train_predictions_proba = np.array(all_train_predictions_proba)
        train_metrics = calculate_metrics(all_train_labels, all_train_predictions, 
                                       all_train_predictions_proba, num_classes)
        
        # Validation phase
        val_metrics = evaluate(model, val_loader, device, num_classes)
        
        # Store metrics history
        history['train'].append({
            'loss': train_loss / len(train_loader),
            **train_metrics
        })
        history['val'].append(val_metrics)

        # Print epoch results
        print(f"\nEpoch {epoch+1}/{num_epochs}")
        print("Training Metrics:")
        print(f"Loss: {train_loss/len(train_loader):.4f}")
        for metric, value in train_metrics.items():
            print(f"{metric.capitalize()}: {value:.4f}")
        
        print("\nValidation Metrics:")
        for metric, value in val_metrics.items():
            print(f"{metric.capitalize()}: {value:.4f}")
    
    return history

In [26]:
num_epochs = 5

optimizer = AdamW(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()



In [27]:
history = train(model, train_loader, dev_loader, criterion, optimizer, device, num_epochs)

Epoch 1/5: 100%|██████████| 99/99 [00:49<00:00,  2.00it/s]



Epoch 1/5
Training Metrics:
Loss: 0.6500
Accuracy: 0.6018
Precision: 0.5973
Recall: 0.6038
F1: 0.6006
Roc_auc: 0.6582

Validation Metrics:
Accuracy: 0.7173
Precision: 0.7976
Recall: 0.5776
F1: 0.6700
Roc_auc: 0.7891


Epoch 2/5: 100%|██████████| 99/99 [00:44<00:00,  2.21it/s]



Epoch 2/5
Training Metrics:
Loss: 0.5577
Accuracy: 0.7038
Precision: 0.7071
Recall: 0.6872
F1: 0.6970
Roc_auc: 0.7850

Validation Metrics:
Accuracy: 0.7430
Precision: 0.7222
Recall: 0.7845
F1: 0.7521
Roc_auc: 0.8373


Epoch 3/5: 100%|██████████| 99/99 [00:45<00:00,  2.19it/s]



Epoch 3/5
Training Metrics:
Loss: 0.4695
Accuracy: 0.7728
Precision: 0.7672
Recall: 0.7776
F1: 0.7724
Roc_auc: 0.8574

Validation Metrics:
Accuracy: 0.7987
Precision: 0.7634
Recall: 0.8621
F1: 0.8097
Roc_auc: 0.8725


Epoch 4/5: 100%|██████████| 99/99 [00:44<00:00,  2.21it/s]



Epoch 4/5
Training Metrics:
Loss: 0.4262
Accuracy: 0.8033
Precision: 0.7987
Recall: 0.8064
F1: 0.8026
Roc_auc: 0.8859

Validation Metrics:
Accuracy: 0.7966
Precision: 0.8216
Recall: 0.7543
F1: 0.7865
Roc_auc: 0.8904


Epoch 5/5: 100%|██████████| 99/99 [00:44<00:00,  2.21it/s]



Epoch 5/5
Training Metrics:
Loss: 0.3638
Accuracy: 0.8421
Precision: 0.8440
Recall: 0.8359
F1: 0.8399
Roc_auc: 0.9178

Validation Metrics:
Accuracy: 0.8351
Precision: 0.8216
Recall: 0.8534
F1: 0.8372
Roc_auc: 0.9052


In [28]:
import pandas as pd

pd.DataFrame(history['train'])

Unnamed: 0,loss,accuracy,precision,recall,f1,roc_auc
0,0.64996,0.601843,0.597337,0.603846,0.600574,0.658179
1,0.557715,0.703845,0.707124,0.687179,0.697009,0.78498
2,0.469478,0.772799,0.767236,0.777564,0.772365,0.8574
3,0.426223,0.803305,0.79873,0.80641,0.802552,0.885862
4,0.363837,0.842072,0.844013,0.835897,0.839936,0.917802


In [29]:
pd.DataFrame(history['val'])

Unnamed: 0,accuracy,precision,recall,f1,roc_auc
0,0.717345,0.797619,0.577586,0.67,0.789105
1,0.743041,0.722222,0.784483,0.752066,0.837271
2,0.798715,0.763359,0.862069,0.809717,0.872524
3,0.796574,0.821596,0.75431,0.786517,0.890371
4,0.835118,0.821577,0.853448,0.837209,0.905154


In [30]:
evaluate(model, test_loader, device, num_classes)

{'accuracy': 0.8074356530028599,
 'precision': 0.8045977011494253,
 'recall': 0.8076923076923077,
 'f1': 0.8061420345489444,
 'roc_auc': 0.8894721535553293}