## Data Ingestion

In [1]:
import pandas as pd

column_names = ['id', 'text', 'label']
twitter_15_train = pd.read_csv("/kaggle/input/twitter15-16/twitter15.train", sep="\t", 
                               header=None, names=column_names)
twitter_15_test = pd.read_csv("/kaggle/input/twitter15-16/twitter15.test", sep="\t", 
                              header=None, names=column_names)
twitter_15_dev = pd.read_csv("/kaggle/input/twitter15-16/twitter15.dev", sep="\t", 
                             header=None, names=column_names)

In [2]:
twitter_15_train.head()

Unnamed: 0,id,text,label
0,724703995147751424,"american family association gets 500,000 to si...",unverified
1,358591089462099968,this week's top story: george zimmerman wins f...,false
2,775672628493357057,clinton hides failing health? full disclosure ...,unverified
3,364589696573124609,fukushima: highly radioactive water seeping in...,false
4,549927969032916993,a transgender 17-year old left a suicide note ...,unverified


In [3]:
twitter_15_train.shape, twitter_15_test.shape, twitter_15_dev.shape

((1005, 3), (336, 3), (149, 3))

In [4]:
twitter_16_train = pd.read_csv("/kaggle/input/twitter15-16/twitter16.train", sep="\t", 
                               header=None, names=column_names)
twitter_16_test = pd.read_csv("/kaggle/input/twitter15-16/twitter16.test", sep="\t", 
                              header=None, names=column_names)
twitter_16_dev = pd.read_csv("/kaggle/input/twitter15-16/twitter16.dev", sep="\t", 
                             header=None, names=column_names)

In [5]:
twitter_16_train.head()

Unnamed: 0,id,text,label
0,692929779696275456,ohio lawmakers want to know why state’s epa di...,non-rumor
1,693858804279201794,poor women in india are fighting for the right...,non-rumor
2,693648684857323521,spoiler alert: leo and kate were ridiculously ...,non-rumor
3,620367840902782976,translucent butterfly - beautiful! ' URL,false
4,693939356390653952,michael oher got a text from cam newton during...,non-rumor


In [6]:
twitter_16_train.shape, twitter_16_test.shape, twitter_16_dev.shape

((552, 3), (184, 3), (82, 3))

In [7]:
twitter_train = pd.concat([twitter_15_train, twitter_16_train], ignore_index=True, axis=0)
twitter_test = pd.concat([twitter_15_test, twitter_16_test], ignore_index=True, axis=0)
twitter_dev = pd.concat([twitter_15_dev, twitter_15_test], ignore_index=True, axis=0)

In [8]:
twitter_train.shape, twitter_test.shape, twitter_dev.shape

((1557, 3), (520, 3), (485, 3))

## Data Preparation

In [9]:
LABELS = twitter_train['label'].unique().tolist()
train_label = twitter_train['label'].replace(LABELS, [0, 1, 2, 3]).tolist()

  train_label = twitter_train['label'].replace(LABELS, [0, 1, 2, 3]).tolist()


In [10]:
train_data = twitter_train['text'].tolist()

In [11]:
dev_data = twitter_dev['text'].tolist()
dev_label = twitter_dev['label'].replace(LABELS, [0, 1, 2, 3]).tolist()

  dev_label = twitter_dev['label'].replace(LABELS, [0, 1, 2, 3]).tolist()


In [12]:
test_data = twitter_test['text'].tolist()
test_label = twitter_test['label'].replace(LABELS, [0, 1, 2, 3]).tolist()

  test_label = twitter_test['label'].replace(LABELS, [0, 1, 2, 3]).tolist()


In [13]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

class ClassificationDataset(Dataset):
    def __init__(self, texts: list[str], labels: list[int], tokenizer, max_length: int):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer.encode_plus(
            text, 
            add_special_tokens=True, 
            max_length=self.max_length, 
            return_token_type_ids=False, 
            padding="max_length",
            truncation=True, 
            return_attention_mask=True, 
            return_tensors="pt"
        )
        
        return {
            "input_ids": encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [14]:
from transformers import RobertaTokenizer, RobertaModel, AdamW

max_length = 128

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_dataset = ClassificationDataset(train_data, train_label, tokenizer, max_length)
dev_dataset = ClassificationDataset(dev_data, dev_label, tokenizer, max_length)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



In [15]:
batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size)

In [16]:
test_dataset = ClassificationDataset(test_data, test_label, tokenizer, max_length)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

## Model Preparation

In [17]:
num_classes = 4

In [47]:
class RobertaClassifier(nn.Module):
    def __init__(self, roberta_model, num_classes, dropout=0.1):
        super(RobertaClassifier, self).__init__()
        self.roberta = roberta_model
        self.dropout = nn.Dropout(dropout)
        self.ffn = nn.Sequential(
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.ffn(x)
        return logits

In [56]:
roberta_model = RobertaModel.from_pretrained('roberta-base')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [57]:
model = RobertaClassifier(roberta_model, num_classes)

In [58]:
learning_rate = 2e-5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

RobertaClassifier(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

In [59]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

def calculate_metrics(y_true, y_pred, y_pred_proba, num_classes):
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='weighted'),
        'recall': recall_score(y_true, y_pred, average='weighted'),
        'f1': f1_score(y_true, y_pred, average='weighted')
    }
    
    # Calculate ROC-AUC score
    if num_classes == 2:
        metrics['roc_auc'] = roc_auc_score(y_true, y_pred_proba[:, 1])
    else:
        try:
            metrics['roc_auc'] = roc_auc_score(y_true, y_pred_proba, multi_class='ovr', average='weighted')
        except:
            metrics['roc_auc'] = None
            
    return metrics

In [60]:
def evaluate(model, data_loader, device, num_classes):
    model.eval()
    all_labels = []
    all_predictions = []
    all_predictions_proba = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            probabilities = torch.softmax(outputs, dim=1)
            _, predicted = torch.max(outputs, 1)
            
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())
            all_predictions_proba.extend(probabilities.cpu().numpy())

    all_labels = np.array(all_labels)
    all_predictions = np.array(all_predictions)
    all_predictions_proba = np.array(all_predictions_proba)

    return calculate_metrics(all_labels, all_predictions, all_predictions_proba, num_classes)

In [61]:
def train(model, train_loader, val_loader, criterion, optimizer, device, num_epochs):
    best_val_metrics = {'f1': 0.0}
    history = {'train': [], 'val': []}

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        all_train_labels = []
        all_train_predictions = []
        all_train_predictions_proba = []

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            probabilities = torch.softmax(outputs, dim=1)
            _, predicted = torch.max(outputs, 1)
            
            all_train_labels.extend(labels.cpu().numpy())
            all_train_predictions.extend(predicted.cpu().numpy())
            all_train_predictions_proba.extend(probabilities.cpu().detach().numpy())

        # Calculate training metrics
        all_train_labels = np.array(all_train_labels)
        all_train_predictions = np.array(all_train_predictions)
        all_train_predictions_proba = np.array(all_train_predictions_proba)
        train_metrics = calculate_metrics(all_train_labels, all_train_predictions, 
                                       all_train_predictions_proba, num_classes)
        
        # Validation phase
        val_metrics = evaluate(model, val_loader, device, num_classes)
        
        # Store metrics history
        history['train'].append({
            'loss': train_loss / len(train_loader),
            **train_metrics
        })
        history['val'].append(val_metrics)

        # Print epoch results
        print(f"\nEpoch {epoch+1}/{num_epochs}")
        print("Training Metrics:")
        print(f"Loss: {train_loss/len(train_loader):.4f}")
        for metric, value in train_metrics.items():
            print(f"{metric.capitalize()}: {value:.4f}")
        
        print("\nValidation Metrics:")
        for metric, value in val_metrics.items():
            print(f"{metric.capitalize()}: {value:.4f}")

        # Save best model based on validation F1 score
        if val_metrics['f1'] > best_val_metrics['f1']:
            best_val_metrics = val_metrics
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'metrics': val_metrics,
            }, 'best_model.pth')

    print("\nBest Validation Metrics:")
    for metric, value in best_val_metrics.items():
        print(f"{metric.capitalize()}: {value:.4f}")
    
    return history

In [62]:
num_epochs = 5

optimizer = AdamW(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()



In [63]:
history = train(model, train_loader, dev_loader, criterion, optimizer, device, num_epochs)

Epoch 1/5: 100%|██████████| 49/49 [00:18<00:00,  2.71it/s]
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 1/5
Training Metrics:
Loss: 1.3867
Accuracy: 0.2511
Precision: 0.2470
Recall: 0.2511
F1: 0.2240
Roc_auc: 0.5087

Validation Metrics:
Accuracy: 0.3320
Precision: 0.2468
Recall: 0.3320
F1: 0.2414
Roc_auc: 0.6852


Epoch 2/5: 100%|██████████| 49/49 [00:18<00:00,  2.71it/s]



Epoch 2/5
Training Metrics:
Loss: 1.2193
Accuracy: 0.4868
Precision: 0.4890
Recall: 0.4868
F1: 0.4760
Roc_auc: 0.7243

Validation Metrics:
Accuracy: 0.5629
Precision: 0.5588
Recall: 0.5629
F1: 0.5449
Roc_auc: 0.8184


Epoch 3/5: 100%|██████████| 49/49 [00:18<00:00,  2.71it/s]



Epoch 3/5
Training Metrics:
Loss: 0.8319
Accuracy: 0.7065
Precision: 0.7050
Recall: 0.7065
F1: 0.7054
Roc_auc: 0.8896

Validation Metrics:
Accuracy: 0.7505
Precision: 0.7581
Recall: 0.7505
F1: 0.7508
Roc_auc: 0.9117


Epoch 4/5: 100%|██████████| 49/49 [00:18<00:00,  2.71it/s]



Epoch 4/5
Training Metrics:
Loss: 0.4943
Accuracy: 0.8504
Precision: 0.8512
Recall: 0.8504
F1: 0.8504
Roc_auc: 0.9590

Validation Metrics:
Accuracy: 0.8082
Precision: 0.8126
Recall: 0.8082
F1: 0.8067
Roc_auc: 0.9365


Epoch 5/5: 100%|██████████| 49/49 [00:18<00:00,  2.71it/s]



Epoch 5/5
Training Metrics:
Loss: 0.2782
Accuracy: 0.9242
Precision: 0.9248
Recall: 0.9242
F1: 0.9241
Roc_auc: 0.9861

Validation Metrics:
Accuracy: 0.8000
Precision: 0.8128
Recall: 0.8000
F1: 0.8021
Roc_auc: 0.9347

Best Validation Metrics:
Accuracy: 0.8082
Precision: 0.8126
Recall: 0.8082
F1: 0.8067
Roc_auc: 0.9365


In [67]:
import pandas as pd

pd.DataFrame(history['train'])

Unnamed: 0,loss,accuracy,precision,recall,f1,roc_auc
0,1.386712,0.251124,0.247015,0.251124,0.224048,0.508703
1,1.219316,0.486834,0.48896,0.486834,0.476036,0.724349
2,0.831911,0.706487,0.704976,0.706487,0.705443,0.889604
3,0.494267,0.850353,0.851232,0.850353,0.850406,0.959001
4,0.278219,0.924213,0.92476,0.924213,0.92405,0.986081


In [68]:
pd.DataFrame(history['val'])

Unnamed: 0,accuracy,precision,recall,f1,roc_auc
0,0.331959,0.246793,0.331959,0.241392,0.685222
1,0.562887,0.558839,0.562887,0.544853,0.818406
2,0.750515,0.758122,0.750515,0.750767,0.911731
3,0.808247,0.812628,0.808247,0.806735,0.936492
4,0.8,0.812793,0.8,0.802088,0.934678


In [69]:
evaluate(model, test_loader, device, num_classes)

{'accuracy': 0.8057692307692308,
 'precision': 0.8156962203710432,
 'recall': 0.8057692307692308,
 'f1': 0.8076031801831552,
 'roc_auc': 0.9385079723586033}