# Train a model to classify DisasterDataset

Dataset source: [nlp-getting-started](https://www.kaggle.com/competitions/nlp-getting-started)

First, import the required packages.

In [1]:
import warnings

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
from transformers import get_linear_schedule_with_warmup

from meter import AverageMeter, CumsumMeter

warnings.filterwarnings("ignore")

Compared to training with an LSTM, we only have to change the tokenizer, the ModelModule, and a few hyper-parameters.

In [2]:
class DisasterDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        df: DataFrame with a ``text`` column and, optionally, a ``target``
            column. When ``target`` is absent the dataset yields features only.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")``;
            its result must expose ``.items()``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, df, tokenizer, max_length=54):
        self.texts = df["text"].values.tolist()
        # Labels are optional so the same class can wrap an unlabeled frame.
        self.target = df["target"].values.tolist() if "target" in df.columns else None
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return ``features`` or ``(features, label)`` for position ``item``."""
        encoded = self.tokenizer(
            self.texts[item],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # squeeze(0) drops the leading dimension of each tensor when it has size 1.
        features = {name: tensor.squeeze(0) for name, tensor in encoded.items()}
        if self.target is not None:
            return features, torch.FloatTensor([self.target[item]])
        return features

In [12]:
from pooler import AttentionPooling, MeanMaxPooling, LSTMPooling


class DebertaModel(nn.Module):
    """Pretrained ``microsoft/deberta-v3-base`` backbone with an attention
    pooler, dropout, and a single-output linear classification head."""

    def __init__(self):
        super().__init__()
        self.backbone = AutoModel.from_pretrained("microsoft/deberta-v3-base")
        hidden_size = self.backbone.config.hidden_size
        # Alternative pooler kept for reference: plain mean over dim 1.
        # self.pooler = lambda x: torch.mean(x, dim=1)
        self.pooler = AttentionPooling(hidden_size=hidden_size)
        self.dropout = nn.Dropout(0.1)
        self.classification_head = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Compute logits for ``x``, which is unpacked as keyword arguments
        into the backbone."""
        hidden_states = self.backbone(**x).last_hidden_state
        pooled = self.dropout(self.pooler(hidden_states))
        return self.classification_head(pooled)

In [4]:
def train_one_epoch(model, train_loader, optimizer, scheduler, criterion, epoch, threshold=0.42966):
    """Run one training pass over ``train_loader``, tracking running loss and F1.

    Args:
        model: Module called as ``model(x)`` to produce logits.
        train_loader: Sized iterable yielding ``(x, y)`` batches.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, right
            after the optimizer.
        criterion: Loss called as ``criterion(logits, y)``.
        epoch: Epoch index; only used to label the progress bar.
        threshold: Cut-off applied to ``sigmoid(logits)`` when binarizing
            predictions for the F1 score. The default preserves the value
            that used to be hard-coded here (presumably chosen to match the
            positive-class share of the training data — confirm).

    Returns:
        ``(batch_losses.avg, batch_score.score)`` as reported by the loss
        meter and the F1 meter after the last batch.
    """
    model.train()
    progress_bar = tqdm(train_loader, total=len(train_loader))
    optimizer.zero_grad()
    batch_losses = AverageMeter()
    batch_score = CumsumMeter(metrics.f1_score)
    progress_bar.set_description(f"epoch {epoch}")
    for x, y in progress_bar:
        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        scheduler.step()
        batch_losses.update(loss.item(), n=y.shape[0])
        # Binarize detached probabilities so metric bookkeeping stays out of autograd.
        probabilities = logits.detach().sigmoid().cpu().numpy()
        predictions = (probabilities > threshold).astype(int)
        batch_score.update(y.detach().cpu().numpy(), predictions)
        progress_bar.set_postfix({"loss": batch_losses.avg, "f1": batch_score.score})
        # Clear gradients so the next batch does not accumulate onto this one.
        optimizer.zero_grad()
    return batch_losses.avg, batch_score.score


@torch.no_grad()
def evaluate(model, val_loader, criterion, threshold=0.42966):
    """Score ``model`` on ``val_loader`` without computing gradients.

    Args:
        model: Module called as ``model(x)`` to produce logits; switched to
            eval mode here.
        val_loader: Sized iterable yielding ``(x, y)`` batches.
        criterion: Loss called as ``criterion(logits, y)``.
        threshold: Cut-off applied to ``sigmoid(logits)`` when binarizing
            predictions for the F1 score. The default preserves the value
            that used to be hard-coded here.

    Returns:
        ``(batch_losses.avg, batch_score.score)`` as reported by the loss
        meter and the F1 meter after the last batch.
    """
    model.eval()
    batch_losses = AverageMeter()
    batch_score = CumsumMeter(metrics.f1_score)
    progress_bar = tqdm(val_loader, total=len(val_loader))
    for x, y in progress_bar:
        logits = model(x)
        loss = criterion(logits, y)
        batch_losses.update(loss.item(), n=y.shape[0])
        probabilities = logits.detach().sigmoid().cpu().numpy()
        predictions = (probabilities > threshold).astype(int)
        batch_score.update(y.detach().cpu().numpy(), predictions)
        progress_bar.set_postfix({"loss": batch_losses.avg, "f1": batch_score.score})
    return batch_losses.avg, batch_score.score

In [5]:
# Load the labeled training data, print the normalized class frequencies,
# and preview the first three rows.
df = pd.read_csv("./train.csv")
print(df["target"].value_counts(normalize=True))
df.head(3)

0    0.57034
1    0.42966
Name: target, dtype: float64


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1


In [6]:
# Hold out 20% of the rows for validation, stratified on the label with a
# fixed seed, then print the class frequencies of each split.
df_train, df_val = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["target"],
)
print(df_train["target"].value_counts(normalize=True))
print(df_val["target"].value_counts(normalize=True))

0    0.570279
1    0.429721
Name: target, dtype: float64
0    0.570584
1    0.429416
Name: target, dtype: float64


In [7]:
# One tokenizer shared by both splits, loaded from the same checkpoint name
# that DebertaModel uses for its backbone.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
train_set = DisasterDataset(df_train, tokenizer=tokenizer)
val_set = DisasterDataset(df_val, tokenizer=tokenizer)

train_loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True)
# Evaluation does not need shuffling; a fixed order keeps validation passes
# repeatable. The batch is doubled relative to training.
val_loader = torch.utils.data.DataLoader(val_set, batch_size=32 * 2, shuffle=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# Hyper-parameters for fine-tuning.
epochs = 5
learning_rate = 2e-5
# The scheduler is stepped once per batch, so it spans every batch of every epoch.
total_steps = len(train_loader) * epochs

model = DebertaModel()
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    weight_decay=1e-2,
)
# Linear schedule with zero warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# NOTE(review): the two numbers below were left without context — presumably
# scores noted from earlier runs; confirm or remove.
# 0.697
# 0.825
for epoch in range(epochs):
    # Train for one pass, then score the validation split.
    train_one_epoch(
        model=model,
        train_loader=train_loader,
        optimizer=optimizer,
        scheduler=scheduler,
        criterion=criterion,
        epoch=epoch,
    )
    _ = evaluate(model=model, val_loader=val_loader, criterion=criterion)

epoch 0: 100%|█████████████████████████████████████████████████████████████████████████| 191/191 [14:24<00:00,  4.52s/it, loss=0.462, f1=0.745]
100%|█████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:36<00:00,  1.53s/it, loss=0.374, f1=0.82]
epoch 1:   5%|███▊                                                                      | 10/191 [00:49<14:54,  4.94s/it, loss=0.331, f1=0.845]


KeyboardInterrupt: 