# Train a model to classify DisasterDataset

dataset source: [nlp-getting-started](https://www.kaggle.com/competitions/nlp-getting-started)

First, import the required packages.

In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import get_linear_schedule_with_warmup

from meter import AverageMeter, CumsumMeter

Before we start developing the LSTM model, let's clarify the steps of coding with `pytorch`.

STEP:
1. Load the data, create a CV (cross-validation) split, and tokenize the text into word ids.
2. Create `Dataset` class to load the data, and then make dataloader
3. Create Model class with `nn.Module`
4. Load pretrained embeddings
5. Create model training function
6. Create model instance, optimizer, scheduler, loss_function, ...
7. Train the model with data!

Having clarified how we train a model, let's create some helper classes and functions.

`Tokenizer`: tokenizer for text.

`DisasterDataset`: `pytorch dataset` that is used to load the data

`load_embeddings`: load pretrained embeddings

`LSTMModel`: `pytorch model`, defines how the model does forward propagation

`train_one_epoch`: runs one training epoch over the data loader and returns the average loss and the F1 score

In [2]:
class Tokenizer(object):
    """Whitespace tokenizer that maps words to integer ids.

    Id 0 is reserved for the ``"<pad>"`` token. Real words receive ids
    starting at 1, assigned in sorted order when `fit` is called.
    """

    def __init__(self):
        self.vocab = []
        self.word2id = {}
        self.id2word = {}

    def __len__(self):
        """Return the number of entries in ``word2id`` (pad token included after `fit`)."""
        return len(self.word2id)

    @staticmethod
    def _text2word(text):
        # Plain `str.split` keeps the preprocessing simple and dependency-free;
        # override this method to tokenize more accurately.
        return text.split()

    def fit(self, texts):
        """Build the vocabulary and both lookup tables from an iterable of texts."""
        seen = set()
        for text in texts:
            seen.update(self._text2word(text))
        ordered = sorted(seen)
        self.vocab = ordered
        self.word2id = {word: idx for idx, word in enumerate(ordered, start=1)}
        self.id2word = {idx: word for idx, word in enumerate(ordered, start=1)}
        # Reserve id 0 for the padding token.
        self.word2id["<pad>"] = 0
        self.id2word[0] = "<pad>"

    def tokenize(self, text, max_length=54, padding=False, padding_idx=0):
        """Convert ``text`` to a list of word ids.

        Words missing from the vocabulary are dropped. The result is cut to
        ``max_length`` ids and, when ``padding`` is true, right-padded with
        ``padding_idx`` up to ``max_length``.
        """
        known_ids = []
        for word in self._text2word(text):
            if word in self.word2id:
                known_ids.append(self.word2id[word])
        clipped = known_ids[:max_length]
        if not padding:
            return clipped
        missing = max_length - len(clipped)
        return clipped + [padding_idx] * missing

    def decode(self, word_ids):
        """Join the words for ``word_ids`` with spaces, skipping unknown ids."""
        return " ".join(
            self.id2word[word_id] for word_id in word_ids if word_id in self.id2word
        )


class DisasterDataset(torch.utils.data.Dataset):
    """Dataset that yields padded word-id tensors, plus targets when available.

    ``df`` must have a ``text`` column; a ``target`` column is optional. The
    tokenizer must provide ``tokenize(text, max_length=..., padding=...)``.
    """

    def __init__(self, df, tokenizer, max_length=54):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = df["text"].values.tolist()
        # Frames without labels (e.g. a test set) leave `target` as None.
        has_target = "target" in df.columns
        self.target = df["target"].values.tolist() if has_target else None

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return ``x`` or ``(x, y)``: word ids as a LongTensor, target as a 1-element FloatTensor."""
        ids = self.tokenizer.tokenize(
            self.texts[item], max_length=self.max_length, padding=True
        )
        features = torch.LongTensor(ids)
        if self.target is not None:
            label = torch.FloatTensor([self.target[item]])
            return features, label
        return features

In [3]:
def load_embeddings(embedding_file, word2id):
    """Load pretrained word vectors for the words in ``word2id``.

    The file is read as GloVe-style text: one entry per line, the word first,
    then its vector components, separated by spaces. The vector size is parsed
    from the file name, whose second-to-last dot-separated part must look like
    ``<dim>d`` (e.g. ``glove.6B.300d.txt`` -> 300).

    Args:
        embedding_file: Path to the embedding text file.
        word2id: Mapping from word to row index in the returned matrix.

    Returns:
        A ``(len(word2id), dim)`` float array. Rows of words that never occur
        in the file stay all-zero.

    Raises:
        ValueError, IndexError: If the file name does not follow the
            ``<dim>d`` naming convention.
    """
    # embeddings = np.random.randn(len(word2id), 300)
    # "glove.6B.300d.txt" -> "300d" -> 300
    dim = int(embedding_file.split("/")[-1].split(".")[-2][:-1])
    embeddings = np.zeros((len(word2id), dim))
    # GloVe text files are UTF-8; do not rely on the platform default encoding.
    with open(embedding_file, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Skip blank lines instead of treating the first one as
                # end-of-file, which silently dropped every later vector.
                continue
            word, vec = line.split(" ", 1)
            if word not in word2id:
                continue
            # `split()` without arguments tolerates repeated spaces.
            embeddings[word2id[word]] = np.array([float(i) for i in vec.split()])
    return embeddings

In [4]:
class LSTMModel(nn.Module):
    """Binary text classifier: embedding -> BiLSTM -> BiGRU -> mean pool -> linear.

    Despite the name, the recurrent stack is a bidirectional LSTM followed by
    a bidirectional GRU. `forward` returns one raw logit per input row.

    Args:
        n_vocab: Number of rows in the embedding table (id 0 is the padding id).
        embedding_dim: Embedding width. Ignored when ``pretrain_embeddings`` is
            given; the width of that matrix is used instead.
        lstm_dim: Nominal output width of the bidirectional LSTM (each
            direction gets ``lstm_dim // 2`` units) and the per-direction
            hidden size of the GRU.
        pretrain_embeddings: Optional 2-D array of pretrained vectors. When
            given, it replaces the embedding weights and the embedding layer
            is frozen.
    """

    def __init__(self, n_vocab, embedding_dim, lstm_dim=256, pretrain_embeddings=None):
        super().__init__()
        if pretrain_embeddings is not None:
            embedding_dim = pretrain_embeddings.shape[1]
        self.embedding = nn.Embedding(num_embeddings=n_vocab, embedding_dim=embedding_dim, padding_idx=0)
        if pretrain_embeddings is not None:
            self.embedding.weight.data = torch.FloatTensor(pretrain_embeddings)
            self.embedding.requires_grad_(False)
        lstm_hidden = lstm_dim // 2
        self.lstm = nn.LSTM(embedding_dim, lstm_hidden, bidirectional=True, batch_first=True)
        # The BiLSTM emits 2 * lstm_hidden features, which equals lstm_dim only
        # for even lstm_dim. Sizing the GRU input from the real width keeps
        # even values unchanged and makes odd values work instead of crashing.
        self.gru = nn.GRU(2 * lstm_hidden, lstm_dim, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.classification_head = nn.Linear(lstm_dim * 2, 1)

    def pooler(self, x):
        """Average ``x`` over dim 1 (the sequence axis, given ``batch_first=True``).

        Defined as a method rather than a lambda attribute so the module can
        be pickled (e.g. ``torch.save(model)``). Padded positions are included
        in the mean.
        """
        return torch.mean(x, dim=1)

    def forward(self, x):
        """Map a batch of word-id sequences to logits of shape ``(batch, 1)``."""
        embed = self.embedding(x)
        h, _ = self.lstm(embed)
        h, _ = self.gru(h)
        feature = self.pooler(h)
        feature = self.dropout(feature)
        logits = self.classification_head(feature)
        return logits

In [5]:
def train_one_epoch(model, train_loader, optimizer, scheduler, criterion, epoch, threshold=0.42966):
    """Train ``model`` for one pass over ``train_loader``.

    The optimizer and the scheduler are both stepped once per batch.

    Args:
        model: Module called as ``model(x)`` to produce logits.
        train_loader: Iterable of ``(x, y)`` batches that supports ``len()``.
        optimizer: Optimizer over the model parameters.
        scheduler: Learning-rate scheduler, stepped after every optimizer step.
        criterion: Loss called as ``criterion(logits, y)``.
        epoch: Epoch number, used only for the progress-bar label.
        threshold: Probability cut-off applied to ``sigmoid(logits)`` to get
            0/1 predictions for the F1 score. The default keeps the value that
            was previously hard-coded; it appears to be the positive-class
            share of the training data.

    Returns:
        ``(batch_losses.avg, batch_score.score)`` — presumably the
        sample-weighted mean loss and the cumulative F1; confirm against the
        ``meter`` module.
    """
    model.train()
    progress_bar = tqdm(train_loader, total=len(train_loader))
    progress_bar.set_description(f"epoch {epoch}")
    batch_losses = AverageMeter()
    batch_score = CumsumMeter(metrics.f1_score)
    optimizer.zero_grad()
    for x, y in progress_bar:
        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        scheduler.step()
        batch_losses.update(loss.item(), n=y.shape[0])
        predictions = (logits.detach().sigmoid().cpu().numpy() > threshold).astype(int)
        batch_score.update(y.detach().cpu().numpy(), predictions)
        progress_bar.set_postfix({"loss": batch_losses.avg, "f1": batch_score.score})
        # Clear gradients so the next batch starts from zero.
        optimizer.zero_grad()
    return batch_losses.avg, batch_score.score


@torch.no_grad()
def evaluate(model, val_loader, criterion, threshold=0.42966):
    """Compute loss and F1 of ``model`` over ``val_loader`` without gradients.

    Args:
        model: Module called as ``model(x)`` to produce logits; it is switched
            to eval mode and left in that mode.
        val_loader: Iterable of ``(x, y)`` batches that supports ``len()``.
        criterion: Loss called as ``criterion(logits, y)``.
        threshold: Probability cut-off applied to ``sigmoid(logits)`` to get
            0/1 predictions for the F1 score. The default keeps the value that
            was previously hard-coded.

    Returns:
        ``(batch_losses.avg, batch_score.score)`` — presumably the
        sample-weighted mean loss and the cumulative F1; confirm against the
        ``meter`` module.
    """
    model.eval()
    batch_losses = AverageMeter()
    batch_score = CumsumMeter(metrics.f1_score)
    progress_bar = tqdm(val_loader, total=len(val_loader))
    for x, y in progress_bar:
        logits = model(x)
        loss = criterion(logits, y)
        batch_losses.update(loss.item(), n=y.shape[0])
        # No `.detach()` needed: nothing here tracks gradients under `no_grad`.
        predictions = (logits.sigmoid().cpu().numpy() > threshold).astype(int)
        batch_score.update(y.cpu().numpy(), predictions)
        progress_bar.set_postfix({"loss": batch_losses.avg, "f1": batch_score.score})
    return batch_losses.avg, batch_score.score

In [6]:
# Load the competition training data and print the class balance of `target`.
df = pd.read_csv("./train.csv")
print(df.target.value_counts(normalize=True))
# Preview the first three rows (rendered as the cell output in a notebook).
df.head(3)

0    0.57034
1    0.42966
Name: target, dtype: float64


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1


In [7]:
# Build the vocabulary from every text in the CSV. This runs before the
# train/validation split, so words from validation rows are in the vocabulary too.
tokenizer = Tokenizer()
tokenizer.fit(df.text.to_list())

In [8]:
# Hold out 20% of the rows for validation; stratify on `target` so both parts
# keep the same class balance, and fix the seed so the split is reproducible.
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42, stratify=df.target)
# Print the class balance of each part to confirm the stratification.
print(df_train.target.value_counts(normalize=True))
print(df_val.target.value_counts(normalize=True))

0    0.570279
1    0.429721
Name: target, dtype: float64
0    0.570584
1    0.429416
Name: target, dtype: float64


In [9]:
# Wrap both splits in datasets that share the fitted tokenizer
# (default max_length of DisasterDataset is used).
train_set = DisasterDataset(df_train, tokenizer=tokenizer)
val_set = DisasterDataset(df_val, tokenizer=tokenizer)

# Validation uses a batch twice as large as training.
# NOTE(review): the validation loader is shuffled as well; order should not
# matter for aggregated metrics, so shuffle=False is probably enough — confirm.
train_loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_set, batch_size=32 * 2, shuffle=True)

In [17]:
# Training hyperparameters.
epochs = 10
learning_rate = 3e-4
# One scheduler step per batch, so the schedule spans batches-per-epoch * epochs.
total_steps = len(train_loader) * epochs
# NOTE(review): absolute, machine-specific path; adjust it to where the GloVe file lives.
embeddings = load_embeddings("/Users/heyao/learn_from_datasets/embedding/glove.6B.300d.txt", tokenizer.word2id)
# `embedding_dim=128` has no effect here: LSTMModel takes the width from
# `pretrain_embeddings` whenever that argument is given.
model = LSTMModel(len(tokenizer), embedding_dim=128, lstm_dim=128, pretrain_embeddings=embeddings)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-4)
# Linear learning-rate schedule over `total_steps` with no warmup steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
# The model outputs raw logits, so the loss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()

In [18]:
# Scores noted from earlier runs (presumably validation F1 — TODO confirm):
# 0.697
# 0.706
# Train for `epochs` epochs, evaluating on the validation set after each one.
# The returned metrics are discarded; progress is read from the tqdm bars.
for epoch in range(epochs):
    train_one_epoch(model, train_loader, optimizer, scheduler, criterion, epoch)
    _ = evaluate(model, val_loader, criterion)

epoch 0: 100%|█████████████████████████████████████████████████████████████████████████| 191/191 [00:08<00:00, 21.58it/s, loss=0.611, f1=0.605]
100%|████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 31.04it/s, loss=0.533, f1=0.669]
epoch 1: 100%|█████████████████████████████████████████████████████████████████████████| 191/191 [00:09<00:00, 21.11it/s, loss=0.528, f1=0.671]
100%|████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 29.67it/s, loss=0.517, f1=0.689]
epoch 2: 100%|██████████████████████████████████████████████████████████████████████████| 191/191 [00:08<00:00, 21.29it/s, loss=0.505, f1=0.69]
100%|████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 30.40it/s, loss=0.517, f1=0.701]
epoch 3: 100%|█████████████████████████████████████████████████████████████████████████| 191/191 [00:09<00:00, 20.78it/s, loss=0.494, f1

In [12]:
# Shape of the embedding rows whose components do not sum to zero — a rough
# count of vocabulary words that were found in the pretrained file (rows of
# missing words stay all-zero).
embeddings[embeddings.sum(axis=1) != 0].shape

(7547, 200)