## Import Libraries 

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import copy
import functools
from cachetools import cached, TTLCache
from tqdm import tqdm

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Read the train and test files 

In [3]:
df_train = pd.read_csv('train.csv')
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


### Split df_train into train and test split

In [5]:
from sklearn.model_selection import train_test_split

train_split, test_split = train_test_split(df_train[['text', 'target']], test_size = 0.25, stratify = df_train['target'], random_state = 42)

In [6]:
train_split.reset_index(inplace = True)
test_split.reset_index(inplace = True)

In [7]:
# Compute the class weigths since its an imbalanced dataset
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight(class_weight = 'balanced', classes = [0, 1], y = df_train['target'])
class_weights = torch.tensor(class_weights, dtype = torch.float)
class_weights

tensor([0.8767, 1.1637])

### Load 'bert-base-uncased' model and tokenizer

In [8]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Output of tokenizer - 'input_ids', 'token_type_ids' and 'attention_mask'
text = train_split['text'][0]
text = tokenizer(text, padding = 'max_length', max_length = 100, truncation = True, return_tensors = 'pt')
print(text)

{'input_ids': tensor([[  101,  7867,  3764,  2015, 16360,  2005,  1996,  9680,  6591,  1012,
         11693,  2000, 11234,  1996,  2060,  2305,  2043,  7171,  8369,  2041,
          2012, 28076,  2347,  1005,  1056,  1037,  2204,  4135,  6559,  1012,
          2770,  1999,  6634,  2053,  4569,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0

### Create PyTorch dataset to return the model inputs and targets 

In [10]:
cache = TTLCache(maxsize = 8000, ttl = 86400)

class TwitterDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.df)
    
    @cached(cache)
    def __getitem__(self, idx):
        text = self.df['text'][idx].lower()
        #text = ' '.join(text.split())
        target = self.df['target'][idx]
        
        text = tokenizer(text, padding = 'max_length', max_length = 100, truncation = True, return_tensors = 'pt')
        ids = text['input_ids'].squeeze(0)
        token_type_ids = text['token_type_ids'].squeeze(0)
        masks = text['attention_mask'].squeeze(0)
        
        return {
            'ids': ids,
            'token_type_ids': token_type_ids,
            'mask': masks,
            'targets': torch.tensor(target, dtype = torch.long)
        }

In [11]:
# Create train and val datasets
train_set = TwitterDataset(train_split, tokenizer)
test_set = TwitterDataset(test_split, tokenizer)

In [12]:
# Create train and val dataloaders
train_dataloader = DataLoader(train_set, batch_size = 8, shuffle = True, num_workers = 0)
val_dataloader = DataLoader(test_set, batch_size = 8, shuffle = False, num_workers = 0)

### Create BERT model class by subclassing nn.Module

In [13]:
class BERTModel(nn.Module):
    def __init__(self, model):
        super(BERTModel, self).__init__()
        self.model = model
        self.fc = nn.Linear(768, 2)
        
    def forward(self, ids, token_type_ids, masks):
        out = self.model(ids, token_type_ids, masks)[1]
        out = self.fc(out)
        return out

In [14]:
BERT = BERTModel(model)

### Loss, Optimizer and Learning rate scheduler  

In [15]:
criterion = nn.CrossEntropyLoss(weight = class_weights.to(device))

optimizer = optim.AdamW(BERT.parameters(), lr = 3e-5)

scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor = 0.1, patience = 3, mode = 'min', verbose = True)

### Model Training 

In [16]:
# Function to train on train split and validate on val split

def train_model(model, criterion, optimizer, scheduler, num_epochs):
    
    train_losses, val_losses = [], []
    best_model_wts = copy.deepcopy(model.state_dict())
    best_f1 = 0

    for epoch in range(1, num_epochs + 1):

        print(f'Epoch {epoch}/{num_epochs}')

        train_loss, val_loss = [], []
        y_preds, y_true = [], []

        # Train
        train_loop = tqdm(train_dataloader, total = len(train_dataloader), position = 0, leave = True)

        model.train()
        for data in train_loop:
            ids = data['ids'].to(device)
            masks = data['mask'].to(device)
            token_type_ids = data['token_type_ids'].to(device)
            targets = data['targets'].to(device)

            optimizer.zero_grad()

            output = model(ids, token_type_ids, masks)
            preds = torch.argmax(output, dim = 1).tolist()
            loss = criterion(output, targets)

            train_loss.append(loss.item())
            y_preds.extend(preds)
            y_true.extend(targets.detach().cpu().tolist())

            loss.backward()
            optimizer.step()

            train_loop.set_postfix(train_loss = np.mean(train_loss), train_f1 = f1_score(y_preds, y_true))

        train_loss = np.mean(train_loss)

        train_losses.append(train_loss)

        # Validation
        val_loop = tqdm(val_dataloader, total = len(val_dataloader), position = 0, leave = True)

        y_preds, y_true = [], []

        model.eval()
        with torch.no_grad():
            for data in val_loop:
                ids = data['ids'].to(device)
                masks = data['mask'].to(device)
                token_type_ids = data['token_type_ids'].to(device)
                targets = data['targets'].to(device)

                output = model(ids, token_type_ids, masks)
                preds = torch.argmax(output, dim = 1).tolist()
                loss = criterion(output, targets)

                val_loss.append(loss.item())
                y_preds.extend(preds)
                y_true.extend(targets.detach().cpu().tolist())

                val_loop.set_postfix(val_loss = np.mean(val_loss), val_f1 = f1_score(y_preds, y_true))

            val_loss = np.mean(val_loss)
            val_f1 = f1_score(y_preds, y_true)

            val_losses.append(val_loss)

            scheduler.step(val_loss)

        # Save weights of model having best validation metric
        if val_f1 > best_f1:
            best_f1 = val_f1
            best_model_wts = copy.deepcopy(model.state_dict())
        print('------------------------------------------------------------------\n')

    print(f'Training complete, Best F1 score : {best_f1 * 100:.2f}%')
    model.load_state_dict(best_model_wts)

    return model, train_losses, val_losses

In [17]:
# With 3 epochs we get highest F1-score of 81.96% on validation dataset
model, train_losses, val_losses = train_model(BERT.to(device), criterion, optimizer, scheduler, 3)

Epoch 1/3


100%|██████████████████████████████████████████████| 714/714 [05:49<00:00,  2.04it/s, train_f1=0.757, train_loss=0.457]
100%|███████████████████████████████████████████████████| 238/238 [00:35<00:00,  6.62it/s, val_f1=0.82, val_loss=0.391]


------------------------------------------------------------------

Epoch 2/3


100%|██████████████████████████████████████████████| 714/714 [05:48<00:00,  2.05it/s, train_f1=0.847, train_loss=0.328]
100%|██████████████████████████████████████████████████| 238/238 [00:34<00:00,  6.89it/s, val_f1=0.804, val_loss=0.408]


------------------------------------------------------------------

Epoch 3/3


100%|███████████████████████████████████████████████| 714/714 [05:48<00:00,  2.05it/s, train_f1=0.91, train_loss=0.217]
100%|██████████████████████████████████████████████████| 238/238 [00:35<00:00,  6.79it/s, val_f1=0.805, val_loss=0.485]

------------------------------------------------------------------

Training complete, Best F1 score : 81.96%



