In [None]:
# !pip install transformers
# wget https://github.com/t-davidson/hate-speech-and-offensive-language/raw/master/data/labeled_data.csv
# wget https://github.com/brianbt/AI6127_NLP_project/raw/master/data/labeled_data_spell.csv
# Put me under data/

In [None]:
import nltk

!pip install pyenchant
!wget http://archive.ubuntu.com/ubuntu/pool/main/libr/libreoffice-dictionaries/hunspell-id_6.4.3-1_all.deb
!dpkg -i hunspell-id_6.4.3-1_all.deb
!apt update && apt install -y enchant libenchant1c2a hunspell hunspell-en-us libhunspell-1.6-0
nltk.download('wordnet')
!apt-get install libenchant1c2a -y
!pip install contractions

import contractions
import enchant
from enchant.checker import SpellChecker

In [1]:
import random
import re
import warnings
from fnmatch import fnmatch
# from math import comb

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import torch
import torch.nn.functional as F
from nltk.tokenize import word_tokenize
from sklearn.utils import shuffle
from torch import nn
# from tqdm import tqdm
from tqdm.notebook import tqdm
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AutoModelForSequenceClassification,AutoModel
from transformers import AutoTokenizer, AutoConfig

# Seed Python's `random`, NumPy and PyTorch (CPU and every CUDA device) with the same value.
seed = 888
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# Device string used by the cells below: GPU when one is visible, otherwise CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using {device}")

# Utils

In [2]:
def preprocess(string):
    """Normalise a raw tweet into lower-case alphanumeric text.

    Steps, in order: lower-case, drop @mentions, drop ``!``/``:``, drop the
    ``&amp;`` entity, drop #hashtags, drop URLs (``http...`` / ``www....``),
    blank out brackets and any remaining non ``[a-z0-9]`` character, and
    finally remove the stand-alone retweet marker ``rt``.

    Args:
        string (str): raw tweet text.

    Returns:
        str: cleaned text. Removed characters are replaced by spaces or
        nothing, so the result may contain runs of spaces.
    """
    temp = string.lower()
    temp = re.sub(r"@[A-Za-z0-9_]+", "", temp)
    temp = re.sub(r"[!:]+", "", temp)
    temp = re.sub(r"&amp;", "", temp)
    temp = re.sub(r"#[A-Za-z0-9_]+", "", temp)
    # ':' was stripped above, so URLs look like "http//..." here; `http\S+` still covers them.
    temp = re.sub(r"http\S+", "", temp)
    # Escape the dot so only real "www." prefixes match (not e.g. "awwwww").
    temp = re.sub(r"www\.\S+", "", temp)
    temp = re.sub(r"[()!?]", " ", temp)
    temp = re.sub(r"\[.*?\]", " ", temp)
    temp = re.sub(r"[^a-z0-9]", " ", temp)
    # Only remove "rt" as a whole word; a bare "rt" pattern also mangled words
    # such as "party" -> "pay" or "heart" -> "hea".
    temp = re.sub(r"\brt\b", "", temp)
    return temp

def misspellings(string):
    """Replace every word flagged by the en_US spell checker with its first suggestion.

    Words for which the dictionary offers no suggestion are left untouched.

    Args:
        string (str): text to correct.

    Returns:
        str: the text held by the checker after all replacements.
    """
    dictionary = enchant.request_dict("en_US")
    checker = SpellChecker("en_US", string)
    for error in checker:
        candidates = dictionary.suggest(error.word)
        if candidates:
            # Take the top-ranked suggestion.
            error.replace(candidates[0])

    return checker.get_text()

def dup_data(df, on_class, repeat=1000):
    """Augment one class by mixing random pairs of its rows.

    Each new row is built from two rows drawn (with replacement) from the rows
    of `df` whose `class` equals `on_class`: the numeric vote columns are
    summed and the two tweets are joined with a single space, producing one
    longer tweet. Only original rows are used as sources, so augmented rows
    are never mixed again within a call.

    Rows are drawn with Python's `random` module, so seed it for
    reproducible output.

    Args:
        df (pd.DataFrame): data with columns `count`, `hate_speech`,
            `offensive_language`, `neither`, `class` (and normally `tweet`).
        on_class (int): class to augment, one of 0, 1, 2.
        repeat (int): number of new rows to generate. Values <= 0 add nothing.

    Returns:
        pd.DataFrame: `df` followed by the new rows, with a fresh 0..n-1 index.

    Raises:
        ValueError: if `repeat` > 0 but `df` has no row of class `on_class`.

    Examples:
        >>> df = dup_data(df, 0)
    """
    vote_columns = ['hate_speech', 'offensive_language', 'neither']
    int_columns = ['count'] + vote_columns + ['class']
    label_of = {'hate_speech': 0, 'offensive_language': 1, 'neither': 2}

    if repeat <= 0:
        # Nothing to generate; pd.concat([]) would raise on an empty list.
        return df.reset_index(drop=True)

    pool = df[df['class'] == on_class]
    if pool.empty:
        raise ValueError(f"no rows with class={on_class} to augment")

    last = pool.shape[0] - 1
    records = []
    for _ in range(repeat):
        i, j = random.randint(0, last), random.randint(0, last)
        first, second = pool.iloc[i], pool.iloc[j]
        mixed = first + second  # sums numeric columns, concatenates strings
        if 'tweet' in mixed.index:
            # Plain string addition would fuse the last word of the first tweet
            # with the first word of the second one.
            mixed['tweet'] = f"{first['tweet']} {second['tweet']}"
        records.append(mixed.to_dict())

    out = pd.DataFrame(records, columns=pool.columns)
    out = out.astype({column: 'int' for column in int_columns})
    # The summed `class` value is meaningless; recompute it from the summed votes.
    out['class'] = out[vote_columns].idxmax(axis=1).map(label_of)
    return pd.concat([df, out]).reset_index(drop=True)

def number_params(model, exclude_freeze=False):
    """Count the scalar parameters of a model.

    Args:
        model (nn.Module): PyTorch model
        exclude_freeze (bool, optional): If True, parameters whose
            `requires_grad` is False are left out of the count. Defaults to False.

    Returns:
        int: total number of elements over the counted parameters.
    """
    total = 0
    for param in model.parameters():
        is_frozen = param.requires_grad is False
        if not (exclude_freeze and is_frozen):
            total += param.numel()
    return total

def ensemble(models, do_argmax=False):
    """Combine several predictions by summing their softmax probabilities.

    Args:
        models (list(tensor)): list of logits, every entry with the same shape (N, C).
        do_argmax (bool): if True return the winning class index per row
            instead of the summed probabilities.

    Returns:
        tensor: (N,) class indices when `do_argmax` is True, otherwise the
        (N, C) sum of the per-model softmax outputs (not re-normalised).

    Examples:
        tweet = next(iter(dataloader_train))[-1]
        out = model(list(tweet))
        ensemble([out[0], out[0]])
    """
    probabilities = [torch.softmax(logits, 1) for logits in models]
    total = probabilities[0]
    for extra in probabilities[1:]:
        total = total + extra
    return total.argmax(1) if do_argmax else total

def finetune(
        model: nn.Module,
        base_lr: float,
        groups,
        ignore_the_rest: bool = False,
        raw_query: bool = False,
        regex=False):
    """Build per-parameter optimizer groups (and optionally freeze layers).

    Splits the model's parameters into groups by name so each group can get its
    own learning rate. Only the learning rate is set here; other per-group
    optimizer options can be added to the returned dicts afterwards (see Examples).
    If you freeze layers with this function and want to unfreeze them later, see
    https://discuss.pytorch.org/t/correct-way-to-freeze-layers/26714/2

    Args:
        model (nn.Module): PyTorch model.
        base_lr (float): learning rate for parameters not matched by any group.
        groups (Dict[str, Union[float, bool]]): maps a name pattern to `extra_lr`,
          or to False. Parameters whose name matches the pattern get
          `lr = extra_lr * base_lr`. A parameter goes to the first matching
          pattern, in dict order.
          The pattern is a regex (searched anywhere in the name) when
          `regex=True`, otherwise an fnmatch pattern such as `layer1*`,
          `layer?.conv?.`, `*conv2*`.
          If the value is False, matching parameters are frozen:
          `requires_grad` is set to False and they are excluded from the output.
          This dict is not modified.
        ignore_the_rest (bool, optional): If True, parameters matched by no
          pattern are left out of the output. Defaults to False.
        raw_query (bool, optional): If False, each key is wrapped as f'*{key}*'
          before fnmatch matching. If True, keys are used verbatim.
          Only relevant when `regex=False`. Defaults to False.
        regex (bool, optional): Use regex instead of fnmatch. Forces
          `raw_query=True`. Defaults to False.
          Notice: `regex=False` is deprecated.

    Returns:
        List[Dict[str, Any]]: one dict per non-freeze entry of `groups`, in
          order, each with keys `params` (iterator of parameters), `names`
          (list of parameter names), `query` (the pattern used), `lr` and
          `initial_lr` (both `extra_lr * base_lr`).
          Unless `ignore_the_rest` is True, a last dict holds every unmatched
          parameter with `lr = initial_lr = base_lr` (it has no `query` key).
          NOTE: `params` is a one-shot iterator; consume it once.

    Examples:
        >>> model = models.resnet50()
        >>> # parameters whose name starts with `layer1` or `layer2` get lr `0.001*0.01`
        >>> # parameters whose name starts with `layer3` are frozen
        >>> # parameters whose name starts with `layer4` get lr `0.001*0.001`
        >>> # every other parameter keeps the base_lr `0.001`
        >>> model_params = finetune(model, base_lr=0.001, groups={'^layer[1-2].*': 0.01, '^layer3.*': False, '^layer4.*': 0.001}, regex=True)
        >>> # setting an extra option (other than learning rate) for one group:
        >>> # the second param_group `layer4` will have weight_decay 1e-2
        >>> model_params[1]['weight_decay'] = 1e-2
        >>> # options given to the optimizer are defaults; values already set
        >>> # in a group by finetune() take precedence.
        >>> # For example, all groups get weight_decay=5e-3 except model_params[1]
        >>> optimizer = torch.optim.SGD(model_params, momentum=0.9, lr=0.1, weight_decay=5e-3)
    """
    if regex:
        raw_query = True
    else:
        warnings.warn("regex=False is deprecated; use regex=True", DeprecationWarning)

    def _to_query(key):
        # fnmatch keys are wrapped so that they match anywhere in the name.
        return key if raw_query else '*' + key + '*'

    def _matcher(query):
        # Returns a callable name -> truthy/falsy, compiled once per pattern.
        if regex:
            return re.compile(query).search
        return lambda name: fnmatch(name, query)

    # Split `groups` into freeze patterns and lr groups without touching the caller's dict.
    freeze_matchers = []
    parameters = []
    group_matchers = []
    for key, lr in groups.items():
        query = _to_query(key)
        if lr is False:
            freeze_matchers.append(_matcher(query))
            continue
        parameters.append(dict(params=[],
                               names=[],
                               query=query,
                               lr=lr * base_lr,
                               initial_lr=lr * base_lr))
        group_matchers.append(_matcher(query))
    rest_parameters = dict(params=[], names=[], lr=base_lr, initial_lr=base_lr)

    for name, param in model.named_parameters():
        # Freezing is applied in both regex and fnmatch mode, and uses the same
        # "match anywhere" rule as the lr groups.
        if any(matches(name) for matches in freeze_matchers):
            param.requires_grad = False
            continue
        for group, matches in zip(parameters, group_matchers):
            if matches(name):
                group['params'].append(param)
                group['names'].append(name)
                break
        else:
            rest_parameters['params'].append(param)
            rest_parameters['names'].append(name)

    if not ignore_the_rest:
        parameters.append(rest_parameters)
    for group in parameters:
        group['params'] = iter(group['params'])
    return parameters

# Load Data

In [3]:
# Location of the labelled-tweet CSV. The commented alternatives point at other
# copies of the data (including a spell-corrected variant); enable exactly one.
# path = '../input/AI6127/labeled_data.csv'
# path = '../input/labeled-data-spell/labeled_data_spell.csv'
# path = './data/labeled_data.csv'
path = '../input/hate-speech-and-offensive-language-dataset/labeled_data.csv'

In [4]:
# Read the CSV using its first column as the index and drop rows with missing values.
df = pd.read_csv(path, index_col = 0).dropna()
# Optional text clean-up steps, currently disabled.
# NOTE(review): `fix_contractions` is not defined in the visible part of this notebook.
# df["tweet"] = df["tweet"].apply(fix_contractions)
# df["tweet"] = df["tweet"].apply(preprocess)
# df["tweet"] = df["tweet"].apply(misspellings)
# Shuffle the rows before splitting into train/valid/test.
df = shuffle(df)
df.describe()

In [5]:
# Reproducibility check: the first two rows after shuffling are expected to be
# the ones with index 16307 and 5009.
df.iloc[0:2] 

In [6]:
# Number of rows per class label in the full dataset.
df['class'].value_counts()

In [7]:
# Split the data 80/10/10: 80% is sampled for training, and the remaining 20%
# is halved into validation and test. Splits are separated by dropping index
# labels (this assumes the index labels are unique — TODO confirm).
train_data = df.sample(frac = 0.8)
test_data = df.drop(train_data.index)
valid_data = test_data.sample(frac = 0.5)
test_data = test_data.drop(valid_data.index)

# Preview the first rows of each split.
display(train_data.head())
print("===================================")
display(valid_data.head())
print("===================================")
display(test_data.head())

# Sizes of the three splits.
print(train_data.shape)
print(valid_data.shape)
print(test_data.shape)

In [8]:
# Number of rows per class label in the training split.
train_data['class'].value_counts()

## Augmentation (experiments)

In [None]:
# train_data = dup_data(train_data, 0, 15361-1129)
# train_data = dup_data(train_data, 2, 15361-3336)

In [None]:
# train_data['class'].value_counts()

# Build Dataset

The data are stored as a CSV and as a pickled pandas dataframe (Python 2.7). Each data df contains 5 columns:

count = number of CrowdFlower users who coded each tweet (min is 3, sometimes more users coded a tweet when judgments were determined to be unreliable by CF).

hate_speech = number of CF users who judged the tweet to be hate speech.

offensive_language = number of CF users who judged the tweet to be offensive.

neither = number of CF users who judged the tweet to be neither hate speech nor offensive.

class = class label chosen by the majority of CF users: 0 - hate speech, 1 - offensive language, 2 - neither.

In [9]:
class Dataset(torch.utils.data.Dataset):
    """Row-wise view over the labelled-tweet DataFrame.

    Each item is a 6-tuple
    `(count, hate_speech, offensive_language, neither, target, tweet)` where the
    first five entries are 1-element LongTensors on `device` and `tweet` is the
    raw value of the `tweet` column.

    Args:
        df (pd.DataFrame): data with columns `count`, `hate_speech`,
            `offensive_language`, `neither`, `class` and `tweet`.
        model_name: unused, kept for backward compatibility.
        train: unused, kept for backward compatibility.
        device (str, optional): device the tensors are moved to. Defaults to
            'cuda' when CUDA is available and 'cpu' otherwise, so the dataset
            also works on CPU-only machines.
    """

    def __init__(self, df, model_name=None,  train = True, device=None):
        super(Dataset, self).__init__()
        self.df = df
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Look the row up once instead of once per column.
        row = self.df.iloc[idx]

        def as_long(column):
            return torch.LongTensor([row[column]]).to(self.device)

        return (as_long('count'), as_long('hate_speech'), as_long('offensive_language'),
                as_long('neither'), as_long('class'), row['tweet'])
        
        

In [10]:
# This one is just for DEBUG, not the real dataset to be used
# Pull one shuffled batch of 6 rows from the full DataFrame and print it to inspect the item layout.
dataset = Dataset(df, device=device)
dataloader = torch.utils.data.DataLoader(dataset, batch_size= 6, shuffle=True)
output = next(iter(dataloader))
print(output)

# Model

In [11]:
class LanguageModel(nn.Module):
    """Pretrained sequence-classification model bundled with its tokenizer.

    Args:
        model_name (str): name/path passed to `from_pretrained` for both the
            model and the tokenizer.
        num_labels (int): number of output classes.
        freeze_pretrained (bool): if True, every parameter whose name does not
            contain 'classifier' gets `requires_grad = False`.
    """

    def __init__(self, model_name, num_labels = 3, freeze_pretrained=False):
        super(LanguageModel, self).__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if freeze_pretrained:
            print("You are freezing the BERT")
            for name, p in self.model.named_parameters():
                if 'classifier' not in name:
                    p.requires_grad = False
        print(f"Total number of params: {number_params(self.model)}")
        print(f"Total number of trainable params: {number_params(self.model, exclude_freeze=True)}")

    def forward(self, src, has_mask=False):
        """Tokenize `src` (padded, truncated to 50 tokens) and run the classifier.

        Args:
            src (list(str)): batch of raw texts.
            has_mask (bool): if True, pass the tokenizer's attention mask to the
                model so padded positions are ignored.

        Returns:
            The output object of the wrapped model.
        """
        # Follow the model's own device rather than a notebook-level global.
        model_device = next(self.model.parameters()).device
        tokens = self.tokenizer(src, padding=True, truncation=True, max_length=50)
        input_ids = torch.LongTensor(tokens['input_ids']).to(model_device)
        if has_mask:
            # Use the mask produced by the tokenizer instead of assuming pad id 0.
            attention_mask = torch.LongTensor(tokens['attention_mask']).to(model_device)
            return self.model(input_ids, attention_mask=attention_mask)
        return self.model(input_ids)

In [12]:
class LastAttnModel(nn.Module):
    """
    Use the [CLS] as query and all other output as key and values.
    Pass it to a Multi-Head Attention, then a Linear classifier

    Args:
      pretrain_model: backbone returning `hidden_states` (load it with
        `output_hidden_states=True` in its config).
      tokenizer: tokenizer matching `pretrain_model`.
      last_attn_num_head(int): number of heads of the attention layers.
      classifier_hidden_dim(int): hidden width of every classifier MLP.
      classifier_dropout(float): dropout inside every classifier MLP.
      num_labels(int): number of output classes.
      freeze_pretrained(bool): set `requires_grad=False` on backbone parameters
        whose name does not contain 'classifier'.
      auxiliary_head(list(int)): Only used when training
        - list of indices into `hidden_states`; one extra classifier is built per
          entry. Index 0 is the embedding output, index k the output of layer k.
        - See BertConfig['num_hidden_layers'] for total number of layers
        - EG: `auxiliary_head=[10,11,12]`.
      last_hidden_layer(int): Treat the output of this layer as last_hidden_layer
      all_CLS_attn(bool): use last CLS as query, all previous CLS as key and values -> Multi-Head Attention

    Examples:
      tweet = next(iter(dataloader_train))[-1]
      modelA = LastAttnModel(pretrain_model, tokenizer).to(device)
      out = modelA(list(tweet))
      print(out[0].shape, out[1].shape) #torch.Size([32, 3]) torch.Size([32, 1, 49])

    Returns:
      list(tensor): the first tensor is the prediction, the second is the
      attention weight. When `auxiliary_head` is set, one extra prediction per
      auxiliary head follows, in the same order as `auxiliary_head`.
    """

    def __init__(self, pretrain_model, tokenizer,
                 last_attn_num_head = 8,
                 classifier_hidden_dim = 512,
                 classifier_dropout = 0,
                 num_labels = 3,
                 freeze_pretrained=False,
                 auxiliary_head=None,
                 last_hidden_layer=-1,
                 all_CLS_attn=False,
                 **kwargs):
        super(LastAttnModel, self).__init__()
        self.pretrain_model = pretrain_model
        self.tokenizer = tokenizer
        self.auxiliary_head = auxiliary_head
        self.num_layers = len(pretrain_model.encoder.layer)
        self.last_hidden_layer = last_hidden_layer
        self.all_CLS_attn = all_CLS_attn

        if freeze_pretrained:
            if self.auxiliary_head is not None:
                warnings.warn("freeze_pretrained and auxiliary_head set to True together is useless for training. Consider use `finetune()`")
            print("You are freezing the BERT pertrain")
            for name, p in self.pretrain_model.named_parameters():
                if 'classifier' not in name:
                    p.requires_grad = False

        embed_size = pretrain_model.embeddings.word_embeddings.embedding_dim
        self.last_attn = nn.MultiheadAttention(embed_size, last_attn_num_head, batch_first=True)
        self.final_classifier = nn.Sequential(
            nn.Linear(embed_size, classifier_hidden_dim),
            nn.ReLU(),
            nn.Dropout(classifier_dropout),
            nn.Linear(classifier_hidden_dim, num_labels)
        )

        # one classifier MLP per auxiliary head
        if self.auxiliary_head is not None:
            self.aux_classifiers = nn.ModuleList()
            for i in self.auxiliary_head:
                self.aux_classifiers.append(nn.Sequential(
                    nn.Linear(embed_size, classifier_hidden_dim),
                    nn.ReLU(),
                    nn.Dropout(classifier_dropout),
                    nn.Linear(classifier_hidden_dim, num_labels)
                ))

        # use all CLS attention
        if self.all_CLS_attn:
            self.all_CLS = nn.MultiheadAttention(embed_size, last_attn_num_head, batch_first=True)

        print(f"Total number of params: {number_params(self)}")
        print(f"Total number of trainable params: {number_params(self, exclude_freeze=True)}")

    def forward(self, src, has_mask=False, count=None):
        """Classify a batch of raw texts.

        Args:
            src (list(str)): batch of texts; padded and truncated to 50 tokens.
            has_mask (bool): if True the tokenizer's attention mask is passed to
                the backbone. The final attention always ignores padded keys.
            count: unused.

        Returns:
            list(tensor): `[logits (N,C), attention weights (N,1,T-1)]` followed
            by one `(N,C)` tensor per auxiliary head.

        Raises:
            Exception: if the backbone output has no `hidden_states`.
        """
        out = []
        # Follow the backbone's device rather than a notebook-level global.
        model_device = next(self.pretrain_model.parameters()).device
        tokens = self.tokenizer(src, padding=True, truncation=True, max_length=50)
        inputs = torch.LongTensor(tokens['input_ids']).to(model_device)
        attention_mask = torch.LongTensor(tokens['attention_mask']).to(model_device)
        if has_mask:
            pre_train_output = self.pretrain_model(inputs, attention_mask=attention_mask)
        else:
            pre_train_output = self.pretrain_model(inputs)
        # Check before the first access so the helpful message is actually reachable.
        if "hidden_states" not in pre_train_output:
            raise Exception("Put `output_hidden_states=True` in AutoConfig")
        hidden_states = pre_train_output["hidden_states"]

        last_hidden = hidden_states[self.last_hidden_layer]         # (N,T,E)
        last_hidden_state_cls = last_hidden[:, 0, :].unsqueeze(1)   # (N,1,E)
        if self.all_CLS_attn:
            # NOTE(review): this slice skips the layer directly below
            # `last_hidden_layer` and is empty for last_hidden_layer == 1;
            # confirm that this is the intended set of "previous" layers.
            o = [hidden[:, 0, :] for hidden in hidden_states[:self.last_hidden_layer-1]]
            rest = torch.stack(o).permute(1, 0, 2)  # (N,L,E)
            last_hidden_state_cls, _ = self.all_CLS(last_hidden_state_cls, rest, rest)
        last_hidden_state_rest = last_hidden[:, 1:, :]  # (N,T-1,E)
        # True marks padded keys; taken from the tokenizer instead of assuming pad id 0.
        atten_mask_pad = (attention_mask == 0)[:, 1:]   # (N,T-1)
        last_attn_out, last_attn_w = self.last_attn(last_hidden_state_cls, last_hidden_state_rest, last_hidden_state_rest,
                                                    key_padding_mask=atten_mask_pad)  # (N,1,E), (N,1,T-1)
        last_attn_out = last_attn_out.squeeze(1)  # (N,E)
        output = self.final_classifier(last_attn_out)
        out += [output, last_attn_w]
        ## auxiliary_head
        if self.auxiliary_head is not None:
            # Pair classifier k with auxiliary_head[k] (not k-1, which rotated the pairing).
            for layer_idx, classifier in zip(self.auxiliary_head, self.aux_classifiers):
                hidden_cls = hidden_states[layer_idx][:, 0, :]  # (N,E)
                out.append(classifier(hidden_cls))
        return out
    
# ## Usage
# tweet = iter(dataloader_train).next()[-1]
# modelA = LastAttnModel(pretrain_model, tokenizer).to(device)
# out, attn_weight = modelA(list(tweet))
# print(out.shape, attn_weight.shape)

In [13]:
class BiLSTMModel(nn.Module):
    """
    Feed one hidden layer of the backbone to a bi-LSTM.
    Concat the forward and backward final hidden state of the top LSTM layer,
    then a Linear classifier

    Args:
      pretrain_model: backbone returning `hidden_states` (load it with
        `output_hidden_states=True` in its config).
      tokenizer: tokenizer matching `pretrain_model`.
      lstm_hidden(int): hidden size of the LSTM, per direction.
      lstm_num_layer(int): number of stacked LSTM layers.
      classifier_hidden_dim(int): hidden width of the classifier MLP.
      classifier_dropout(float): dropout inside the classifier MLP.
      num_labels(int): number of output classes.
      freeze_pretrained(bool): set `requires_grad=False` on backbone parameters
        whose name does not contain 'classifier'.
      last_hidden_layer(int): Treat the output of this layer as last_hidden_layer

    Examples:
      tweet = next(iter(dataloader_train))[-1]
      modelA = BiLSTMModel(pretrain_model, tokenizer).to(device)
      out  = modelA(list(tweet))
      print(out[0].shape) #torch.Size([32, 3])

    Returns:
      list(tensor): a one-element list holding the prediction
    """

    def __init__(self, pretrain_model, tokenizer,
                 lstm_hidden = 1024,
                 lstm_num_layer = 2,
                 classifier_hidden_dim = 512,
                 classifier_dropout = 0,
                 num_labels = 3,
                 freeze_pretrained=False,
                 last_hidden_layer=-1,
                 **kwargs):
        super(BiLSTMModel, self).__init__()
        self.pretrain_model = pretrain_model
        self.tokenizer = tokenizer
        self.lstm_hidden = lstm_hidden
        self.lstm_num_layer = lstm_num_layer
        self.num_layers = len(pretrain_model.encoder.layer)
        self.last_hidden_layer = last_hidden_layer

        if freeze_pretrained:
            print("You are freezing the BERT pertrain")
            for name, p in self.pretrain_model.named_parameters():
                if 'classifier' not in name:
                    p.requires_grad = False

        embed_size = pretrain_model.embeddings.word_embeddings.embedding_dim
        self.lstm = nn.LSTM(embed_size, lstm_hidden, lstm_num_layer, bidirectional=True, batch_first=True)

        self.final_classifier = nn.Sequential(
            nn.Linear(2*lstm_hidden, classifier_hidden_dim),
            nn.ReLU(),
            nn.Dropout(classifier_dropout),
            nn.Linear(classifier_hidden_dim, num_labels)
        )

        print(f"Total number of params: {number_params(self)}")
        print(f"Total number of trainable params: {number_params(self, exclude_freeze=True)}")

    def forward(self, src, has_mask=False, count=None):
        """Classify a batch of raw texts.

        Args:
            src (list(str)): batch of texts; padded and truncated to 50 tokens.
            has_mask (bool): if True the tokenizer's attention mask is passed to
                the backbone.
            count: unused.

        Returns:
            list(tensor): `[logits]` with logits of shape (N, num_labels).

        Raises:
            Exception: if the backbone output has no `hidden_states`.
        """
        out = []
        # Follow the backbone's device rather than a notebook-level global.
        model_device = next(self.pretrain_model.parameters()).device
        tokens = self.tokenizer(src, padding=True, truncation=True, max_length=50)
        inputs = torch.LongTensor(tokens['input_ids']).to(model_device)
        if has_mask:
            attention_mask = torch.LongTensor(tokens['attention_mask']).to(model_device)
            pre_train_output = self.pretrain_model(inputs, attention_mask=attention_mask)
        else:
            pre_train_output = self.pretrain_model(inputs)
        if "hidden_states" not in pre_train_output:
            raise Exception("Put `output_hidden_states=True` in AutoConfig")
        last_hidden = pre_train_output["hidden_states"][self.last_hidden_layer]  # (N,T,E)
        batch_size = last_hidden.shape[0]
        # NOTE(review): the LSTM also runs over padded positions (the sequence is
        # not packed), so the final states of short texts include padding steps.
        _, (h_n, _) = self.lstm(last_hidden)
        h_n = h_n.view(self.lstm_num_layer, 2, batch_size, self.lstm_hidden)  # num_layers, num_directions, batch, hidden_size
        forward_last = h_n[-1, 0, :, :]   # (N, H)
        backward_last = h_n[-1, 1, :, :]  # (N, H)
        features = torch.hstack([forward_last, backward_last])  # (N,2H)
        out.append(self.final_classifier(features))
        return out
    
# ## Usage
# tweet = iter(dataloader_train).next()[-1]
# modelA = BiLSTMModel(pretrain_model, tokenizer).to(device)
# out = modelA(list(tweet))
# print(out[0].shape)

In [18]:
class EnsembleBertLastAttnModel(nn.Module):
    """
    Average the hidden states of several backbones layer by layer, then use the
    averaged [CLS] as query and all other averaged outputs as key and values.
    Pass it to a Multi-Head Attention, then a Linear classifier

    All backbones receive the same token ids, so they are assumed to share the
    tokenizer's vocabulary, the embedding size and the number of layers.

    Args:
      pretrain_models(list(nn.Module)): backbones returning `hidden_states`
        (load them with `output_hidden_states=True` in their config). They are
        stored in an `nn.ModuleList`, so `.to()`, `.train()/.eval()`,
        `.parameters()` and `state_dict()` include them. Checkpoints saved
        before they were registered lack these keys; load those with
        `strict=False`.
      tokenizer: tokenizer shared by all backbones.
      last_attn_num_head(int): number of heads of the attention layers.
      classifier_hidden_dim(int): hidden width of every classifier MLP.
      classifier_dropout(float): dropout inside every classifier MLP.
      num_labels(int): number of output classes.
      freeze_pretrained(bool): set `requires_grad=False` on backbone parameters
        whose name does not contain 'classifier'.
      auxiliary_head(list(int)): Only used when training
        - list of indices into `hidden_states`; one extra classifier is built per
          entry. Index 0 is the embedding output, index k the output of layer k.
        - See BertConfig['num_hidden_layers'] for total number of layers
        - EG: `auxiliary_head=[10,11,12]`.
      last_hidden_layer(int): Treat the output of this layer as last_hidden_layer
      all_CLS_attn(bool): use last CLS as query, all previous CLS as key and values -> Multi-Head Attention

    Examples:
      tweet = next(iter(dataloader_train))[-1]
      modelA = EnsembleBertLastAttnModel([bert_a, bert_b], tokenizer).to(device)
      out = modelA(list(tweet))
      print(out[0].shape, out[1].shape) #torch.Size([32, 3]) torch.Size([32, 1, 49])

    Returns:
      list(tensor): the first tensor is the prediction, the second is the
      attention weight. When `auxiliary_head` is set, one extra prediction per
      auxiliary head follows, in the same order as `auxiliary_head`.
    """

    def __init__(self, pretrain_models, tokenizer,
             last_attn_num_head = 8,
             classifier_hidden_dim = 512,
             classifier_dropout = 0,
             num_labels = 3,
             freeze_pretrained=False,
             auxiliary_head=None,
             last_hidden_layer=-1,
             all_CLS_attn=False,
             **kwargs):
        super(EnsembleBertLastAttnModel, self).__init__()
        # A plain Python list would hide the backbones from nn.Module: eval(),
        # to(), parameters() and state_dict() would silently skip them.
        self.pretrain_models = nn.ModuleList(pretrain_models)
        self.tokenizer = tokenizer
        self.auxiliary_head = auxiliary_head
        self.num_layers = len(pretrain_models[0].encoder.layer)
        self.last_hidden_layer = last_hidden_layer
        self.all_CLS_attn = all_CLS_attn

        if freeze_pretrained:
            if self.auxiliary_head is not None:
                warnings.warn("freeze_pretrained and auxiliary_head set to True together is useless for training. Consider use `finetune()`")
            print("You are freezing the BERT pertrain")
            for pretrain_model in self.pretrain_models:
                for name, p in pretrain_model.named_parameters():
                    if 'classifier' not in name:
                        p.requires_grad = False

        embed_size = pretrain_models[0].embeddings.word_embeddings.embedding_dim
        self.last_attn = nn.MultiheadAttention(embed_size, last_attn_num_head, batch_first=True)
        self.final_classifier = nn.Sequential(
            nn.Linear(embed_size, classifier_hidden_dim),
            nn.ReLU(),
            nn.Dropout(classifier_dropout),
            nn.Linear(classifier_hidden_dim, num_labels)
        )

        # one classifier MLP per auxiliary head
        if self.auxiliary_head is not None:
            self.aux_classifiers = nn.ModuleList()
            for i in self.auxiliary_head:
                self.aux_classifiers.append(nn.Sequential(
                    nn.Linear(embed_size, classifier_hidden_dim),
                    nn.ReLU(),
                    nn.Dropout(classifier_dropout),
                    nn.Linear(classifier_hidden_dim, num_labels)
                ))

        # use all CLS attention
        if self.all_CLS_attn:
            self.all_CLS = nn.MultiheadAttention(embed_size, last_attn_num_head, batch_first=True)

        print(f"Total number of params: {number_params(self)}")
        print(f"Total number of trainable params: {number_params(self, exclude_freeze=True)}")

    def forward(self, src, has_mask=False, count=None):
        """Classify a batch of raw texts with the averaged backbones.

        Args:
            src (list(str)): batch of texts; padded and truncated to 50 tokens.
            has_mask (bool): if True the tokenizer's attention mask is passed to
                every backbone. The final attention always ignores padded keys.
            count: unused.

        Returns:
            list(tensor): `[logits (N,C), attention weights (N,1,T-1)]` followed
            by one `(N,C)` tensor per auxiliary head.

        Raises:
            Exception: if a backbone output has no `hidden_states`.
        """
        out = []
        # Follow the backbones' device rather than a notebook-level global.
        model_device = next(self.pretrain_models[0].parameters()).device
        tokens = self.tokenizer(src, padding=True, truncation=True, max_length=50)
        inputs = torch.LongTensor(tokens['input_ids']).to(model_device)
        attention_mask = torch.LongTensor(tokens['attention_mask']).to(model_device)

        per_model_states = []
        for pretrain_model in self.pretrain_models:
            if has_mask:
                pre_train_output = pretrain_model(inputs, attention_mask=attention_mask)
            else:
                pre_train_output = pretrain_model(inputs)
            if "hidden_states" not in pre_train_output:
                raise Exception("Put `output_hidden_states=True` in AutoConfig")
            per_model_states.append(pre_train_output["hidden_states"])

        # Average every layer across models out of place. Accumulating with `+=`
        # on the backbone outputs modified them in place and, because the selected
        # layer was referenced twice, added it twice.
        hidden_states = tuple(torch.stack(layer_outputs).mean(dim=0)
                              for layer_outputs in zip(*per_model_states))

        last_hidden = hidden_states[self.last_hidden_layer]         # (N,T,E)
        last_hidden_state_cls = last_hidden[:, 0, :].unsqueeze(1)   # (N,1,E)
        if self.all_CLS_attn:
            # Uses the averaged states (not only the last backbone's output).
            # NOTE(review): this slice skips the layer directly below
            # `last_hidden_layer` and is empty for last_hidden_layer == 1;
            # confirm that this is the intended set of "previous" layers.
            o = [hidden[:, 0, :] for hidden in hidden_states[:self.last_hidden_layer-1]]
            rest = torch.stack(o).permute(1, 0, 2)  # (N,L,E)
            last_hidden_state_cls, _ = self.all_CLS(last_hidden_state_cls, rest, rest)
        last_hidden_state_rest = last_hidden[:, 1:, :]  # (N,T-1,E)
        # True marks padded keys; taken from the tokenizer instead of assuming pad id 0.
        atten_mask_pad = (attention_mask == 0)[:, 1:]   # (N,T-1)
        last_attn_out, last_attn_w = self.last_attn(last_hidden_state_cls, last_hidden_state_rest, last_hidden_state_rest,
                                                    key_padding_mask=atten_mask_pad)  # (N,1,E), (N,1,T-1)
        last_attn_out = last_attn_out.squeeze(1)  # (N,E)
        output = self.final_classifier(last_attn_out)
        out += [output, last_attn_w]
        ## auxiliary_head
        if self.auxiliary_head is not None:
            # Pair classifier k with auxiliary_head[k] (not k-1, which rotated the
            # pairing) and feed it the averaged hidden state.
            for layer_idx, classifier in zip(self.auxiliary_head, self.aux_classifiers):
                hidden_cls = hidden_states[layer_idx][:, 0, :]  # (N,E)
                out.append(classifier(hidden_cls))
        return out
    
# ## Usage
# tweet = iter(dataloader_train).next()[-1]
# modelA = EnsembleBertLastAttnModel(pretrain_model, tokenizer).to(device)  # pretrain_model is a list of backbones
# out, attn_weight = modelA(list(tweet))
# print(out.shape, attn_weight.shape)

# Initialization

In [19]:
def get_bert(model_name):
    """Load a pretrained encoder and its tokenizer by name.

    Args:
        model_name: identifier passed unchanged to
            `AutoConfig.from_pretrained`, `AutoModel.from_pretrained` and
            `AutoTokenizer.from_pretrained`.

    Returns:
        (pretrain_model, tokenizer): the encoder, already moved to the
        notebook-level `device`, and the tokenizer loaded for `model_name`.
    """
    config = AutoConfig.from_pretrained(
        model_name,
        # Every layer's hidden state is needed by the heads built on top.
        output_hidden_states = True,
        # The config attribute is spelled `output_attentions` (plural); the
        # former `output_attention` spelling is not the flag the model reads.
        output_attentions = False,
        hidden_dropout_prob = 0.2,
    )
    pretrain_model = AutoModel.from_pretrained(
        model_name,
        config = config
    ).to(device)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return pretrain_model, tokenizer

In [23]:
# Wrap each split in a DataLoader; only the training loader shuffles.
dataset = Dataset(train_data)
dataloader_train = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)
dataset_valid = Dataset(valid_data)
dataloader_valid = torch.utils.data.DataLoader(dataset_valid, batch_size=32, shuffle=False)
dataset_test = Dataset(test_data)
dataloader_test = torch.utils.data.DataLoader(dataset_test, batch_size=32, shuffle=False)

# Stored verbatim in the checkpoint written by train(); the names are
# repeated literally in the get_bert() calls below, so keep them in sync.
config = {'bert_list': ['GroNLP/hateBERT', 'diptanu/fBERT']}

lossfn = nn.CrossEntropyLoss().to(device)
# Only the hateBERT tokenizer is kept; the fBERT tokenizer is discarded.
# NOTE(review): this presumes both encoders share a vocabulary -- confirm.
hateBert, tokenizer = get_bert('GroNLP/hateBERT')
fBert, _ = get_bert('diptanu/fBERT')
pretrain_model = [hateBert, fBert]

# Hyper-parameters for the downstream model. The whole dict is splatted into
# the model constructor and is also stored in the checkpoint by train().
Dconfig = {'Dmodel_name':'EnsembleBertLastAttn',
           'freeze_pretrained':True,
           'classifier_dropout':0.1,
           'auxiliary_head':None,
           'last_hidden_layer':5,
           'all_CLS_attn':True,
           'epochs': 20}
model = EnsembleBertLastAttnModel(pretrain_model, tokenizer, **Dconfig).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-3)

epochs = Dconfig['epochs']

# train() derives the best-checkpoint filename from this path
# (".pt" -> "_best.pt"); the CSV export cell derives ".csv" from it.
save_path="./ensemblelastATNN.pt"

In [None]:
# Pull a single training batch (the loop breaks after the first one) so the
# next cell can run a quick forward pass on `tweet`.
for datas in dataloader_train:
    count, hate_speech, offensive_language, neither, target, tweet = datas[0], datas[1], datas[2], datas[3], datas[4], datas[5]
    break

In [None]:
# Smoke test: run one forward pass and show the shape of the first model output.
model(list(tweet))[0].shape

## DEBUG

In [None]:
# model_name = 'GroNLP/hateBERT'
# config = AutoConfig.from_pretrained(
#     model_name, 
#     output_hidden_states = True,
#     output_attention = False
# ) 
# print(config)
# model = AutoModel.from_pretrained(
#     model_name,
#     config = config
# ).to(device)

# tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# tweet = iter(dataloader_train).next()[-1]
# tokens = tokenizer(list(tweet), padding=True, truncation=True, max_length=50)
# print(tokens.keys())
# inputs = torch.LongTensor(tokens['input_ids']).to(device)
# atten_mask = torch.LongTensor(tokens['attention_mask']).to(device)
# print(inputs.shape)
# hiddens = model(inputs)
# print(hiddens.keys())
# print(hiddens['last_hidden_state'].shape)

In [None]:
# lstm = nn.LSTM(model.embeddings.word_embeddings.embedding_dim, 1024, 2, bidirectional=True, batch_first=True).cuda()
# batch_size, seq_len, embed_size = hiddens['last_hidden_state'].shape
# output, (h_n, c_n) = lstm(hiddens['last_hidden_state'])
# print(output.shape, h_n.shape, c_n.shape)
# output = output.view(batch_size, seq_len, 2, 1024) #batch, seq_len, num_directions, hidden_size
# h_n = h_n.view(2, 2, batch_size, 1024) # num_layers, num_directions, batch, hidden_size
# c_n = c_n.view(2, 2, batch_size, 1024) # num_layers, num_directions, batch, hidden_size
# print(output.shape, h_n.shape, c_n.shape)
# forward_last = h_n[-1, 0, :, :]
# backward_last = h_n[-1, 1, :, :]

In [None]:
# hiddens['pooler_output'].shape
# hiddens['last_hidden_state'][:,0,:]

In [None]:
# model.pooler

In [None]:
# model.pooler(hiddens['last_hidden_state']) == hiddens['pooler_output']

In [None]:
# atten_mask_pad = (inputs == 0)
# print(atten_mask_pad.shape)
# print(atten_mask_pad[:,1:].shape)

In [None]:
# ensemble([out[0], torch.randn((32,3)).cuda(), out[0]-1])

# Train

In [15]:
def _batch_scores(target, logits):
    """Return weighted (f1, precision, recall) for a single batch.

    Args:
        target: label tensor; dim 1 is squeezed away before scoring.
        logits: score tensor; the predicted class is the argmax over the last dim.
    """
    y_true = target.squeeze(1).cpu().numpy()
    y_pred = logits.argmax(-1).cpu().numpy()
    f1 = sklearn.metrics.f1_score(y_true, y_pred, average = 'weighted')
    precision = sklearn.metrics.precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = sklearn.metrics.recall_score(y_true, y_pred, average='weighted', zero_division=0)
    return f1, precision, recall


def _run_epoch(dataloader, model, lossfn, has_mask, optimizer, training):
    """Run one pass over `dataloader`; return (avg_loss, (avg_f1, avg_precision, avg_recall)).

    With `training=True` the model is put in train mode and `optimizer` is
    stepped on every batch. Otherwise the model is put in eval mode and the
    pass runs with autograd disabled, so no graph is kept for batches that
    are never back-propagated.
    """
    model.train(training)
    n_batches = len(dataloader)
    avg_loss = avg_f1 = avg_precision = avg_recall = 0
    with torch.set_grad_enabled(training):
        for datas in tqdm(dataloader):
            # Batch layout: items 4 and 5 are the label tensor and the raw tweets.
            target, tweet = datas[4], datas[5]
            if training:
                optimizer.zero_grad()
            pred = model(list(tweet), has_mask)
            # Follow the model's output device instead of hard-coding 'cuda',
            # so the same loop also runs on a CPU-only machine.
            target = target.to(pred[0].device)
            labels = target.squeeze(1)
            loss = lossfn(pred[0], labels)
            if training:
                # pred[2:] are auxiliary-head outputs; each adds a 0.3-weighted
                # loss term. They are only used for the training loss.
                for j in range(2, len(pred)):
                    loss += 0.3*lossfn(pred[j], labels)
                loss.backward()
                optimizer.step()
            f1, precision, recall = _batch_scores(target, pred[0])
            avg_loss += loss.item()/n_batches
            avg_f1 += f1/n_batches
            avg_precision += precision/n_batches
            avg_recall += recall/n_batches
    return avg_loss, (avg_f1, avg_precision, avg_recall)


def train(dataloader_train, dataloader_valid = None, model = None, 
          optimizer = None, lossfn = None,  epochs = 10, has_mask = True):
    """Train `model` and checkpoint the epoch with the best validation F1.

    Args:
        dataloader_train: yields batches whose items 4 and 5 are the label
            tensor and the raw tweets.
        dataloader_valid: optional loader with the same batch layout; when
            None, validation and checkpointing are skipped.
        model: module called as `model(list(tweet), has_mask)`; element 0 of
            its output is scored, elements from index 2 on are treated as
            auxiliary-head outputs.
        optimizer: optimizer stepped once per training batch.
        lossfn: loss called as `lossfn(logits, labels)`.
        epochs: number of passes over `dataloader_train`.
        has_mask: forwarded unchanged to the model.

    Returns:
        (trainloss, validloss, trainscore, validscore): per-epoch lists. The
        score lists hold (f1, precision, recall) tuples, each the mean of the
        per-batch weighted metrics. The validation lists stay empty when
        `dataloader_valid` is None.

    Side effects:
        Whenever the validation F1 improves, a checkpoint is written to
        `save_path` with ".pt" replaced by "_best.pt". `save_path`, `config`
        and `Dconfig` are read from the notebook's global scope.
    """
    trainloss, validloss = [], []
    trainscore, validscore = [], []
    best_f1 = 0
    for epoch in range(epochs):
        train_loss, train_scores = _run_epoch(dataloader_train, model, lossfn, has_mask,
                                              optimizer, training=True)
        trainloss.append(train_loss)
        trainscore.append(train_scores)

        if dataloader_valid is None:
            print(f"epoch: {epoch}, train loss: {trainloss[-1]}, train f1score: {trainscore[-1]}")
            continue

        valid_loss, valid_scores = _run_epoch(dataloader_valid, model, lossfn, has_mask,
                                              optimizer, training=False)
        validloss.append(valid_loss)
        validscore.append(valid_scores)
        print(f"epoch: {epoch}, train loss: {trainloss[-1]}, validation loss: {validloss[-1]}\n train f1score: {trainscore[-1]}\nvalidation f1score: {validscore[-1]}")

        valid_f1 = valid_scores[0]
        if valid_f1 > best_f1:
            print("Found Best Model")
            to_save = {'model': model.state_dict(),
                       'config': config,
                       'Dconfig': Dconfig,
                       # Store the optimizer's state_dict rather than the pickled
                       # object, so the checkpoint contains plain tensors/containers
                       # and does not depend on the optimizer class at load time.
                       'optimizer': optimizer.state_dict(),
                       'lr_s':None}
            torch.save(to_save, save_path.replace('.pt', '_best.pt'))
            best_f1 = valid_f1

    return trainloss, validloss, trainscore, validscore


def test(dataloader, model = None, lossfn = None, epochs = 1, has_mask = True):
    """Evaluate a single model, or an ensemble of models, on `dataloader`.

    Args:
        dataloader: yields batches whose items 4 and 5 are the label tensor
            and the raw tweets.
        model: if `pytorch Model` -> normal test. if `list(pytorch Model)` -> ensemble
            (the first output of every member is combined with `ensemble`).
        lossfn: loss called as `lossfn(logits, labels)`.
        epochs: unused; kept so existing callers do not break.
        has_mask: forwarded unchanged to each model.

    Returns:
        (averageloss, (f1score, precision, recall)): the mean of the per-batch
        losses, and weighted metrics computed once over all predictions.
    """
    is_ensemble = isinstance(model, list)
    for member in (model if is_ensemble else [model]):
        member.eval()

    averageloss = 0
    overallPred = []
    overallTar = []
    # Evaluation never back-propagates, so skip building the autograd graph.
    with torch.no_grad():
        for datas in tqdm(dataloader):
            target, tweet = datas[4], datas[5]
            if is_ensemble:
                ensem = [member(list(tweet), has_mask)[0] for member in model]
                pred = [ensemble(ensem)]
            else:
                pred = model(list(tweet), has_mask)
            # Follow the prediction's device instead of hard-coding 'cuda'.
            target = target.to(pred[0].device)
            labels = target.squeeze(1)
            loss = lossfn(pred[0], labels)
            averageloss += loss.item()/len(dataloader)
            overallPred.append(pred[0].argmax(-1).cpu().numpy())
            overallTar.append(labels.cpu().numpy())

    predicts = np.concatenate(overallPred)
    targets = np.concatenate(overallTar)

    f1score = sklearn.metrics.f1_score(targets, predicts, average = 'weighted')
    precision=sklearn.metrics.precision_score(targets, predicts, average='weighted', zero_division=0)
    recall=sklearn.metrics.recall_score(targets, predicts, average='weighted', zero_division=0)

    print(f"test loss: {averageloss}")
    print(f"test score: {(f1score, precision, recall)}")
    return averageloss, (f1score, precision, recall)
            
def get_csv(dataloader, model = None, has_mask = True):
    """Collect the raw model outputs for every batch into a DataFrame.

    Args:
        dataloader: a DataLoader (not a bare Dataset) yielding batches whose
            item 5 is the sequence of raw tweets.
        model: a single model, or a list of models whose first outputs are
            combined with `ensemble`.
        has_mask: forwarded unchanged to each model.

    Returns:
        pandas.DataFrame built from the per-batch first outputs, concatenated
        in loader order.

    Examples:
        o = get_csv(dataloader_test, model1)
        o.to_csv(save_path.replace('.pt', '.csv'))
    """
    is_ensemble = isinstance(model, list)
    # Switch to eval mode once up front rather than on every batch.
    for member in (model if is_ensemble else [model]):
        member.eval()

    OverallPred = []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for datas in tqdm(dataloader):
            # Labels are not needed for export, so they are neither read nor
            # moved to a device here.
            tweet = datas[5]
            if is_ensemble:
                ensem = [member(list(tweet), has_mask)[0] for member in model]
                pred = [ensemble(ensem)]
            else:
                pred = model(list(tweet), has_mask)
            OverallPred.append(pred[0].detach().cpu().numpy())
    output = np.concatenate(OverallPred)
    return pd.DataFrame(output)

In [24]:
# Run training; train() also writes the best-validation checkpoint next to
# `save_path`, which the following cell reloads.
trainloss, validloss, trainscore, validscore = train(dataloader_train, dataloader_valid, model = model, optimizer = optimizer, 
      lossfn = lossfn, epochs = epochs, has_mask = True)

In [25]:
# Use best val set model.
# map_location lets a checkpoint written on a GPU be restored on whatever
# `device` this session selected (including a CPU-only machine).
state = torch.load(save_path.replace('.pt', '_best.pt'), map_location=device)
model.load_state_dict(state['model'])
# Do test
test(dataloader_test, model = model, lossfn = lossfn, has_mask = True)

In [None]:
import matplotlib.pyplot as plt

# Per-epoch loss curves returned by train(); labelled so the figure can be
# read on its own.
fig, ax = plt.subplots()
ax.plot(trainloss, label='train loss')
ax.plot(validloss, label='valid loss')
ax.set(xlabel='epoch', ylabel='loss', title='Training vs. validation loss')
ax.legend();

In [None]:
# Column 0 of the per-epoch (f1, precision, recall) tuples, i.e. the training F1.
torch.tensor(trainscore)[:,0]

In [None]:
# Per-epoch F1 curves (column 0 of the (f1, precision, recall) tuples from
# train()); labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(torch.tensor(trainscore)[:,0], label='train f1')
ax.plot(torch.tensor(validscore)[:,0], label='valid f1')
ax.set(xlabel='epoch', ylabel='weighted F1', title='Training vs. validation F1')
ax.legend();

In [None]:
## Below is not used anymore, the best_valid model is auto saved with train()
# to_save = {'model': model.state_dict(),
#            'config': config,
#            'Dconfig': Dconfig,
#            'optimizer': optimizer,
#            'lr_s':None}
# torch.save(to_save, save_path)

# get_csv() iterates over batches and calls list(tweet) on each one, so it
# must receive the DataLoader; the bare Dataset would hand it single samples.
o = get_csv(dataloader_test, model)
o.to_csv(save_path.replace('.pt', '.csv'))

## Ensemble

In [None]:
# Dconfig = {'Dmodel_name':'BiLSTMModel',
#            'freeze_pretrained':True,
#            'lstm_num_layer':1, 
#            'classifier_dropout':0.1,
#            'last_hidden_layer':5,
#            'epochs': 20}
# model1 = BiLSTMModel(pretrain_model, tokenizer, **Dconfig).to(device)
# state1 = torch.load('../input/brian/biLSTM_layer1.pt')
# assert Dconfig == state1['Dconfig']
# model1.load_state_dict(state1['model'])
# print(test(dataloader_test, model = model1, lossfn = lossfn, has_mask = True))

# Dconfig = {'Dmodel_name':'LastAttnModel',
#            'freeze_pretrained':True,
#            'classifier_dropout':0.1,
#            'auxiliary_head':None,
#            'last_hidden_layer':5,
#            'all_CLS_attn':True,
#            'epochs': 20}
# model2 = LastAttnModel(pretrain_model, tokenizer, **Dconfig).to(device)
# state2 = torch.load('./lastATNN.pt')
# assert Dconfig == state2['Dconfig']
# model2.load_state_dict(state2['model'])
# print(test(dataloader_test, model = model2, lossfn = lossfn, has_mask = True))

# print(test(dataloader_test, model = [model1, model2], lossfn = lossfn, has_mask = True))

# In case you want to know what `pred` contains  
# Check the attention weights on swear words; only useful for LastAttnModel()

In [None]:
# for datas in dataloader_valid:
#     model.eval()
#     count, hate_speech, offensive_language, neither, target, tweet = datas[0], datas[1], datas[2], datas[3], datas[4], datas[5]
#     pred = model(list(tweet), True)
#     break

In [None]:
# pred, w = pred[0], pred[1]

In [None]:
# idx = 6
# print(f"Prediction: {pred[idx].argmax().item()}. Ground-Truth: {target[idx].item()}")
# print(tweet[idx])
# w_pure = w[idx][w[idx] !=0 ][1:].cpu().detach().numpy()
# tokens = tokenizer.tokenize(tweet[idx])
# full = list(zip(tokens, w_pure))
# display(full)
# # here `w_pure.sum() != 1` because the <cls> token's score is not included, so the sum will be smaller than one.

In [None]:
# # combine tokens to word by handling '##'
# out = []
# out.append(list(full[0]))
# for i in range(1, len(full)):
#     if full[i][0][:2] == '##':
#         out[-1][0] += full[i][0][2:]
#         out[-1][1] += full[i][1]
#     else:
#         out.append(list(full[i]))
# out

In [None]:
# # score by attention score
# sorted(out, key=lambda out: out[1], reverse=True)