In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch import optim
import sys
import random
import math
import time
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

from transformers import BertTokenizer, AutoTokenizer
from transformers import BertModel, AutoModel, AutoModelForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

from torch.utils.tensorboard import SummaryWriter

use_cuda = True if torch.cuda.is_available() else False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# use_cuda=False
# device='cpu'

torch.autograd.set_detect_anomaly(True)
torch.backends.cudnn.benchmark = True
np.random.seed(0)
torch.manual_seed(0)

base_model = 'bert-base-uncased'
model_list = ['bert-base-uncased', 'bert-base-multilingual-uncased', 'google/muril-base-cased', 'xlm-roberta-base',
              'ai4bharat/indic-bert','cardiffnlp/twitter-xlm-roberta-base','cardiffnlp/twitter-xlm-roberta-base-sentiment',
              'cardiffnlp/twitter-roberta-base', 'cardiffnlp/twitter-roberta-base-sentiment',
              'cardiffnlp/twitter-roberta-base-hate', 'roberta-base']
model_path = '/mnt/saved_models/'
results_path = '/mnt/saved_results/'

In [3]:
lang = 'hx_entail_flip'
model_choice = 0

In [4]:
# writer = SummaryWriter(log_dir="/home/jupyter/tboard/" + base_model + "_" + lang)
device

device(type='cuda')

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_list[model_choice])

MAX_SEQ_LEN = 128

label_idx = 2
text_idx = 0
ltext_idx = 1

class HateData(Dataset):
    def __init__(self, data_path, split='train', lang='bengali'):
        self.split = split
        self.data = pd.read_csv(data_path + lang + "_" + split + ".tsv", sep='\t')
        # self.data = pd.read_parquet(data_path + lang + "_" + split + ".parquet", engine='fastparquet')

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        if torch.is_tensor(index):
            index = index.tolist()

        data = self.data.iloc[index]

        ltext = str(data[ltext_idx])
        text = str(data[text_idx])
        labels = int(data[label_idx])
      
        inputs = tokenizer(text, ltext, padding='max_length', truncation=True, max_length=MAX_SEQ_LEN)
        # print(inputs)
        input_ids = inputs['input_ids']
        token_type_ids = inputs['token_type_ids']#np.zeros(MAX_SEQ_LEN)#
        attn_mask = inputs['attention_mask']

        input_ids = torch.tensor(input_ids, dtype=torch.long).view(MAX_SEQ_LEN)
        token_type_ids = torch.tensor(token_type_ids, dtype=torch.long).view(MAX_SEQ_LEN)
        attn_mask = torch.tensor(attn_mask, dtype=torch.long).view(MAX_SEQ_LEN)
        labels = torch.tensor(labels, dtype=torch.long).view(1)


        return input_ids, attn_mask, token_type_ids, labels


# train_data = HateData(data_path="/home/jupyter/multilingual/train_data/", split='train')
# dataload = DataLoader(train_data, batch_size=4)

# for i in (dataload):
#     print(i[0].shape)
#     print(i[1].shape)
#     print(i[2].shape)
#     print(i[3].shape)
#     break




In [6]:
class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()

        H1, H2, num_class = 768, 128, 2
        self.bert = AutoModel.from_pretrained(model_list[model_choice])
        
        # for param in self.bert.parameters():
        #     param.requires_grad = False

        self.clf = nn.Sequential(
            nn.Linear(H1, H2),
            nn.ReLU(),
            nn.Linear(H2, H2),
            nn.ReLU(),
            nn.Linear(H2, num_class)
        )

        
    def forward(self, input_ids, attn_mask, token_type_ids):  
        outputs = self.bert(input_ids, attn_mask, token_type_ids)
        cls_emb = outputs.pooler_output # (batch, 768)
        logits = self.clf(cls_emb) # (batch, num_class)
        return logits


In [7]:
loss_fn = nn.CrossEntropyLoss()#weight=torch.tensor([1.0,2.0], device=device)

In [8]:
def train(input_ids, attn_mask, token_type_ids, label, model, model_opt, scdl):

    model_opt.zero_grad()

    batch_size = input_ids.shape[0]
    seq_len = input_ids.shape[1]

    loss = 0.0

    if use_cuda:
        input_ids = input_ids.to(device)
        attn_mask = attn_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        label = label.to(device)

    label = label.flatten()
    
    logits = model(input_ids, attn_mask, token_type_ids)

    loss = loss_fn(logits, label)

    # if torch.isnan(loss):
    #     pass
    # else:
    loss.backward()
    # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # clip gradients to prevent exploding
    model_opt.step()
    scdl.step()
    # print(loss)
    return float(loss.item())



In [9]:
def evaluate(input_ids, attn_mask, token_type_ids, label, model, mode='train'):
   
    batch_size = input_ids.shape[0]
    seq_len = input_ids.shape[1]


    with torch.no_grad():
        if use_cuda:
            input_ids = input_ids.to(device)
            attn_mask = attn_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            label = label.to(device)

        label = label.flatten()
        
        logits = model(input_ids, attn_mask, token_type_ids)
        # logits = nn.Softmax(dim=1)(logits)[0]
        loss = loss_fn(logits, label)
        
        if mode == 'train':
            return float(loss.item())
        
        preds = torch.argmax(logits, dim=1).flatten()#[0]
        # acc = (preds == label).cpu().numpy().mean() * 100

        return float(loss.item()), preds.cpu().numpy()
        



In [11]:
# df_test = pd.read_csv("/home/jupyter/data/test_data/hx_test.tsv", sep='\t')#, lineterminator='\n')
df_test = pd.read_csv("/mnt/test_data/hx_entail_flip_test.tsv", sep='\t')
gt_labels = np.array(df_test['class'])

In [12]:
len(gt_labels)# f1_score(torch.tensor(gt_labels), torch.tensor(gt_labels), average='macro')

5772

In [13]:
def trainIters(model, epochs, train_loader, test_loader, learning_rate=3e-5, log_step=180, valid_step=360, mode='train'):

    model_opt = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)
    num_train_steps = (len(train_loader)*epochs) 
    scdl = get_linear_schedule_with_warmup(model_opt, num_warmup_steps=int(0.1*num_train_steps), num_training_steps=num_train_steps)

    print("Initialised optimizer and lr scheduler")

    # valid_best_loss = [] 
    best_acc = 0.0 
    tot = len(train_data) // train_loader.batch_size
    tot_val = len(val_data) // test_loader.batch_size
    plot_steps = 0
    
    for epoch in range(epochs):
        train_loss_total = 0.0
        train_step = 0
        # Training
        
        model.train()        
        for entry in tqdm(train_loader, total=tot, position=0, leave=True):
            loss = train(entry[0], entry[1], entry[2], entry[3], model, model_opt, scdl)
            plot_steps += 1
            train_step += 1
            # if not math.isnan(loss) :      
            train_loss_total = train_loss_total + loss
            
            train_loss = train_loss_total / train_step
            
            # if plot_steps % log_step == 0:
            #     writer.add_scalar("Train Loss", train_loss, plot_steps)
            
            if (plot_steps % valid_step == 0) or (plot_steps >= num_train_steps - 1):
                model.eval()
                test_pred = []

                for entry in tqdm(test_loader, total=tot_val, position=0, leave=True):
                    loss_v, pred_v = evaluate(entry[0], entry[1], entry[2], entry[3], model, mode='test')
                    # if not math.isnan(loss) :      
                    test_pred.extend([pd for pd in pred_v])

                # val_acc = (test_pred == gt_labels).mean().item()
                val_acc = f1_score(gt_labels, test_pred, average='macro')
                print("Validation F1: " + str(val_acc))
                # writer.add_scalar("Val F1", val_acc, plot_steps)


                if val_acc > best_acc:
                    torch.save(model.state_dict(), model_path + "model_" + base_model + "_" + lang + ".pth") 
                    print("Model saved for step: " + str(plot_steps))
                    best_acc = val_acc         

                model.train()
            # writer.flush()
                

        print('epoch: '+str(epoch))
        print('total loss: '+str(train_loss_total/tot))

        # wr_train = open(results_path + "train_loss_" + base_model + ".txt", "a")
        # wr_train.write("epoch " + str(epoch) + ": " + str(train_loss_total/tot) + "\n")
        # wr_train.close()


        

In [14]:
train_data = HateData(data_path="/mnt/train_data/", split='train', lang=lang)
val_data = HateData(data_path="/mnt/test_data/", split='test', lang=lang)

In [15]:
BS = 64
# weights = [1.0]*15383
# weights.extend([0.5]*(len(train_data) - 15383))
# sampler = WeightedRandomSampler(weights, num_samples=20000)

dataload = DataLoader(train_data, batch_size=BS, shuffle=True)
dataload_val = DataLoader(val_data, batch_size=BS, shuffle=False)

In [16]:
(len(train_data)/64)//2

360.0

In [17]:
model = Classifier()
model = model.float()
# model = nn.DataParallel(model)
model = model.to(device)
# model = Classifier()
# model = nn.DataParallel(model)
# model.load_state_dict(torch.load("/mnt/saved_models/model_twitter-xlm-roberta-base-sentiment_multilingual.pth", map_location=device))
# model = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
trainIters(model, 5, dataload, dataload_val)

Initialised optimizer and lr scheduler


 40%|███▉      | 286/721 [02:55<04:22,  1.66it/s]

## 

In [14]:
device

device(type='cuda', index=0)

In [21]:
lang = 'multilingual'

######################## TESTING ######################

In [20]:
model = Classifier()
# model = nn.DataParallel(model)
model.load_state_dict(torch.load("/mnt/saved_models/model_bert-base-uncased_" + lang + ".pth", map_location=device))
model = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
lang #= 'german'

'hx_entail_flip'

In [21]:
# test_data = HateData(data_path="/home/jupyter/data/test_data/bq_test_" + lang + "_process_10k.csv")
test_data = HateData(data_path="/mnt/test_data/", split='test', lang=lang)
test_loader = DataLoader(test_data, batch_size=1, shuffle=False)

In [22]:
model.eval()
test_loss = []
test_pred = []

# wr = open(results_path + "test_prediction_" + base_model + "_" + lang + "_process_10k.txt", "w")     
wr = open(results_path + "test_prediction_" + base_model + "_" + lang + ".txt", "w")    
for entry in tqdm(test_loader, total=len(test_data)//test_loader.batch_size, position=0, leave=True):
    v_loss, v_pred = evaluate(entry[0], entry[1], entry[2], entry[3], model, mode='test')
    test_loss.append(v_loss)
    test_pred.append(v_pred)
    wr.write(str(v_pred)+"\n")

test_loss = np.mean(test_loss)#.item()

print("Test Loss: ", test_loss)

wr.close()

100%|██████████| 5772/5772 [00:46<00:00, 123.59it/s]

Test Loss:  0.44468180327440665





In [23]:
# df_test = pd.read_csv("/home/jupyter/data/test_data/bq_test_" + lang + "_process_10k.csv", sep=',')
# df_test = pd.read_csv("/home/jupyter/data/test_data/hx_test.tsv", sep='\t')#, lineterminator='\n')
df_test = pd.read_csv("/mnt/test_data/hx_entail_flip_test.tsv", sep='\t')
# gt_labels = np.array(df_test['class'])
# mp = {'hate':1, 'normal':0}
# df_test['label'].replace(mp, inplace=True)
# gt_labels = np.array(df_test['class'])

In [30]:
actual_pred = []
gt_labels = []

for i in range(0, len(df_test)-2, 3):
    row = df_test.iloc[i]
    if row[1] == 'this is hateful':
        gt_labels.append(2)
    elif row[1] == 'this is offensive':
        gt_labels.append(1)
    else:
        gt_labels.append(0)
    
    pred_offset = np.argmax([test_pred[i], test_pred[i+1], test_pred[i+2]])
 
    if df_test.iloc[i+pred_offset][1] == 'this is hateful':
        actual_pred.append(2)
    elif df_test.iloc[i+pred_offset][1] == 'this is offensive':
        actual_pred.append(1)
    else:
        actual_pred.append(0)

In [31]:
print(classification_report(gt_labels, actual_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8176    0.9057    0.8594       594
           1     0.7821    0.5894    0.6722       548
           2     0.7925    0.8645    0.8269       782

    accuracy                         0.7989      1924
   macro avg     0.7974    0.7865    0.7862      1924
weighted avg     0.7973    0.7989    0.7929      1924



In [45]:
# actual_pred = []
# gt_labels = []

# for i in range(0, len(df_test), 3):
#     row = df_test.iloc[i]
#     if row[1] == 'this is hateful':
#         gt_labels.append(2)
#         actual_pred.append(1+test_pred[i])
#     elif row[1] == 'this is offensive':
#         gt_labels.append(1)
#         actual_pred.append(test_pred[i])
#     else:
#         gt_labels.append(0)
#         actual_pred.append(1-test_pred[i])

In [None]:
76.39, 73.01 - 16 batch, 3e-5, no token ids
79.68, 77.32 - 64, 3e-5

In [None]:
69.23, 67.69 - entail (64, 3e-5)


In [None]:
latent2 - 80.14, 78.44

In [19]:
test_pred # 79.43,76.89

[array([0]),
 array([2]),
 array([0]),
 array([0]),
 array([0]),
 array([0]),
 array([2]),
 array([2]),
 array([2]),
 array([1]),
 array([0]),
 array([0]),
 array([1]),
 array([2]),
 array([2]),
 array([1]),
 array([1]),
 array([0]),
 array([0]),
 array([2]),
 array([2]),
 array([2]),
 array([0]),
 array([2]),
 array([2]),
 array([2]),
 array([2]),
 array([0]),
 array([2]),
 array([2]),
 array([0]),
 array([2]),
 array([0]),
 array([2]),
 array([0]),
 array([0]),
 array([0]),
 array([0]),
 array([1]),
 array([1]),
 array([0]),
 array([0]),
 array([0]),
 array([2]),
 array([0]),
 array([2]),
 array([2]),
 array([0]),
 array([0]),
 array([1]),
 array([2]),
 array([2]),
 array([1]),
 array([0]),
 array([2]),
 array([2]),
 array([0]),
 array([2]),
 array([2]),
 array([2]),
 array([1]),
 array([1]),
 array([0]),
 array([2]),
 array([2]),
 array([2]),
 array([0]),
 array([0]),
 array([1]),
 array([0]),
 array([2]),
 array([2]),
 array([1]),
 array([2]),
 array([1]),
 array([0]),
 array([2]),

In [None]:
68.76,67.56

In [None]:
68.61,67.2

In [None]:
68.92,67.81

In [None]:
69.5,67.80

In [None]:
69.59,67.86

In [None]:
aug-70.32,68.89