In [1]:
%pip install torch
%pip install transformers
%pip install wandb

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# Cache Space

In [2]:
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8
%env TRANSFORMERS_CACHE=/home/ec2-user/SageMaker/cache
%env HF_DATASETS_CACHE=/home/ec2-user/SageMaker/cache
%env CUDA_LAUNCH_BLOCKING=1

env: LC_ALL=C.UTF-8
env: LANG=C.UTF-8
env: TRANSFORMERS_CACHE=/home/ec2-user/SageMaker/cache
env: HF_DATASETS_CACHE=/home/ec2-user/SageMaker/cache
env: CUDA_LAUNCH_BLOCKING=1


# Import Lib

In [3]:
import yaml
import random, os
import argparse
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm
from torch.autograd import Variable
from tqdm import tqdm
import transformers
from torch.nn import NLLLoss
import logging
import json
import torch.nn as nn
import re
from torch.autograd import Variable
import wandb
from pathlib import Path
import math


logger = logging.getLogger(__name__)



# Data Loader

In [4]:
def reform_label(tokens, label, tokenizer, max_seq_length):
    new_tokens = list()
    new_label = list()
    for step, token in enumerate(tokens[:-1]):
        split_token = tokenizer.tokenize(token)
        if len(split_token) > 0:
            new_tokens.extend(split_token)
            
    new_label.append(label)
    new_tokens = new_tokens[:max_seq_length] + ['EOS']
    
    return new_tokens, new_label

def tok2int_sent(example, tokenizer, max_seq_length):
    src_tokens = example[0]
    src_label = example[1]

    src_tokens, src_label = reform_label(src_tokens, src_label, tokenizer, max_seq_length)

    tokens = src_tokens
    tokens = ["[CLS]"] + tokens
    label = src_label
    input_seg = [1] * len(tokens)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)

    max_len = max_seq_length * 2 + 3
    padding = [0] * (max_len - len(input_ids))
    input_ids += padding
    input_mask += padding
    input_seg += padding

    return input_ids, input_mask, input_seg, label

def tok2int_list(data, tokenizer, max_seq_length):
    inps = list()
    msks = list()
    segs = list()
    labs = list()
    for examples in data:
        input_ids, input_mask, input_seg, labels = tok2int_sent(examples, tokenizer, max_seq_length)
        inps.append(input_ids)
        msks.append(input_mask)
        segs.append(input_seg)
        labs.append(labels)
        
    return inps, msks, segs, labs


class DataLoader(object):
    ''' For data iteration '''

    def __init__(self, data_path, tokenizer, args, test=False, batch_size=64):
        self.cuda = args.cuda
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.max_len = args.max_len
        self.data_path = data_path
        self.test = test
        self.src_flag = args.src_flag
        self.hyp_flag = args.hyp_flag
        examples = self.read_file(data_path)
        self.examples = examples
        self.total_num = len(examples)
        if self.test:
            self.total_step = np.ceil(self.total_num * 1.0 / batch_size)
        else:
            self.total_step = self.total_num / batch_size
            self.shuffle()
        self.step = 0
    
    def read_file(self, data_path):
        data_list = list()
        data = pd.read_csv(data_path)
        data = data[data['grammarScore']>0]
        data.dropna(inplace=True)

        for _ , row in data.iterrows():
            example = list()
            line = row['transcript']
            src_token = line.split()
            src_label = float(row['grammarScore'])
            example.append(src_token)
            example.append(src_label)
            data_list.append(example)
        return data_list


    def shuffle(self):
        np.random.shuffle(self.examples)

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def __len__(self):
        return self._n_batch

    def next(self):
        ''' Get the next batch '''
        if self.step < self.total_step:
            examples = self.examples[self.step * self.batch_size : (self.step+1)*self.batch_size]
            
            inp, msk, seg, score = tok2int_list(examples, self.tokenizer, self.max_len)

            inp_tensor = Variable(
                torch.LongTensor(inp))
            msk_tensor = Variable(
                torch.LongTensor(msk))
            seg_tensor = Variable(
                torch.LongTensor(seg))
            score_tensor = Variable(
                torch.LongTensor(score))

            if self.cuda:
                inp_tensor = inp_tensor.cuda()
                msk_tensor = msk_tensor.cuda()
                seg_tensor = seg_tensor.cuda()
                score_tensor = score_tensor.cuda()

            self.step += 1
            return inp_tensor, msk_tensor, seg_tensor, score_tensor

        else:
            self.step = 0
            if not self.test:
                self.shuffle()
            raise StopIteration()

# Model Eval

In [5]:
from scipy.stats import pearsonr,spearmanr
from sklearn.metrics import mean_squared_error
import numpy as np

def eval_result(predicts, labels):
    spearman_corr, _ = spearmanr(predicts, labels)
    corr, _ = pearsonr(predicts, labels)
    res = {"Prearson Corr":corr,"Spearman Corr":spearman_corr,"Eval Loss":np.sqrt(mean_squared_error(predicts, labels))}
    return res

def eval_model(model, validset_reader):
    model.eval()
    predicts = list()
    labels = list()
    with torch.no_grad():
        for step, (inp_tensor, msk_tensor, seg_tensor, score_tensor) in tqdm(enumerate(validset_reader)):
            # print("Eval Input --",inp_tensor)
            # print("Eval msk_tensor --",msk_tensor)
            # print("Eval seg_tensor --",seg_tensor)
            
            score_tensor = score_tensor.to(torch.float)
            prob = model(inp_tensor, msk_tensor, seg_tensor)

            # print("Eval output---->",prob.view(-1).tolist())
            
            predict = prob.type_as(score_tensor).view(-1).tolist()
            score = score_tensor.view(-1).tolist()
            predicts.extend(predict)
            labels.extend(score)
            
        results = eval_result(predicts, labels)
            
    return results

In [6]:
# list_net_loss

def listnet_loss(y_i, z_i):
    P_y_i = F.softmax(y_i.float(), dim=0)
    P_z_i = F.softmax(z_i.float(), dim=0)
    return - torch.sum(P_y_i * torch.log(P_z_i))

# Train

In [7]:
# Co teaching

In [8]:
def train_model_co_teaching(model1, model2, args, trainset_reader, validset_reader):
    saved_checkpoints1 = []
    saved_checkpoints2 = []
    
    save_path = args.outdir
    forget_rate = args.forget_rate 
    num_gradual = int(args.num_train_epochs / 2)
    running_loss1 = 0.0
    running_loss2 = 0.0
    running_loss  = 0.0

    t_total = int(
        trainset_reader.total_step / args.gradient_accumulation_steps * args.num_train_epochs
    )

    optimizer1 = transformers.AdamW(model1.parameters(), lr=args.learning_rate, eps=1e-8)
    optimizer2 = transformers.AdamW(model2.parameters(), lr=args.learning_rate, eps=1e-8)

    scheduler1 = transformers.get_linear_schedule_with_warmup(
        optimizer1, num_warmup_steps=0, num_training_steps=t_total
    )
    scheduler2 = transformers.get_linear_schedule_with_warmup(
        optimizer2, num_warmup_steps=0, num_training_steps=t_total
    )

    global_step = 0
    for epoch in range(int(args.num_train_epochs)):
        model1.train()
        model2.train()

        # current_forget_rate = min(forget_rate, forget_rate * (1 - epoch / num_gradual))
        current_forget_rate = forget_rate

        for inp_tensor, msk_tensor, seg_tensor, score_tensor in tqdm(trainset_reader):
            optimizer1.zero_grad()
            optimizer2.zero_grad()

            pred1 = model1(inp_tensor, msk_tensor, seg_tensor).view(-1)
            pred2 = model2(inp_tensor, msk_tensor, seg_tensor).view(-1)

            score_tensor = score_tensor.view(-1).to(torch.float)

            loss1 = F.mse_loss(pred1, score_tensor, reduction='none')
            loss2 = F.mse_loss(pred2, score_tensor, reduction='none')

            idx1_sorted = torch.argsort(loss1).detach()
            idx2_sorted = torch.argsort(loss2).detach()

            num_remember = int((1 - current_forget_rate) * len(loss1))

            idx1 = idx1_sorted[:num_remember]
            idx2 = idx2_sorted[:num_remember]

            clean_loss1 = loss1[idx2].mean()
            clean_loss2 = loss2[idx1].mean()
            
            clean_loss1.backward()
            clean_loss2.backward()

            running_loss1 += clean_loss1.item()
            running_loss2 += clean_loss2.item()
            running_loss += (clean_loss1.item()+clean_loss2.item())

            global_step += 1
            if global_step % args.gradient_accumulation_steps == 0:
                optimizer1.step()
                optimizer2.step()
                scheduler1.step()
                scheduler2.step()
                optimizer1.zero_grad()
                optimizer2.zero_grad()

        logger.info(
            f'Epoch: {epoch},Model Loss:{running_loss/global_step}, Loss Model1: {running_loss1 / global_step}, Loss Model2: {running_loss2 / global_step}, Forget Rate: {current_forget_rate}'
        )
        
        train_res = {
            "Train Loss": running_loss / global_step,
            "Train Loss 1": running_loss1 / global_step,
            "Train Loss 2": running_loss2 / global_step,
            "Learning Rate Model 1": scheduler1.get_last_lr()[0],
            "Learning Rate Model 2": scheduler2.get_last_lr()[0],
        }

        logger.info('Start eval for Model 1!')
        result_dict1 = eval_model(model1, validset_reader)
        logger.info(result_dict1)

        logger.info('Start eval for Model 2!')
        result_dict2 = eval_model(model2, validset_reader)
        logger.info(result_dict2)

        train_res.update({"Model 1 Validation": result_dict1, "Model 2 Validation": result_dict2})
        wandb.log(train_res)

        check_point_path1 = save_path + f"/model1_{epoch}_best.pt"
        check_point_path2 = save_path + f"/model2_{epoch}_best.pt"

        torch.save({'epoch': epoch,
                    'model': model1.state_dict()},check_point_path1)
        torch.save({'epoch': epoch,
            'model': model2.state_dict()},check_point_path2)
        
        saved_checkpoints1.append(check_point_path1)
        saved_checkpoints2.append(check_point_path2)

        if len(saved_checkpoints1) > args.max_model_save:
            old_checkpoint1 = saved_checkpoints1.pop(0)
            old_checkpoint2 = saved_checkpoints2.pop(0)
            
            if os.path.exists(old_checkpoint1):
                os.remove(old_checkpoint1)
                
            if os.path.exists(old_checkpoint2):
                os.remove(old_checkpoint2)


In [9]:
# Co Teaching Plus

In [10]:
def train_model_co_teaching_plus(model1, model2, args, trainset_reader, validset_reader):
    saved_checkpoints1 = []
    saved_checkpoints2 = []
    
    save_path = args.outdir
    best_acc = 0.0
    running_loss = 0.0
    running_loss1 = 0.0
    running_loss2 = 0.0

    forget_rate = args.forget_rate
    num_gradual = int(args.num_train_epochs / 2)
    rate_schedule = np.linspace(forget_rate, 0, num_gradual).tolist()

    t_total = int(trainset_reader.total_step / args.gradient_accumulation_steps * args.num_train_epochs)

    optimizer1 = transformers.AdamW(model1.parameters(), lr=args.learning_rate, eps=1e-8)
    optimizer2 = transformers.AdamW(model2.parameters(), lr=args.learning_rate, eps=1e-8)

    scheduler1 = transformers.get_linear_schedule_with_warmup(
        optimizer1, num_warmup_steps=0, num_training_steps=t_total
    )
    scheduler2 = transformers.get_linear_schedule_with_warmup(
        optimizer2, num_warmup_steps=0, num_training_steps=t_total
    )

    global_step = 0
    for epoch in range(int(args.num_train_epochs)):
        optimizer1.zero_grad()
        optimizer2.zero_grad()

        current_forget_rate = rate_schedule[epoch] if epoch < num_gradual else 0.0

        for inp_tensor, msk_tensor, seg_tensor, score_tensor in tqdm(trainset_reader):
            model1.train()
            model2.train()

            pred_score1 = model1(inp_tensor, msk_tensor, seg_tensor).view(-1)
            pred_score2 = model2(inp_tensor, msk_tensor, seg_tensor).view(-1)

            score_tensor = score_tensor.view(-1).to(torch.float)
            
            ##** Loss Based Selection **##

#             loss1 = F.mse_loss(pred_score1, score_tensor, reduction='none')
#             loss2 = F.mse_loss(pred_score2, score_tensor, reduction='none')

#             idx1_sorted = torch.argsort(loss1).detach()
#             idx2_sorted = torch.argsort(loss2).detach()

#             num_remember = int((1 - current_forget_rate) * len(loss1))

#             idx1_small = idx1_sorted[:num_remember]
#             idx2_small = idx2_sorted[:num_remember]

#             loss1_final = F.mse_loss(pred_score1[idx2_small], score_tensor[idx2_small])
#             loss2_final = F.mse_loss(pred_score2[idx1_small], score_tensor[idx1_small])

#             loss1_final.backward()
#             loss2_final.backward()    

            ##** Agreement Based Selection **##
    
            loss1 = F.mse_loss(pred_score1, score_tensor, reduction='none')
            loss2 = F.mse_loss(pred_score2, score_tensor, reduction='none')
            
            pred_diff = torch.abs(pred_score1 - pred_score2)
            agreement_mask = pred_diff <= args.agreement_threshold

            loss1_agreed = loss1[agreement_mask]
            loss2_agreed = loss2[agreement_mask]

            idx1_sorted = torch.argsort(loss1_agreed).detach()
            idx2_sorted = torch.argsort(loss2_agreed).detach()

            num_remember = int((1 - current_forget_rate) * len(loss1_agreed))

            idx1_small = idx1_sorted[:num_remember]
            idx2_small = idx2_sorted[:num_remember]

            loss1_final = F.mse_loss(pred_score1[idx2_small], score_tensor[idx2_small])
            loss2_final = F.mse_loss(pred_score2[idx1_small], score_tensor[idx1_small])

            loss1_final.backward()
            loss2_final.backward()
    
            running_loss1 += loss1_final.item()
            running_loss1 += loss2_final.item()
            running_loss += (loss1_final.item() + loss2_final.item())
            
            global_step += 1
            if global_step % args.gradient_accumulation_steps == 0:
                optimizer1.step()
                optimizer2.step()
                scheduler1.step()
                scheduler2.step()
                optimizer1.zero_grad()
                optimizer2.zero_grad()

        logger.info('Epoch: {}, Loss:{}, Loss1:{}, Loss2: {}, LR1: {}, LR2: {}'.format(epoch, running_loss / global_step,running_loss1 / global_step, running_loss2 / global_step, scheduler1.get_last_lr()[0], scheduler2.get_last_lr()[0]))

        train_res = {
            "Train Loss": running_loss / global_step,
            "Train Loss 1": running_loss1 / global_step,
            "Train Loss 2": running_loss2 / global_step,
            "Learning Rate Model 1": scheduler1.get_last_lr()[0],
            "Learning Rate Model 2": scheduler2.get_last_lr()[0],
        }

        wandb.log(train_res)

        logger.info('Start eval for Model 1!')
        result_dict1 = eval_model(model1, validset_reader)
        logger.info(result_dict1)

        logger.info('Start eval for Model 2!')
        result_dict2 = eval_model(model2, validset_reader)
        logger.info(result_dict2)

        train_res.update({"Model 1 Validation": result_dict1, "Model 2 Validation": result_dict2})
        wandb.log(train_res)

        check_point_path1 = save_path + f"/model1_{epoch}_best.pt"
        check_point_path2 = save_path + f"/model2_{epoch}_best.pt"

        torch.save({'epoch': epoch,
                    'model': model1.state_dict()},check_point_path1)
        torch.save({'epoch': epoch,
            'model': model2.state_dict()},check_point_path2)
        
        saved_checkpoints1.append(check_point_path1)
        saved_checkpoints2.append(check_point_path2)

        if len(saved_checkpoints1) > args.max_model_save:
            old_checkpoint1 = saved_checkpoints1.pop(0)
            old_checkpoint2 = saved_checkpoints2.pop(0)
            
            if os.path.exists(old_checkpoint1):
                os.remove(old_checkpoint1)
                
            if os.path.exists(old_checkpoint2):
                os.remove(old_checkpoint2)


# Attention Layer

In [11]:
class inference_model(nn.Module):
    def __init__(self, bert_model, args):
        super(inference_model, self).__init__()
        self.bert_hidden_dim = args.bert_hidden_dim
        self.pred_model = bert_model
        self.model_name = args.bert_pretrain
        self.max_len = args.max_len * 2 + 3
        self.proj_hidden = nn.Linear(self.bert_hidden_dim, 1)


    def forward(self, inp_tensor, msk_tensor, seg_tensor,score_flag=True):
        inp_tensor = inp_tensor.view(-1, self.max_len)
        msk_tensor = msk_tensor.view(-1, self.max_len)
        seg_tensor = seg_tensor.view(-1, self.max_len)
        
        if "bert" in self.model_name.lower():
            outputs = self.pred_model(inp_tensor, msk_tensor, seg_tensor)            
        elif "electra" in self.model_name.lower():
            outputs = self.pred_model(inp_tensor, msk_tensor)
        else:
            BaseException ("Not implement!")
        
        pred_score = self.proj_hidden(outputs.pooler_output)

        return pred_score

In [12]:
class Config:
    def __init__(self, **entries):
        self.__dict__.update(entries)

if __name__ == "__main__":
    config_file = 'config.yaml'
    with open(config_file, "r") as ymlfile:
        config_dict = yaml.load(ymlfile, Loader=yaml.FullLoader)

    args = Config(**config_dict)

    if not os.path.exists(args.outdir):
        Path(args.outdir).mkdir(parents=True, exist_ok=True)
        
    handlers = [logging.FileHandler(os.path.abspath(args.outdir) + '/train_log.txt'), logging.StreamHandler()]
    logging.basicConfig(format='[%(asctime)s] %(levelname)s: %(message)s', level=logging.DEBUG,
                        datefmt='%d-%m-%Y %H:%M:%S', handlers=handlers)
    logger.info(args)

    tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
    
    logger.info('Start training!')
    logger.info("loading training set")
    trainset_reader = DataLoader(args.train_path, tokenizer, args, batch_size=args.train_batch_size)
    logger.info("loading validation set")
    validset_reader = DataLoader(args.test_path, tokenizer, args, batch_size=args.valid_batch_size)
    logger.info('initializing estimator model')

[27-11-2024 02:30:06] INFO: <__main__.Config object at 0x7f44c02a7940>
[27-11-2024 02:30:06] DEBUG: Starting new HTTPS connection (1): huggingface.co:443
[27-11-2024 02:30:07] DEBUG: https://huggingface.co:443 "HEAD /bert-base-uncased/resolve/main/tokenizer_config.json HTTP/11" 200 0
[27-11-2024 02:30:07] INFO: Start training!
[27-11-2024 02:30:07] INFO: loading training set
[27-11-2024 02:30:08] INFO: loading validation set
[27-11-2024 02:30:08] INFO: initializing estimator model


In [13]:
model_path = '/home/ec2-user/SageMaker/bert_model/model_best.pt'
model = torch.load(model_path)

config_path = '/home/ec2-user/SageMaker/bert_model/config.json'
config = transformers.BertConfig.from_json_file(config_path)

bert_model_1 = transformers.AutoModel.from_pretrained(model_path, config=config)

bert_model_2 = transformers.BertModel.from_pretrained("bert-base-uncased")

Some weights of BertModel were not initialized from the model checkpoint at /home/ec2-user/SageMaker/bert_model/model_best.pt and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.outp

In [14]:
bert_model_1 = bert_model_1.cuda()
bert_model_2 = bert_model_2.cuda()

ori_model1 = inference_model(bert_model_1, args)
ori_model2 = inference_model(bert_model_2, args)

In [15]:
model_1 = ori_model1
model_2 = ori_model2

model_1 = model_1.cuda()
model_2 = model_2.cuda()

In [None]:
wandb.login(key=args.wandb_key)
wandb.init(project=args.wandb_proj_name, config=args, name=args.wandb_run_name)

# train_model_co_teaching_plus(model_1,model_2, args, trainset_reader, validset_reader)
train_model_co_teaching(model_1,model_2, args, trainset_reader, validset_reader)

wandb.finish()

[27-11-2024 02:30:12] DEBUG: Starting new HTTPS connection (1): ip-172-16-95-62.ap-south-1.compute.internal:8443
[27-11-2024 02:30:12] ERROR: Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[27-11-2024 02:30:12] DEBUG: Starting new HTTPS connection (1): api.wandb.ai:443
[27-11-2024 02:30:12] DEBUG: https://api.wandb.ai:443 "POST /graphql HTTP/11" 200 1997
[27-11-2024 02:30:12] DEBUG: https://api.wandb.ai:443 "POST /graphql HTTP/11" 200 374
[34m[1mwandb[0m: Currently logged in as: [33mshubham-kumar1[0m ([33mshubham-kumar1-shl[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/ec2-user/.netrc
[27-11-2024 02:30:12] DEBUG: Starting new HTTPS connection (1): api.wandb.ai:443
[27-11-2024 0

1252it [52:41,  2.53s/it]
[27-11-2024 03:22:55] INFO: Epoch: 0,Model Loss:1.9041461065506782, Loss Model1: 0.9592586560680653, Loss Model2: 0.944887450482613, Forget Rate: 0.2
[27-11-2024 03:22:55] INFO: Start eval for Model 1!
68it [00:28,  2.38it/s]
[27-11-2024 03:23:24] INFO: {'Prearson Corr': -0.008571035167081922, 'Spearman Corr': 0.000946886729483648, 'Eval Loss': 1.3891738395634985}
[27-11-2024 03:23:24] INFO: Start eval for Model 2!
68it [00:28,  2.37it/s]
[27-11-2024 03:23:53] INFO: {'Prearson Corr': 0.4084534936163933, 'Spearman Corr': 0.41493437381684706, 'Eval Loss': 1.2912103416471448}
1252it [52:42,  2.53s/it]
[27-11-2024 04:16:36] INFO: Epoch: 1,Model Loss:1.807532645613193, Loss Model1: 0.9058576483672229, Loss Model2: 0.9016749972459702, Forget Rate: 0.2
[27-11-2024 04:16:36] INFO: Start eval for Model 1!
68it [00:28,  2.38it/s]
[27-11-2024 04:17:05] INFO: {'Prearson Corr': 0.1319728148230159, 'Spearman Corr': 0.1463363949953545, 'Eval Loss': 1.7286005455723192}
[27-11