In [1]:
# Install the third-party packages this notebook imports into the kernel's environment.
# NOTE(review): versions are unpinned, so a fresh run may pull releases with different APIs;
# consider pinning (e.g. `%pip install -q transformers==<version>`).
%pip install torch
%pip install transformers
%pip install wandb

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# Environment and Cache Directories

In [2]:
# Locale settings for the kernel process.
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8
# Point the Hugging Face model and dataset caches at the same directory.
# NOTE(review): the path is an absolute SageMaker location; make it configurable if the
# notebook has to run anywhere else.
%env TRANSFORMERS_CACHE=/home/ec2-user/SageMaker/cache
%env HF_DATASETS_CACHE=/home/ec2-user/SageMaker/cache
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid; confirm it is meant to
# stay enabled for full training runs.
%env CUDA_LAUNCH_BLOCKING=1

env: LC_ALL=C.UTF-8
env: LANG=C.UTF-8
env: TRANSFORMERS_CACHE=/home/ec2-user/SageMaker/cache
env: HF_DATASETS_CACHE=/home/ec2-user/SageMaker/cache
env: CUDA_LAUNCH_BLOCKING=1


# Import Libraries

In [3]:
import yaml
import random, os
import argparse
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm
from torch.autograd import Variable
from tqdm import tqdm
import transformers
from torch.nn import NLLLoss
import logging
import json
import torch.nn as nn
import re
from torch.autograd import Variable
import wandb
from pathlib import Path
import math


logger = logging.getLogger(__name__)



# Data Loader

In [4]:
def reform_label(tokens, label, tokenizer, max_seq_length):
    """Split whitespace tokens into tokenizer pieces and wrap the label in a list.

    Args:
        tokens: sequence of word strings; the final element is not tokenized.
        label: score attached to the sentence, returned unchanged inside a list.
        tokenizer: object exposing ``tokenize(str) -> list``.
        max_seq_length: maximum number of pieces kept before the end marker.

    Returns:
        ``(pieces, [label])`` where ``pieces`` holds at most ``max_seq_length``
        tokenizer pieces followed by the literal string ``'EOS'``.
    """
    # NOTE(review): the last word is skipped on purpose by the slice below, and the
    # end marker is the literal 'EOS' -- confirm both match the tokenizer's vocabulary
    # and the data format.
    pieces = [
        piece
        for word in tokens[:-1]
        for piece in tokenizer.tokenize(word)
    ]

    clipped = pieces[:max_seq_length]
    clipped.append('EOS')

    return clipped, [label]

def tok2int_sent(example, tokenizer, max_seq_length):
    """Encode one ``[tokens, label]`` example as fixed-length id/mask/segment lists.

    Args:
        example: two-element sequence ``[word_tokens, label]``.
        tokenizer: object exposing ``tokenize`` and ``convert_tokens_to_ids``.
        max_seq_length: piece budget forwarded to ``reform_label``.

    Returns:
        ``(input_ids, input_mask, input_seg, label)``; the first three are padded
        with zeros up to ``max_seq_length * 2 + 3`` entries and ``label`` is the
        one-element list produced by ``reform_label``.
    """
    raw_tokens, raw_label = example[0], example[1]
    pieces, label = reform_label(raw_tokens, raw_label, tokenizer, max_seq_length)

    # Prepend the classification marker before looking up ids.
    pieces = ["[CLS]"] + pieces
    segment_ids = [1] * len(pieces)

    token_ids = tokenizer.convert_tokens_to_ids(pieces)
    attention = [1] * len(token_ids)

    # Zero-pad all three lists by the same amount (no-op if already long enough).
    target_len = max_seq_length * 2 + 3
    filler = [0] * (target_len - len(token_ids))
    token_ids += filler
    attention += filler
    segment_ids += filler

    return token_ids, attention, segment_ids, label

def tok2int_list(data, tokenizer, max_seq_length):
    """Encode every example in ``data`` and return the results column-wise.

    Args:
        data: iterable of ``[word_tokens, label]`` examples.
        tokenizer: tokenizer forwarded to ``tok2int_sent``.
        max_seq_length: piece budget forwarded to ``tok2int_sent``.

    Returns:
        Four lists ``(input_ids, input_masks, input_segs, labels)`` with one
        entry per example, in input order.
    """
    encoded = [tok2int_sent(example, tokenizer, max_seq_length) for example in data]
    if not encoded:
        return [], [], [], []

    # Transpose the per-example 4-tuples into four parallel lists.
    inps, msks, segs, labs = (list(column) for column in zip(*encoded))
    return inps, msks, segs, labs


class DataLoader(object):
    '''Batch iterator over a CSV file of transcripts and grammar scores.

    The CSV must provide a ``transcript`` and a ``grammarScore`` column. Rows with
    a non-positive score or with any missing value are discarded. Iterating yields
    ``(input_ids, attention_mask, segment_ids, scores)`` tensors; the first three
    are ``long`` tensors and ``scores`` is a ``float`` tensor with one row per
    example. When the iterator is exhausted it rewinds itself (and reshuffles in
    training mode) so it can be looped over again in the next epoch.

    Args:
        data_path: path of the CSV file.
        tokenizer: tokenizer handed to ``tok2int_list``.
        args: configuration object providing ``cuda``, ``max_len``, ``src_flag``
            and ``hyp_flag``.
        test: when True the examples are never shuffled.
        batch_size: number of examples per batch.
    '''

    def __init__(self, data_path, tokenizer, args, test=False, batch_size=64):
        self.cuda = args.cuda
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.max_len = args.max_len
        self.data_path = data_path
        self.test = test
        self.src_flag = args.src_flag
        self.hyp_flag = args.hyp_flag
        examples = self.read_file(data_path)
        self.examples = examples
        self.total_num = len(examples)
        if self.test:
            self.total_step = np.ceil(self.total_num * 1.0 / batch_size)
        else:
            # Kept as a (possibly fractional) float: callers use it to size the LR
            # schedule, and `next` compares the integer step against it.
            self.total_step = self.total_num / batch_size
            self.shuffle()
        self.step = 0

    def read_file(self, data_path):
        '''Load the CSV and return a list of ``[word_tokens, score]`` examples.'''
        data = pd.read_csv(data_path)
        # Filter and drop in one chain so no in-place edit happens on a slice.
        data = data[data['grammarScore'] > 0].dropna()

        return [
            [transcript.split(), float(score)]
            for transcript, score in zip(data['transcript'], data['grammarScore'])
        ]

    def shuffle(self):
        '''Shuffle the examples in place.'''
        np.random.shuffle(self.examples)

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def __len__(self):
        '''Number of batches produced by one full pass.'''
        # A fractional total_step still yields a final short batch, hence the ceil.
        return int(np.ceil(self.total_step))

    def next(self):
        ''' Get the next batch '''
        if self.step < self.total_step:
            start = self.step * self.batch_size
            examples = self.examples[start:start + self.batch_size]

            inp, msk, seg, score = tok2int_list(examples, self.tokenizer, self.max_len)

            inp_tensor = torch.tensor(inp, dtype=torch.long)
            msk_tensor = torch.tensor(msk, dtype=torch.long)
            seg_tensor = torch.tensor(seg, dtype=torch.long)
            # Scores are parsed as floats in read_file; an integer tensor would
            # silently truncate fractional grammar scores.
            score_tensor = torch.tensor(score, dtype=torch.float)

            if self.cuda:
                inp_tensor = inp_tensor.cuda()
                msk_tensor = msk_tensor.cuda()
                seg_tensor = seg_tensor.cuda()
                score_tensor = score_tensor.cuda()

            self.step += 1
            return inp_tensor, msk_tensor, seg_tensor, score_tensor

        # Exhausted: rewind (and reshuffle for training) so the loader is reusable.
        self.step = 0
        if not self.test:
            self.shuffle()
        raise StopIteration()

# Model Eval

In [5]:
from scipy.stats import pearsonr,spearmanr
from sklearn.metrics import mean_squared_error
import numpy as np

def eval_result(predicts, labels):
    """Compute correlation and RMSE between predictions and gold labels.

    Args:
        predicts: sequence of predicted scores.
        labels: sequence of gold scores, aligned with ``predicts``.

    Returns:
        Dict with the keys ``"Prearson Corr"``, ``"Spearman Corr"`` and
        ``"Eval Loss"`` (root of the mean squared error).
    """
    spearman_stat = spearmanr(predicts, labels)[0]
    pearson_stat = pearsonr(predicts, labels)[0]
    rmse = np.sqrt(mean_squared_error(predicts, labels))

    # The "Prearson" spelling is kept as-is: the key is emitted at runtime and
    # renaming it would change the reported metric name.
    metrics = {}
    metrics["Prearson Corr"] = pearson_stat
    metrics["Spearman Corr"] = spearman_stat
    metrics["Eval Loss"] = rmse
    return metrics

def eval_model(model, validset_reader):
    """Run ``model`` over every batch of ``validset_reader`` and score the output.

    Args:
        model: module called as ``model(input_ids, mask, segments)``.
        validset_reader: iterable yielding ``(input_ids, mask, segments, scores)``.

    Returns:
        The metrics dict produced by ``eval_result`` over all collected
        predictions and labels.
    """
    model.eval()
    collected_preds = []
    collected_golds = []

    with torch.no_grad():
        for _, batch in tqdm(enumerate(validset_reader)):
            inp_tensor, msk_tensor, seg_tensor, score_tensor = batch

            gold = score_tensor.to(torch.float)
            raw_pred = model(inp_tensor, msk_tensor, seg_tensor)

            # Flatten both sides to plain Python floats of the same dtype.
            collected_preds += raw_pred.type_as(gold).view(-1).tolist()
            collected_golds += gold.view(-1).tolist()

        metrics = eval_result(collected_preds, collected_golds)

    return metrics

In [6]:
# list_net_loss

def listnet_loss(y_i, z_i):
    """ListNet-style cross entropy between two score vectors.

    Both inputs are turned into distributions with a softmax over dimension 0 and
    the cross entropy ``-sum(P(y) * log P(z))`` is returned.

    Args:
        y_i: target scores (any numeric dtype; cast to float).
        z_i: predicted scores (any numeric dtype; cast to float).

    Returns:
        Scalar tensor holding the loss.
    """
    target_probs = F.softmax(y_i.float(), dim=0)
    # log_softmax avoids log(0) = -inf when a predicted probability underflows,
    # which previously turned the loss into inf/NaN for widely spread scores.
    log_pred_probs = F.log_softmax(z_i.float(), dim=0)
    return -torch.sum(target_probs * log_pred_probs)

# Train

In [7]:
def train_model_mentor_net(mentor_model, student_model, args, trainset_reader, validset_reader):
    """Train the student with mentor-weighted MSE and checkpoint both models per epoch.

    For every training batch the student's per-sample squared error is passed to
    the mentor (without gradient) to obtain per-sample weights; the student is
    optimised on the weighted mean. After each epoch the student's MSE on the
    validation reader is computed as the "meta loss", the student is evaluated
    with ``eval_model``, metrics are sent to wandb and both state dicts are saved
    under ``args.outdir``. Only the newest ``args.max_model_save`` checkpoints of
    each model are kept on disk.

    Args:
        mentor_model: module mapping a vector of per-sample losses to weights.
        student_model: module called as ``student_model(input_ids, mask, segments)``.
        args: configuration providing ``outdir``, ``gradient_accumulation_steps``,
            ``num_train_epochs``, ``learning_rate`` and ``max_model_save``.
        trainset_reader: re-iterable batch source exposing ``total_step``.
        validset_reader: re-iterable batch source used for the meta loss and eval.
    """
    saved_checkpoints1 = []
    saved_checkpoints2 = []

    save_path = args.outdir
    running_loss = 0.0
    meta_loss_mentor = 0.0
    meta_steps = 0
    warned_detached_meta_loss = False

    t_total = int(trainset_reader.total_step / args.gradient_accumulation_steps * args.num_train_epochs)

    # torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0
    # mirrors the default of the implementation that was used before.
    optimizer_mentor = torch.optim.AdamW(
        mentor_model.parameters(), lr=args.learning_rate, eps=1e-8, weight_decay=0.0)
    optimizer_student = torch.optim.AdamW(
        student_model.parameters(), lr=args.learning_rate, eps=1e-8, weight_decay=0.0)

    scheduler_student = transformers.get_linear_schedule_with_warmup(
        optimizer_student, num_warmup_steps=0, num_training_steps=t_total
    )

    global_step = 0
    for epoch in range(int(args.num_train_epochs)):
        optimizer_mentor.zero_grad()
        optimizer_student.zero_grad()
        mentor_model.train()
        student_model.train()

        for inp_tensor, msk_tensor, seg_tensor, score_tensor in tqdm(trainset_reader):

            pred_student = student_model(inp_tensor, msk_tensor, seg_tensor).view(-1)
            score_tensor = score_tensor.view(-1).to(torch.float)

            # Per-sample losses: they are both the mentor's input and the quantity
            # that gets re-weighted.
            loss_student = F.mse_loss(pred_student, score_tensor, reduction='none')

            # NOTE(review): the mentor receives a vector with one entry per sample.
            # If its input width is fixed (MENOTR_MODEL uses train_batch_size), a
            # short final batch will not fit -- confirm the dataset size is a
            # multiple of the batch size or drop the remainder.
            with torch.no_grad():
                weights = mentor_model(loss_student)

            weighted_loss_student = (weights * loss_student).mean()
            running_loss += weighted_loss_student.item()
            weighted_loss_student.backward()

            global_step += 1
            if global_step % args.gradient_accumulation_steps == 0:
                optimizer_student.step()
                scheduler_student.step()
                optimizer_student.zero_grad()

        mentor_model.train()
        for meta_inp_tensor, meta_msk_tensor, meta_seg_tensor, meta_score_tensor in validset_reader:
            with torch.no_grad():
                meta_student_output = student_model(meta_inp_tensor, meta_msk_tensor, meta_seg_tensor)
                meta_pred_score = meta_student_output.view(-1)

            meta_score_tensor = meta_score_tensor.view(-1).to(torch.float)
            meta_loss = F.mse_loss(meta_pred_score, meta_score_tensor)

            # Accumulate a Python float so no tensors are kept alive across steps.
            meta_loss_mentor += meta_loss.item()
            meta_steps += 1

            # NOTE(review): the meta loss is built from a no-grad student forward and
            # never involves the mentor, so it carries no gradient for the mentor's
            # parameters. Calling backward() on it raises a RuntimeError, therefore
            # the update is skipped until the meta objective is made differentiable
            # with respect to the mentor.
            if meta_loss.requires_grad:
                optimizer_mentor.zero_grad()
                meta_loss.backward()
                optimizer_mentor.step()
            elif not warned_detached_meta_loss:
                logger.warning('Mentor update skipped: meta loss has no gradient path to the mentor parameters.')
                warned_detached_meta_loss = True

        # Guard the averages against empty readers.
        avg_student_loss = running_loss / max(global_step, 1)
        avg_meta_loss = meta_loss_mentor / max(meta_steps, 1)
        current_lr = scheduler_student.get_last_lr()[0]

        # The previous format string had six placeholders for five arguments.
        logger.info('Epoch: {}, Student Loss: {}, Meta Loss: {}, LR: {}'.format(
            epoch, avg_student_loss, avg_meta_loss, current_lr))

        train_res = {
            "Student Train Loss": avg_student_loss,
            "Mentor Meta Loss": avg_meta_loss,
            "Learning": current_lr,
        }

        logger.info('Start eval for Model 1!')
        result_dict1 = eval_model(student_model, validset_reader)
        logger.info(result_dict1)

        # The mentor is not evaluated with eval_model: that helper calls the model
        # with three input tensors, while the mentor is called with one loss vector.
        train_res.update({"Student Model Validation": result_dict1})
        wandb.log(train_res)

        check_point_path1 = save_path + f"/student_mode_{epoch}_best.pt"
        check_point_path2 = save_path + f"/mentor_model_{epoch}_best.pt"

        torch.save({'epoch': epoch,
                    'model': student_model.state_dict()}, check_point_path1)
        torch.save({'epoch': epoch,
                    'model': mentor_model.state_dict()}, check_point_path2)

        saved_checkpoints1.append(check_point_path1)
        saved_checkpoints2.append(check_point_path2)

        # Keep only the newest `max_model_save` checkpoints of each model.
        if len(saved_checkpoints1) > args.max_model_save:
            old_checkpoint1 = saved_checkpoints1.pop(0)
            old_checkpoint2 = saved_checkpoints2.pop(0)

            if os.path.exists(old_checkpoint1):
                os.remove(old_checkpoint1)

            if os.path.exists(old_checkpoint2):
                os.remove(old_checkpoint2)


# Model Definitions (BERT regressor and mentor network)

In [8]:
class inference_model(nn.Module):
    """Regression head on top of a BERT-style encoder.

    The encoder's pooled output is projected to a single score per sequence.

    Args:
        bert_model: encoder called as ``bert_model(input_ids, mask, segments)``
            whose result exposes ``pooler_output``.
        args: configuration providing ``bert_hidden_dim``, ``bert_pretrain``
            (model name; must contain "bert") and ``max_len``.
    """

    def __init__(self, bert_model, args):
        super(inference_model, self).__init__()
        self.bert_hidden_dim = args.bert_hidden_dim
        self.pred_model = bert_model
        self.model_name = args.bert_pretrain
        # Must match the padded length produced by tok2int_sent.
        self.max_len = args.max_len * 2 + 3
        self.proj_hidden = nn.Linear(self.bert_hidden_dim, 1)

    def forward(self, inp_tensor, msk_tensor, seg_tensor, score_flag=True):
        """Return one predicted score per sequence.

        Args:
            inp_tensor: token ids, reshaped to ``(-1, max_len)``.
            msk_tensor: attention mask, reshaped to ``(-1, max_len)``.
            seg_tensor: segment ids, reshaped to ``(-1, max_len)``.
            score_flag: unused; kept for call compatibility.

        Returns:
            Tensor produced by the linear head (one column per sequence row).

        Raises:
            NotImplementedError: if the configured model name is not BERT-like.
        """
        # Fail loudly for unsupported encoders; previously the exception object was
        # created but never raised, leading to an unrelated UnboundLocalError.
        if "bert" not in self.model_name.lower():
            raise NotImplementedError(
                "inference_model only supports BERT-style encoders, got {!r}".format(self.model_name))

        inp_tensor = inp_tensor.view(-1, self.max_len)
        msk_tensor = msk_tensor.view(-1, self.max_len)
        seg_tensor = seg_tensor.view(-1, self.max_len)

        outputs = self.pred_model(inp_tensor, msk_tensor, seg_tensor)
        pred_score = self.proj_hidden(outputs.pooler_output)

        return pred_score

In [None]:
class MENOTR_MODEL(nn.Module):
    """Small feed-forward mentor network ending in a softmax.

    The input and output width both equal ``args.train_batch_size``; the hidden
    layer is twice as wide.
    """

    def __init__(self, args):
        super().__init__()
        width = args.train_batch_size
        self.hidden_dim = width

        # Layers are created in the same order as they appear in the stack.
        expand = nn.Linear(width, 2 * width)
        contract = nn.Linear(2 * width, width)
        self.fc = nn.Sequential(
            expand,
            nn.ReLU(),
            contract,
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        """Apply the feed-forward stack to ``x`` and return its softmax output."""
        weights = self.fc(x)
        return weights

In [9]:
class Config:
    """Expose keyword arguments as instance attributes (``cfg.key``)."""

    def __init__(self, **entries):
        for key, value in entries.items():
            setattr(self, key, value)

# Entry cell: load the YAML configuration, set up logging, and build the tokenizer
# and the two data readers used by the cells that follow.
if __name__ == "__main__":
    config_file = 'config.yaml'
    # NOTE(review): yaml.load with FullLoader is used here; if the config only holds
    # plain YAML types, yaml.safe_load would be the more conservative choice.
    with open(config_file, "r") as ymlfile:
        config_dict = yaml.load(ymlfile, Loader=yaml.FullLoader)

    # Every top-level key of the YAML file becomes an attribute of `args`.
    args = Config(**config_dict)

    # Make sure the output directory exists before the log file is opened in it.
    if not os.path.exists(args.outdir):
        Path(args.outdir).mkdir(parents=True, exist_ok=True)
        
    # Log both to <outdir>/train_log.txt and to the notebook output.
    # NOTE(review): level=DEBUG on the root logger also surfaces third-party debug
    # messages (e.g. HTTP connection lines); consider INFO.
    handlers = [logging.FileHandler(os.path.abspath(args.outdir) + '/train_log.txt'), logging.StreamHandler()]
    logging.basicConfig(format='[%(asctime)s] %(levelname)s: %(message)s', level=logging.DEBUG,
                        datefmt='%d-%m-%Y %H:%M:%S', handlers=handlers)
    # NOTE(review): Config defines no __repr__, so this logs only the object address.
    logger.info(args)

    # NOTE(review): the tokenizer name is hard-coded rather than read from `args`.
    tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
    
    logger.info('Start training!')
    logger.info("loading training set")
    trainset_reader = DataLoader(args.train_path, tokenizer, args, batch_size=args.train_batch_size)
    logger.info("loading validation set")
    # NOTE(review): built with the default test=False, so this reader shuffles its
    # examples like the training reader -- confirm that is intended for validation.
    validset_reader = DataLoader(args.test_path, tokenizer, args, batch_size=args.valid_batch_size)
    logger.info('initializing estimator model')

[26-11-2024 02:47:45] INFO: <__main__.Config object at 0x7fbd74397f10>
[26-11-2024 02:47:45] DEBUG: Starting new HTTPS connection (1): huggingface.co:443
[26-11-2024 02:47:45] DEBUG: https://huggingface.co:443 "HEAD /bert-base-uncased/resolve/main/tokenizer_config.json HTTP/11" 200 0
[26-11-2024 02:47:45] INFO: Start training!
[26-11-2024 02:47:45] INFO: loading training set
[26-11-2024 02:47:47] INFO: loading validation set
[26-11-2024 02:47:47] INFO: initializing estimator model


In [10]:
# Load the pretrained BERT encoder that is later wrapped by the student model.
# NOTE(review): the model name is hard-coded here instead of using args.bert_pretrain.
bert_model_2 = transformers.BertModel.from_pretrained("bert-base-uncased")

Some weights of BertModel were not initialized from the model checkpoint at /home/ec2-user/SageMaker/bert_model/model_best.pt and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.outp

In [11]:
# Move the encoder to the GPU.
# NOTE(review): .cuda() is unconditional here even though the config carries args.cuda.
bert_model_2 = bert_model_2.cuda()

# ori_model1 is the mentor network, ori_model2 the BERT-based score regressor.
ori_model1 = MENOTR_MODEL(args)
ori_model2 = inference_model(bert_model_2, args)

In [12]:
# Aliases only: model_1/model_2 refer to the same objects as ori_model1/ori_model2.
model_1 = ori_model1
model_2 = ori_model2

# Name the two roles used by the training function and place both models on the GPU.
mentor_model = model_1.cuda()
student_model = model_2.cuda()

In [13]:
# Authenticate with Weights & Biases and open the run that training logs into.
wandb.login(key=args.wandb_key)

# Pass the settings as a plain dict and leave the API key out of it, so the
# credential is not stored in the run's recorded configuration.
run_config = {key: value for key, value in vars(args).items() if key != "wandb_key"}
wandb.init(project=args.wandb_proj_name, config=run_config, name=args.wandb_run_name)

try:
    train_model_mentor_net(mentor_model, student_model, args, trainset_reader, validset_reader)
finally:
    # Close the run even when training raises, so it is not left open.
    wandb.finish()

[26-11-2024 02:47:53] DEBUG: Starting new HTTPS connection (1): ip-172-16-95-62.ap-south-1.compute.internal:8443
[26-11-2024 02:47:53] ERROR: Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[26-11-2024 02:47:53] DEBUG: Starting new HTTPS connection (1): api.wandb.ai:443
[26-11-2024 02:47:53] DEBUG: https://api.wandb.ai:443 "POST /graphql HTTP/11" 200 1990
[26-11-2024 02:47:53] DEBUG: https://api.wandb.ai:443 "POST /graphql HTTP/11" 200 374
[34m[1mwandb[0m: Currently logged in as: [33mshubham-kumar1[0m ([33mshubham-kumar1-shl[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/ec2-user/.netrc
[26-11-2024 02:47:53] DEBUG: Starting new HTTPS connection (1): api.wandb.ai:443
[26-11-2024 0

0it [00:00, ?it/s]


TypeError: inference_model.forward() missing 2 required positional arguments: 'msk_tensor' and 'seg_tensor'