In [1]:
import os
# Expose only CUDA device index 1 to this process and turn off tokenizers
# parallelism. Both variables are set ahead of the torch / transformers imports
# further down in this cell — presumably so those libraries see the values when
# they initialise; keep this ordering when rearranging imports.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import argparse
import logging
import time
from tqdm import tqdm
import random
import json
import numpy as np
from torch.backends import cudnn
from torch.utils.data import Dataset

import torch
from torch.utils.data import DataLoader, Dataset
from src.data_utils import *
from src.eval_utils import *
from src.TSCR import Span_Evalution

from src.t5 import MyT5ForConditionalGeneration

from transformers import  AutoModelForSeq2SeqLM,AutoTokenizer, AutoConfig
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
# Module-level logger. It is not referenced by any of the visible cells, which
# report progress with print() instead.
logger = logging.getLogger(__name__)

def init_args():
    """Build the experiment configuration from the built-in defaults.

    The parser is evaluated against an empty argument list
    (``parse_args(args=[])``), so ``sys.argv`` is never consulted and every
    option takes the default declared below. Callers override values by
    assigning attributes on the returned namespace.

    Side effects:
        Creates the ``./outputs`` directory (relative to the current working
        directory) if it does not exist yet.

    Returns:
        argparse.Namespace: one attribute per option declared below.
    """
    parser = argparse.ArgumentParser()
    # basic settings
    parser.add_argument("--model_name_or_path", default='/home/dell/peft/mt0-base', type=str,
                        help="Path to pre-trained model or shortcut name")
    parser.add_argument("--train_dataset", default='/home/dell/peft/seq2seq/datasets/JD_new_train.json', type=str)
    parser.add_argument("--meituan_train_dataset", default='/home/dell/peft/seq2seq/datasets/meituan_train_dataset.json', type=str)
    parser.add_argument("--test_dataset", default='/home/dell/peft/seq2seq/datasets/JD_new_test.json', type=str)
    parser.add_argument("--input_max_len", default=160, type=int)
    parser.add_argument("--target_max_len", default=350, type=int)

    # other parameters
    parser.add_argument("--max_seq_length", default=350, type=int)
    parser.add_argument("--train_batch_size", default=5, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--eval_batch_size", default=10, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=1e-4, type=float)
    parser.add_argument("--num_train_epochs", default=100, type=int,
                        help="Total number of training epochs to perform.")

    # training details
    parser.add_argument("--weight_decay", default=0.0, type=float)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--warmup_steps", default=0.0, type=float)

    # Empty argv on purpose: only the defaults above are used.
    args = parser.parse_args(args=[])

    # Make sure the checkpoint directory exists. exist_ok=True avoids the
    # check-then-create race of `if not exists: mkdir` and makes repeated
    # calls harmless.
    os.makedirs('./outputs', exist_ok=True)

    return args

In [None]:
# Build the run configuration from the defaults declared in init_args() and
# record the directory that train_function() writes its checkpoints into.
args = init_args()
args.output_dir = './outputs'

In [None]:
def validation(args, model, dataloader, tokenizer):
    """Generate predictions for every batch and return the pooled F1 score.

    Args:
        args: namespace providing ``device`` and ``max_seq_length``.
        model: object exposing ``to``, ``eval`` and ``generate``.
        dataloader: iterable of batches with ``source_ids``, ``source_mask``
            and ``target_ids`` entries.
        tokenizer: object whose ``decode`` turns an id sequence into text.

    Returns:
        torch.Tensor: scalar float32 F1 computed from the gold / predicted /
        true-positive counts that ``compute_scores`` reports per batch, summed
        over the whole dataloader (0 when there are no true positives).
    """
    model.to(args.device)
    model.eval()

    def _to_text(id_rows):
        # Decode each row of ids, dropping special tokens.
        return [tokenizer.decode(row, skip_special_tokens=True) for row in id_rows]

    gold_total = 0
    pred_total = 0
    tp_total = 0

    for batch in tqdm(dataloader):
        with torch.no_grad():
            generated = model.generate(
                input_ids=batch['source_ids'].to(args.device),
                attention_mask=batch['source_mask'].to(args.device),
                max_length=args.max_seq_length,
            )

        predicted_text = _to_text(generated)
        reference_text = _to_text(batch["target_ids"])

        batch_gold, batch_pred, batch_tp = compute_scores(predicted_text, reference_text)
        gold_total += batch_gold
        pred_total += batch_pred
        tp_total += batch_tp

    # Guard each ratio against an empty denominator.
    precision = 0 if pred_total == 0 else float(tp_total) / float(pred_total)
    recall = 0 if gold_total == 0 else float(tp_total) / float(gold_total)

    if precision == 0 and recall == 0:
        f1 = 0
    else:
        f1 = 2 * precision * recall / (precision + recall)
    f1 = torch.tensor(f1, dtype=torch.float32)

    print(f'n_gold:{gold_total}; n_pred:{pred_total}; n_tp:{tp_total}')

    return f1

In [None]:
def _show_dataset_sample(tokenizer, dataset, index=7):
    """Print one decoded (input, target) pair from `dataset` as a sanity check.

    `index` is clamped to the last valid position so that datasets with fewer
    than `index + 1` items do not raise IndexError.
    """
    if len(dataset) == 0:
        print('Input : <dataset is empty>')
        return
    data_sample = dataset[min(index, len(dataset) - 1)]
    print('Input :', tokenizer.decode(data_sample['source_ids'], skip_special_tokens=True))
    print('Output:', tokenizer.decode(data_sample['target_ids'], skip_special_tokens=True))


def train_function(args, seed):
    """Fine-tune a seq2seq model and keep the checkpoint with the best F1.

    Args:
        args: configuration namespace (see ``init_args``) with ``output_dir``
            set. This function also *writes* to it: ``args.device`` is set,
            and ``args.train`` is toggled to True before the training dataset
            is built and to False before the evaluation dataset is built (it
            is left at False on return).
            ``weight_decay``, ``warmup_steps`` and
            ``gradient_accumulation_steps`` are honoured when present; with
            their declared defaults (0.0, 0.0, 1) the optimisation schedule is
            one optimizer step per batch with no warm-up and no weight decay.
        seed: only used to name the checkpoint file. Seeding the RNGs is the
            caller's responsibility.

    Side effects:
        Whenever the F1 returned by ``validation`` improves, the model's
        ``state_dict`` is written to
        ``<args.output_dir>/main-multi-source-seed=<seed>``. Training stops
        after 10 consecutive epochs without improvement, or after
        ``args.num_train_epochs`` epochs.

    Returns:
        None.
    """
    args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print("\n", "=" * 30, "NEW train", "=" * 30, "\n")
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    # Sanity check: show one sample per split to eyeball inputs and targets.
    # ABSADataset receives `args`; args.train is flipped right before each
    # construction — presumably it selects the split, confirm in src.data_utils.
    args.train = True
    print("Here is an example (from the train set):")
    train_dataset = ABSADataset(tokenizer=tokenizer, args=args,
                                input_max_len=args.input_max_len, target_max_len=args.target_max_len)
    _show_dataset_sample(tokenizer, train_dataset)

    args.train = False
    print("Here is an example (from the test set):")
    test_dataset = ABSADataset(tokenizer=tokenizer, args=args,
                               input_max_len=args.input_max_len, target_max_len=args.target_max_len)
    _show_dataset_sample(tokenizer, test_dataset)

    train_loader = DataLoader(train_dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=4)
    # NOTE(review): the loader built from `test_dataset` drives both model
    # selection and early stopping below — confirm that is intended.
    val_loader = DataLoader(test_dataset, batch_size=args.eval_batch_size, num_workers=4)

    # initialize the seq2seq model
    model = AutoModelForSeq2SeqLM.from_pretrained(args.model_name_or_path)

    # Number of batches whose gradients are summed before one optimizer step.
    accum_steps = max(1, int(getattr(args, 'gradient_accumulation_steps', 1)))
    num_batches = len(train_loader)
    # Ceil division: a trailing partial group of batches still gets a step.
    updates_per_epoch = (num_batches + accum_steps - 1) // accum_steps

    # optimizer and lr scheduler
    optimizer = AdamW(model.parameters(), lr=args.learning_rate, eps=args.adam_epsilon,
                      weight_decay=getattr(args, 'weight_decay', 0.0))
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=int(getattr(args, 'warmup_steps', 0)),
        num_training_steps=updates_per_epoch * args.num_train_epochs,
    )

    # training process
    print("start training......")
    best_Score = -10000
    earlystop_count = 0
    patience = 10  # epochs without F1 improvement before stopping

    # torch.save does not create missing parent directories.
    os.makedirs(args.output_dir, exist_ok=True)
    model_name = args.output_dir + '/' + f'main-multi-source-seed={seed}'

    model = model.to(args.device)

    for epoch in range(args.num_train_epochs):
        model.train()
        total_loss = 0

        for step, batch in enumerate(tqdm(train_loader), start=1):

            # Padding positions get label -100 (the value this loop has always
            # used for them). masked_fill returns a new tensor, so the batch
            # itself is left untouched.
            target_ids = batch["target_ids"]
            lm_labels = target_ids.masked_fill(target_ids == tokenizer.pad_token_id, -100)

            outputs = model(input_ids=batch["source_ids"].to(args.device),
                            attention_mask=batch["source_mask"].to(args.device),
                            labels=lm_labels.to(args.device),
                            decoder_attention_mask=batch['target_mask'].to(args.device))

            loss = outputs[0]
            total_loss += loss.detach().float()

            # Scale only when accumulating so the default path is unchanged.
            scaled_loss = loss / accum_steps if accum_steps > 1 else loss
            scaled_loss.backward()

            if step % accum_steps == 0 or step == num_batches:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

        # validation
        f1 = validation(args, model, val_loader, tokenizer)

        current_Score = f1
        stop_early = False

        if current_Score > best_Score:
            print(f'bestF1:{f1}')
            best_Score = current_Score
            torch.save(model.state_dict(), model_name)
            print("Save current best model in file:", model_name)
            earlystop_count = 0
        else:
            earlystop_count += 1
            stop_early = earlystop_count >= patience

        # Report the epoch before acting on early stopping so the final
        # epoch's numbers are printed too.
        train_epoch_loss = total_loss / len(train_loader)
        train_ppl = torch.exp(train_epoch_loss)
        print(f"epoch={epoch}: train_ppl:{train_ppl};train_loss:{train_epoch_loss};F1:{f1}")

        torch.cuda.empty_cache()

        if stop_early:
            break
    print("Finish training and saving the model!")

In [2]:
if __name__ == '__main__':
    # One independent training run per seed; train_function() names each
    # checkpoint after the seed it was given.
    seed_list = [1, 42, 1992, 2023, 2024]

    for each_seed in seed_list:
        seed = each_seed
        args.seed = seed
        print("seed ", seed)

        # Seed Python, NumPy and torch (CPU and CUDA) in that order.
        # The cuDNN determinism switches are intentionally left untouched.
        for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
            seed_fn(seed)

        train_function(args, seed)

In [4]:
def test(args, load_path, dataloader, tokenizer):
    
    model = MyT5ForConditionalGeneration.from_pretrained(args.model_name_or_path)
    
    model.load_state_dict(torch.load(load_path)) 
    
    model.to(args.device)
    model.eval()
    outputs, targets = [], []
    
    eva_n_gold, eva_n_pred,eva_n_tp = 0, 0, 0
    
    for batch in tqdm(dataloader):
        # need to push the data to device
        with torch.no_grad():
            outs = model.generate(input_ids=batch['source_ids'].to(args.device), 
                                  attention_mask=batch['source_mask'].to(args.device), 
                                  max_length=args.max_seq_length)

        pred_seqs = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
        gold_seqs = [tokenizer.decode(ids, skip_special_tokens=True) for ids in batch["target_ids"]]
        outputs.extend(pred_seqs)
        targets.extend(gold_seqs)

        n_gold, n_pred, n_tp = compute_scores(pred_seqs, gold_seqs)
        eva_n_gold += n_gold
        eva_n_pred += n_pred
        eva_n_tp += n_tp
        
    precision = float(eva_n_tp) / float(eva_n_pred) if eva_n_pred != 0 else 0
    recall = float(eva_n_tp) / float(eva_n_gold) if eva_n_gold != 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if precision != 0 or recall != 0 else 0
    f1 = torch.tensor(f1, dtype=torch.float32)
    
    print(f'n_gold:{eva_n_gold}; n_pred:{eva_n_pred}; n_tp:{eva_n_tp}')
    print(Span_Evalution(outputs, targets).Quad_Evaluation())
    
    return f1