In [None]:
# Mount Google Drive so the project files under /content/drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Left disabled: commands that would register python3.8 / python3.9 as the
# `python3` alternative on this machine.
# !sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1
# !sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 2
# Print the version of the `python` executable found on the shell PATH.
!python --version

Python 3.10.12


In [None]:
!pip install -r /content/drive/MyDrive/711_as3/GoEmotions-pytorch-master/requirements.txt

Collecting transformers (from -r /content/drive/MyDrive/711_as3/GoEmotions-pytorch-master/requirements.txt (line 2))
  Downloading transformers-4.35.1-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting attrdictionary (from -r /content/drive/MyDrive/711_as3/GoEmotions-pytorch-master/requirements.txt (line 3))
  Downloading attrdictionary-1.0.0-py2.py3-none-any.whl (8.0 kB)
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers->-r /content/drive/MyDrive/711_as3/GoEmotions-pytorch-master/requirements.txt (line 2))
  Downloading huggingface_hub-0.19.1-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.1/311.1 kB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers->-r /content/drive/MyDrive/711_as3/GoEmotions-pytorch-master/requirements.txt (line 2))
  Downloading tokenizers-0.14.1-cp310-cp

In [None]:
# Work from the project root: the local modules imported below (model, utils,
# data_loader) and the relative "config" directory opened by main() are expected
# to resolve against the current working directory.
%cd /content/drive/MyDrive/711_as3/GoEmotions-pytorch-master/

/content/drive/MyDrive/711_as3/GoEmotions-pytorch-master


In [None]:
import argparse
import json
import logging
import os
import glob

import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm, trange
from attrdictionary import AttrDict

from transformers import (
    BertConfig,
    BertTokenizer,
    AdamW,
    get_linear_schedule_with_warmup
)

from model import BertForMultiLabelClassification
from utils import (
    init_logger,
    set_seed,
    compute_metrics
)
from data_loader import (
    load_and_cache_examples,
    GoEmotionsProcessor
)

In [None]:
# Module-level logger shared by train(), evaluate() and main().
logger = logging.getLogger(__name__)


In [None]:
def _save_checkpoint(args, model, tokenizer, output_dir, state_dict=None):
    """Write model weights, tokenizer files and the training args into ``output_dir``.

    Args:
        args: Run configuration; pickled to ``training_args.bin`` with ``torch.save``.
        model: Model to serialize. If it exposes a ``module`` attribute (i.e. it is
            wrapped), the wrapped model is the one saved.
        tokenizer: Tokenizer saved next to the model.
        output_dir: Target directory; created when missing.
        state_dict: Optional weights to serialize instead of the model's current
            ones. ``None`` saves the current weights.
    """
    os.makedirs(output_dir, exist_ok=True)
    model_to_save = model.module if hasattr(model, "module") else model
    if state_dict is None:
        model_to_save.save_pretrained(output_dir)
    else:
        model_to_save.save_pretrained(output_dir, state_dict=state_dict)
    tokenizer.save_pretrained(output_dir)
    torch.save(args, os.path.join(output_dir, "training_args.bin"))
    logger.info("Saving model checkpoint to {}".format(output_dir))


def train(args,
          model,
          tokenizer,
          train_dataset,
          dev_dataset=None,
          test_dataset=None):
    """Fine-tune ``model``, checkpoint every epoch and save the best-scoring epoch.

    After each epoch the model is evaluated on ``dev_dataset`` (when provided) and a
    checkpoint is written to ``<output_dir>/epoch_<i>``. The weights of the epoch
    with the highest dev ``macro_f1`` are snapshotted at that moment and written to
    ``<output_dir>/best_epoch_<i>`` once training finishes.

    Args:
        args: Run configuration. Fields read here: ``train_batch_size``,
            ``max_steps``, ``num_train_epochs``, ``gradient_accumulation_steps``,
            ``weight_decay``, ``learning_rate``, ``adam_epsilon``,
            ``warmup_proportion``, ``logging_steps``, ``save_steps``, ``device``,
            ``max_grad_norm`` and ``output_dir``. When ``max_steps > 0``,
            ``num_train_epochs`` is overwritten; ``max_steps`` only sets the
            schedule length and the epoch count, the loop always runs whole epochs.
        model: Model to train; called with ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels``. The first element of its output is
            used as the loss.
        tokenizer: Tokenizer saved alongside every checkpoint.
        train_dataset: Dataset whose batches are 4-tuples in the order
            (input_ids, attention_mask, token_type_ids, labels).
        dev_dataset: Dataset used for per-epoch evaluation and best-model
            selection. When ``None``, evaluation is skipped and no best-model
            checkpoint is written.
        test_dataset: Accepted for interface compatibility; not used.

    Returns:
        list[float]: Mean (unscaled) training loss per batch for each epoch.
    """
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
    accumulation_steps = args.gradient_accumulation_steps
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay).
    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(t_total * args.warmup_proportion),
        num_training_steps=t_total
    )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Total train batch size = %d", args.train_batch_size)
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    logger.info("  Logging steps = %d", args.logging_steps)
    logger.info("  Save steps = %d", args.save_steps)

    global_step = 0
    best_f1 = 0.0
    best_epoch = 0
    best_state_dict = None  # CPU copy of the weights from the best epoch so far
    epoch_loss = []
    batches_per_epoch = len(train_dataloader)

    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        model.train()  # evaluate() switches the model to eval mode
        tot_loss = 0.0
        for step, batch in enumerate(epoch_iterator):
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "labels": batch[3]
            }

            outputs = model(**inputs)
            loss = outputs[0]
            tot_loss += loss.item()  # report the unscaled loss

            # Scale so that the accumulated gradient is an average over the
            # accumulated batches rather than a sum.
            if accumulation_steps > 1:
                loss = loss / accumulation_steps
            loss.backward()

            # Step the optimizer once per `accumulation_steps` batches so the number
            # of scheduler steps matches `t_total` computed above. If an epoch has
            # no more batches than the accumulation window, step on its last batch.
            is_update_step = (step + 1) % accumulation_steps == 0 or (
                batches_per_epoch <= accumulation_steps
                and (step + 1) == batches_per_epoch
            )
            if is_update_step:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

        epoch_loss.append(tot_loss / batches_per_epoch)

        # Evaluate on the dev set and remember the best epoch.
        if dev_dataset is not None:
            results = evaluate(args, model, dev_dataset, "dev", global_step)
            if results["macro_f1"] > best_f1:
                best_f1 = results["macro_f1"]
                best_epoch = epoch
                # Copy the weights now: `model` keeps training, so holding a
                # reference to it would end up saving the final-epoch weights.
                source_model = model.module if hasattr(model, "module") else model
                best_state_dict = {
                    name: tensor.detach().cpu().clone()
                    for name, tensor in source_model.state_dict().items()
                }

        # Save after each epoch.
        _save_checkpoint(args, model, tokenizer,
                         os.path.join(args.output_dir, "epoch_{}".format(epoch)))

    # Save best model.
    if best_state_dict is None:
        logger.warning(
            "No epoch achieved a dev macro_f1 above %s; skipping the best-model checkpoint", best_f1
        )
    else:
        _save_checkpoint(args, model, tokenizer,
                         os.path.join(args.output_dir, "best_epoch_{}".format(best_epoch)),
                         state_dict=best_state_dict)

    return epoch_loss




In [None]:
def evaluate(args, model, eval_dataset, mode, global_step=None):
    """Run ``model`` over ``eval_dataset`` and compute loss plus classification metrics.

    Logits are passed through a sigmoid and binarized with ``args.threshold``
    (probability strictly greater than the threshold becomes 1, otherwise 0)
    before being handed to ``compute_metrics`` together with the gold labels.

    Args:
        args: Run configuration. Fields read here: ``eval_batch_size``, ``device``
            and ``threshold``.
        model: Model to evaluate; it is switched to eval mode. The first two
            elements of its output are used as (loss, logits).
        eval_dataset: Dataset whose batches are 4-tuples in the order
            (input_ids, attention_mask, token_type_ids, labels).
        mode: Name of the split, used in log and error messages only.
        global_step: Optional step identifier, used in the log message only.

    Returns:
        dict: ``{"loss": <mean batch loss>}`` updated with whatever
        ``compute_metrics`` returns.

    Raises:
        ValueError: If ``eval_dataset`` is ``None`` or yields no batches.
    """
    if eval_dataset is None:
        raise ValueError("No dataset was provided for evaluation on '{}'".format(mode))

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    if global_step is not None:
        logger.info("***** Running evaluation on {} dataset ({} step) *****".format(mode, global_step))
    else:
        logger.info("***** Running evaluation on {} dataset *****".format(mode))
    logger.info("  Num examples = {}".format(len(eval_dataset)))
    logger.info("  Eval Batch size = {}".format(args.eval_batch_size))

    eval_loss = 0.0
    nb_eval_steps = 0
    # Collect per-batch arrays and concatenate once at the end; appending to a
    # growing array would re-copy all previous rows on every batch.
    prob_batches = []
    label_batches = []

    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "labels": batch[3]
            }
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
            # torch.sigmoid is numerically stable; 1 / (1 + exp(-x)) overflows
            # (with a runtime warning) for large negative logits.
            prob_batches.append(torch.sigmoid(logits).detach().cpu().numpy())
            label_batches.append(inputs["labels"].detach().cpu().numpy())
        nb_eval_steps += 1

    if nb_eval_steps == 0:
        raise ValueError("Cannot evaluate on an empty '{}' dataset".format(mode))

    results = {
        "loss": eval_loss / nb_eval_steps
    }
    probs = np.concatenate(prob_batches, axis=0)
    out_label_ids = np.concatenate(label_batches, axis=0)
    # Binarize while keeping the floating dtype of the probabilities.
    preds = (probs > args.threshold).astype(probs.dtype)
    results.update(compute_metrics(out_label_ids, preds))

    return results

In [None]:
def write_result(output_eval_file, label_list, id2label, results):
    """Write per-label and aggregate precision/recall/F1 scores to a text file.

    One "<label>: <precision> <recall> <f1>" line is written per index of
    ``label_list``, followed by a "macro-average" line and a "std" line that holds
    the standard deviation of each per-label score array.

    Args:
        output_eval_file: Path of the file to create or overwrite.
        label_list: Sequence whose length determines how many label rows are written.
        id2label: Mapping from the stringified label index to the label name.
        results: Mapping providing "precision_arr", "recall_arr", "f1_arr",
            "macro_precision", "macro_recall" and "macro_f1".
    """
    row_template = "{}: {} {} {}\n"
    per_label_keys = ("precision_arr", "recall_arr", "f1_arr")
    macro_keys = ("macro_precision", "macro_recall", "macro_f1")

    # Spread of every per-label metric, computed before the file is opened.
    spreads = [np.std(results[key]) for key in per_label_keys]

    with open(output_eval_file, "w") as f_w:
        for idx in range(len(label_list)):
            label = id2label[str(idx)]
            scores = [str(results[key][idx]) for key in per_label_keys]
            f_w.write(row_template.format(label, *scores))

        macro_scores = [str(results[key]) for key in macro_keys]
        f_w.write(row_template.format("macro-average", *macro_scores))
        f_w.write(row_template.format("std", *[str(spread) for spread in spreads]))


In [None]:
def _checkpoint_sort_key(checkpoint_dir):
    """Order checkpoint directories by name prefix, then by trailing integer.

    A directory named ``<prefix>_<number>`` is keyed numerically so that
    ``epoch_10`` sorts after ``epoch_9``; a plain string sort would not do that.
    """
    parent, name = os.path.split(checkpoint_dir)
    prefix, separator, suffix = name.rpartition("_")
    if separator and suffix.isdecimal():
        return (parent, prefix, int(suffix))
    return (parent, name, -1)


def main(corpus_dict):
    """Train and/or evaluate the multi-label classifier described by a config file.

    The configuration is read from ``config/<taxonomy>.json`` relative to the
    current working directory. Training runs when ``do_train`` is set; when
    ``do_eval`` is set, the checkpoints found under the output directory are
    evaluated on the test set and one ``<suffix>_eval_results.txt`` file per
    checkpoint is written into the output directory.

    Args:
        corpus_dict: Mapping with a ``"taxonomy"`` key naming the config file.

    Returns:
        The per-epoch training losses returned by ``train``, or ``None`` when
        ``do_train`` is not set.

    Raises:
        ValueError: If ``do_train`` is set without a training dataset, or
            ``do_eval`` is set without a test dataset.
    """
    # Configure logging first so that the parameter dump below is not dropped.
    init_logger()

    # Read from config file and make args
    config_filename = "{}.json".format(corpus_dict["taxonomy"])
    with open(os.path.join("config", config_filename)) as f:
        args = AttrDict(json.load(f))
    logger.info("Training/evaluation parameters {}".format(args))

    args.output_dir = os.path.join(args.ckpt_dir, args.output_dir)

    set_seed(args)

    processor = GoEmotionsProcessor(args)
    label_list = processor.get_labels()

    id2label = {str(i): label for i, label in enumerate(label_list)}
    label2id = {label: i for i, label in enumerate(label_list)}

    config = BertConfig.from_pretrained(
        args.model_name_or_path,
        num_labels=len(label_list),
        finetuning_task=args.task,
        id2label=id2label,
        label2id=label2id
    )

    tokenizer = BertTokenizer.from_pretrained(
        args.tokenizer_name_or_path,
    )
    model = BertForMultiLabelClassification.from_pretrained(
        args.model_name_or_path,
        config=config
    )

    # GPU or CPU
    args.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
    model.to(args.device)

    # Load dataset
    train_dataset = load_and_cache_examples(args, tokenizer, mode="train") if args.train_file else None
    dev_dataset = load_and_cache_examples(args, tokenizer, mode="dev") if args.dev_file else None
    test_dataset = load_and_cache_examples(args, tokenizer, mode="test") if args.test_file else None

    # Fail before any training time is spent if a required split is missing.
    if args.do_train and train_dataset is None:
        raise ValueError("do_train is set but no training data was loaded (train_file is empty)")
    if args.do_eval and test_dataset is None:
        raise ValueError("do_eval is set but no test data was loaded (test_file is empty)")

    epoch_losses = None
    if args.do_train:
        epoch_losses = train(args, model, tokenizer, train_dataset, dev_dataset, test_dataset)

    if args.do_eval:
        # Every directory under output_dir that holds a training_args.bin is a checkpoint.
        checkpoints = sorted(
            (
                os.path.dirname(c)
                for c in glob.glob(args.output_dir + "/**/" + "training_args.bin", recursive=True)
            ),
            key=_checkpoint_sort_key
        )
        if not args.eval_all_checkpoints:
            checkpoints = checkpoints[-1:]
        else:
            logging.getLogger("transformers.configuration_utils").setLevel(logging.WARNING)  # Reduce logging
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARNING)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            # NOTE(review): the identifier is the text after the last underscore, so
            # `epoch_3` and `best_epoch_3` both map to `3_eval_results.txt` and the
            # later one overwrites the earlier. File names are kept for compatibility.
            global_step = checkpoint.split("_")[-1]
            model = BertForMultiLabelClassification.from_pretrained(checkpoint)
            model.to(args.device)
            results = evaluate(args, model, test_dataset, mode="test", global_step=global_step)
            output_eval_file = os.path.join(args.output_dir, "{}_eval_results.txt".format(str(global_step)))
            write_result(output_eval_file, label_list, id2label, results)

    return epoch_losses

In [None]:
# Select the taxonomy; main() loads its settings from config/<taxonomy>.json.
corpus_dict = {"taxonomy": "ekman"}

# Run the pipeline. main() returns the list of per-epoch losses produced by train(),
# or None when training is disabled in the config.
epoch_losses = main(corpus_dict)
# print(results)

Downloading (…)okenizer_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/182 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/766 [00:00<?, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Iteration:  46%|████▌     | 1248/2714 [01:16<01:28, 16.65it/s][A
Iteration:  46%|████▌     | 1250/2714 [01:17<01:28, 16.50it/s][A
Iteration:  46%|████▌     | 1252/2714 [01:17<01:28, 16.52it/s][A
Iteration:  46%|████▌     | 1254/2714 [01:17<01:28, 16.50it/s][A
Iteration:  46%|████▋     | 1256/2714 [01:17<01:28, 16.48it/s][A
Iteration:  46%|████▋     | 1258/2714 [01:17<01:28, 16.52it/s][A
Iteration:  46%|████▋     | 1260/2714 [01:17<01:27, 16.61it/s][A
Iteration:  46%|████▋     | 1262/2714 [01:17<01:27, 16.64it/s][A
Iteration:  47%|████▋     | 1264/2714 [01:17<01:27, 16.63it/s][A
Iteration:  47%|████▋     | 1266/2714 [01:17<01:26, 16.74it/s][A
Iteration:  47%|████▋     | 1268/2714 [01:18<01:26, 16.77it/s][A
Iteration:  47%|████▋     | 1270/2714 [01:18<01:26, 16.77it/s][A
Iteration:  47%|████▋     | 1272/2714 [01:18<01:25, 16.78it/s][A
Iteration:  47%|████▋     | 1274/2714 [01:18<01:25, 16.78it/s][A
Iteration: 

In [None]:
# List the current working directory to check the project layout.
!ls

ckpt	data_loader.py	multilabel_pipeline.py	requirements.txt   utils.py
config	LICENSE		__pycache__		run_goemotions
data	model.py	README.md		run_goemotions.py


In [None]:
# Per-epoch losses entered as literals. This rebinds `epoch_losses`, replacing the
# value returned by main() in the earlier cell.
# NOTE(review): presumably copied from a previous run so the numbers survive a
# kernel restart — confirm they match the checkpoints under ckpt/.
epoch_losses = [0.28836443189836286,
 0.21052542798270527,
 0.16352840849969127,
 0.11153157365933203,
 0.07718892126239825,
 0.054478380670472455,
 0.03927105147317436,
 0.027134557569028182,
 0.016327834665628944,
 0.008877260763111577]

In [None]:
# Persist the per-epoch losses, one value per line, next to the model checkpoints.
with open("ckpt/ekman/bert-base-cased-goemotions-ekman/losses.txt", mode="w") as f_w:
    for loss in epoch_losses:
        print(loss, file=f_w)
