In [4]:
!pip install datasets
!pip install transformers

Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[?25l[K     |█                               | 10 kB 31.1 MB/s eta 0:00:01[K     |██                              | 20 kB 37.9 MB/s eta 0:00:01[K     |███                             | 30 kB 38.4 MB/s eta 0:00:01[K     |████                            | 40 kB 25.8 MB/s eta 0:00:01[K     |█████                           | 51 kB 22.7 MB/s eta 0:00:01[K     |██████                          | 61 kB 25.9 MB/s eta 0:00:01[K     |███████                         | 71 kB 23.1 MB/s eta 0:00:01[K     |████████                        | 81 kB 24.3 MB/s eta 0:00:01[K     |█████████                       | 92 kB 26.1 MB/s eta 0:00:01[K     |██████████                      | 102 kB 26.8 MB/s eta 0:00:01[K     |███████████                     | 112 kB 26.8 MB/s eta 0:00:01[K     |████████████                    | 122 kB 26.8 MB/s eta 0:00:01[K     |█████████████                   | 133 kB 26.8 MB/s eta

###util.py

In [None]:
from os.path import abspath, dirname


def get_root_dir():
    result = dirname(abspath(__file__))
    src = "/src"
    if result.endswith(src):
        result = result[:-len(src) + 1]
    return result

###load_data.py

In [None]:
"""
MIT License

Copyright (c) 2020 CLTL Leolani
Copyright (c) 2021 Junyin Chen

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

import logging
import os
import random
from typing import Tuple

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer

import util

MELD_SPEAKER = ["Chandler", "Joey", "Monica", "Rachel", "Ross", "Phoebe"]


def set_seed(seed: int) -> None:
    """Set random seed to a fixed value.

    Set everything to be deterministic
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True


def get_emotion2id(dataset: str) -> Tuple[dict, dict]:
    """Get a dict that converts string class to number."""

    if dataset == "MELD":
        # MELD has 7 classes
        emotions = ["anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"]
        emotion2id = {emotion: idx for idx, emotion in enumerate(emotions)}
        id2emotion = {val: key for key, val in emotion2id.items()}
    return emotion2id, id2emotion


def get_speaker2id(dataset: str) -> Tuple[dict, dict]:
    """Get a dict that converts string class to number."""

    if dataset == "MELD":
        # MELD has 7 classes
        speakers = [
            "CHANDLER",
            "JOEY",
            "MONICA",
            "RACHEL",
            "ROSS",
            "PHOEBE",
            "OTHERS",
        ]
        speaker2id = {speaker: idx for idx, speaker in enumerate(speakers)}
        id2speaker = {val: key for key, val in speaker2id.items()}
    return speaker2id, id2speaker


class TextDataset(torch.utils.data.Dataset):
    def __init__(
            self,
            dataset="MELD",
            split="train",
            field="emotion",
            num_past_utterances=0,
            num_future_utterances=0,
            model_checkpoint="roberta-base",
            directory=util.get_root_dir() + "data/",
            up_to=False,
            seed=0
    ):
        """Initialize emotion recognition in conversation text modality dataset class."""

        self.dataset = dataset
        self.directory = directory
        self.split = split
        self.field = field
        self.num_past_utterances = num_past_utterances
        self.num_future_utterances = num_future_utterances
        self.model_checkpoint = model_checkpoint
        self.emotion2id, self.id2emotion = get_emotion2id(self.dataset)
        self.speaker2id, self.id2speaker = get_speaker2id(self.dataset)
        self.up_to = up_to
        self.seed = seed

        self._load_from_raw()
        self._string2tokens()

    def _load_from_raw(self):
        if self.dataset in ["MELD"]:
            # Get the full file path
            raw_path = os.path.join(self.directory, self.dataset, self.split + "_sent_emo.csv")
            # Load csv to pandas Dataframe
            raw_data = pd.read_csv(raw_path)
            # Load each field
            self.dialog_group = {}
            self.emotion = {}
            self.speaker_emotion = {}
            for _, row in raw_data.iterrows():
                utterance_id = '{}_{}_{}_{}'.format(row['Dialogue_ID'],
                                                    row['Utterance_ID'],
                                                    row['Season'],
                                                    row['Episode'])
                dialog_id = '{}_{}_{}'.format(row['Dialogue_ID'], row['Season'], row['Episode'])
                if dialog_id not in self.dialog_group:
                    self.dialog_group[dialog_id] = []
                self.dialog_group[dialog_id].append(utterance_id)
                self.emotion[utterance_id] = row['Emotion']
                utterance = row["Utterance"]
                speaker = "OTHERS"
                if row["Speaker"] in MELD_SPEAKER:
                    speaker = row["Speaker"].upper()

                self.speaker_emotion[utterance_id] = {"utterance": speaker + ": " + utterance,
                                                      "clean": utterance,
                                                      "emotion": row['Emotion'],
                                                      "speaker": speaker,
                                                      }
        else:
            raise ValueError(f"{self.dataset} is not MELD")


    def _create_input(
            self, diaids, num_past_utterances, num_future_utterances
    ):
        """Create an input which will be an input to RoBERTa."""

        args = {
            "diaids": diaids,
            "num_past_utterances": num_past_utterances,
            "num_future_utterances": num_future_utterances,
        }

        logging.debug(f"arguments given: {args}")
        tokenizer = AutoTokenizer.from_pretrained(self.model_checkpoint, use_fast=True)
        max_model_input_size = tokenizer.max_model_input_sizes[self.model_checkpoint]
        num_truncated = 0

        inputs = []
        for diaid in tqdm(diaids):
            ues = [
                self.speaker_emotion[uttid]
                for uttid in self.dialog_group[diaid]
            ]

            num_tokens = [len(tokenizer(ue["utterance"])["input_ids"]) for ue in ues]

            for idx, ue in enumerate(ues):
                if ue["emotion"] not in list(self.emotion2id.keys()):
                    continue

                emotion = self.emotion2id[ue["emotion"]]
                speaker = self.speaker2id[ue["speaker"]]

                indexes = [idx]
                indexes_past = [
                    i for i in range(idx - 1, idx - num_past_utterances - 1, -1)
                ]
                indexes_future = [
                    i for i in range(idx + 1, idx + num_future_utterances + 1, 1)
                ]

                offset = 0
                if len(indexes_past) < len(indexes_future):
                    for _ in range(len(indexes_future) - len(indexes_past)):
                        indexes_past.append(None)
                elif len(indexes_past) > len(indexes_future):
                    for _ in range(len(indexes_past) - len(indexes_future)):
                        indexes_future.append(None)

                for i, j in zip(indexes_past, indexes_future):
                    if i is not None and i >= 0:
                        indexes.insert(0, i)
                        offset += 1
                        if (
                                sum([num_tokens[idx_] for idx_ in indexes])
                                > max_model_input_size
                        ):
                            del indexes[0]
                            offset -= 1
                            num_truncated += 1
                            break
                    if j is not None and j < len(ues):
                        indexes.append(j)
                        if (
                                sum([num_tokens[idx_] for idx_ in indexes])
                                > max_model_input_size
                        ):
                            del indexes[-1]
                            num_truncated += 1
                            break

                utterances = [ues[idx_]["utterance"] for idx_ in indexes]
                clean = [ues[idx_]["clean"] for idx_ in indexes]

                if num_past_utterances == 0 and num_future_utterances == 0:
                    assert len(utterances) == 1
                    final_utterance = clean[0]

                elif num_past_utterances > 0 and num_future_utterances == 0:
                    if len(utterances) == 1:
                        final_utterance = "</s></s>" + clean[-1]
                    else:
                        final_utterance = (
                                " ".join(utterances[:-1]) + "</s></s>" + clean[-1]
                        )

                elif num_past_utterances == 0 and num_future_utterances > 0:
                    if len(utterances) == 1:
                        final_utterance = clean[0] + "</s></s>"
                    else:
                        final_utterance = (
                                clean[0] + "</s></s>" + " ".join(utterances[1:])
                        )

                elif num_past_utterances > 0 and num_future_utterances > 0:
                    if len(utterances) == 1:
                        final_utterance = "</s></s>" + clean[0] + "</s></s>"
                    else:
                        final_utterance = (
                                " ".join(utterances[:offset])
                                + "</s></s>"
                                + clean[offset]
                                + "</s></s>"
                                + " ".join(utterances[offset + 1:])
                        )
                else:
                    raise ValueError

                input_ids_attention_mask = tokenizer(final_utterance)
                input_ids = input_ids_attention_mask["input_ids"]
                attention_mask = input_ids_attention_mask["attention_mask"]

                label = emotion
                if self.field == "speaker":
                    label = speaker

                input_ = {
                    "input_ids": input_ids,
                    "attention_mask": attention_mask,
                    "label": label,
                }

                inputs.append(input_)

        logging.info(f"number of truncated utterances: {num_truncated}")
        return inputs

    def _string2tokens(self):
        """Convert string to (BPE) tokens."""
        logging.info(f"converting utterances into tokens ...")

        diaids = sorted(list(self.dialog_group.keys()))

        set_seed(self.seed)
        random.shuffle(diaids)

        if self.up_to:
            logging.info(f"Using only the first {self.up_to} dialogues ...")
            diaids = diaids[: self.up_to]

        logging.info(f"creating input utterance data ... ")
        self.inputs_ = self._create_input(
            diaids=diaids,
            num_past_utterances=self.num_past_utterances,
            num_future_utterances=self.num_future_utterances,
        )

    def __getitem__(self, index):
        return self.inputs_[index]

    def __len__(self):
        return len(self.inputs_)

###MTL/data.py

In [None]:


class SingleTaskDataLoader:
    def __init__(self, task, **kwargs):
        self.task = task
        self.data_loader = DataLoader(**kwargs)
        self.batch_size = self.data_loader.batch_size
        self.dataset = self.data_loader.dataset
        
    def __len__(self) -> int:
        return len(self.data_loader)
    
    def __iter__(self):
        for batch in self.data_loader:
            batch["task"] = self.task
            yield batch
    
class MultiTaskDataLoader:
    def __init__(self, task_data_loaders):
        self.task_data_loaders = task_data_loaders
        self.dataset = [None] * sum([len(dl.dataset) for dl in task_data_loaders.values()])
        
    def __len__(self) -> int:
        return sum([len(dl) for dl in self.task_data_loaders.values()])
    
    def __iter__(self):
        task_choices = []
        for task, dl in self.task_data_loaders.items():
            task_choices.extend([task] * len(dl))
        task_choices = np.array(task_choices)
        np.random.shuffle(task_choices)
        for task in task_choices:
            yield next(iter(self.task_data_loaders[task]))

###MTL/model.py

In [None]:


class MultiTaskModel(PreTrainedModel):
    def __init__(self, encoder, task_models):
        super(MultiTaskModel, self).__init__(PretrainedConfig())
        self.encoder = encoder
        self.task_models = nn.ModuleDict(task_models)
        
    @classmethod
    def from_task_models(cls, task_models):
        shared_encoder = None
        for model in task_models.values():
            if shared_encoder is None:
                shared_encoder = getattr(model, cls.get_encoder_attr_name(model))
            else:
                setattr(model, cls.get_encoder_attr_name(model), shared_encoder)
        return cls(shared_encoder, task_models)
                  
    @staticmethod
    def get_encoder_attr_name(model):
        model_name = model.__class__.__name__
        if model_name.startswith('Bert'):
            return 'bert'
        elif model_name.startswith('Roberta'):
            return 'roberta'
        elif model_name.startswith('Albert'):
            return 'albert'
        else:
            raise ValueError('Unsupported model: {}'.format(model_name))
        
    def forward(self, task, input_ids, attention_mask, **kwargs):
        model = self.task_models[task]
        return model(input_ids, attention_mask, **kwargs)

###MLT/train.py

In [None]:


class MultiTaskTrainer(Trainer):

    def get_single_task_dataloader(self, task, dataset, description):
        if description == "training" and self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")
        elif description == "evaluation" and dataset is None and self.eval_dataset is None:
            raise ValueError("Trainer: evaluation requires an eval_dataset.")

        if is_datasets_available() and isinstance(dataset, datasets.Dataset):
            dataset = self._remove_unused_columns(dataset, description=description)

        if isinstance(dataset, torch.utils.data.IterableDataset):
            if self.args.world_size > 1:
                dataset = IterableDatasetShard(
                    dataset,
                    batch_size=self.args.train_batch_size,
                    drop_last=self.args.dataloader_drop_last,
                    num_processes=self.args.world_size,
                    process_index=self.args.process_index,
                )

            return SingleTaskDataLoader(
                task,
                dataset=dataset,
                batch_size=self.args.per_device_train_batch_size,
                collate_fn=self.data_collator,
                num_workers=self.args.dataloader_num_workers,
                pin_memory=self.args.dataloader_pin_memory,
            )
        
        if description == "training":
            self.train_dataset, dataset = dataset, self.train_dataset
            sampler = self._get_train_sampler()
            self.train_dataset, dataset = dataset, self.train_dataset
            batch_size = self.args.train_batch_size
        else:
            sampler = self._get_eval_sampler(dataset)
            batch_size = self.args.eval_batch_size
            
        return SingleTaskDataLoader(
            task,
            dataset=dataset,
            batch_size=batch_size,
            sampler=sampler,
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
        )


    def get_train_dataloader(self):
        return MultiTaskDataLoader({
            task: self.get_single_task_dataloader(task, dataset, description="training")
            for task, dataset in self.train_dataset.items()
        })
        
    def get_eval_dataloader(self, eval_dataset=None):
        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
        task_to_eval = eval_dataset["task"]
        return self.get_single_task_dataloader(task_to_eval, eval_dataset[task_to_eval], description="evaluation")
        
    
    def get_test_dataloader(self, test_dataset):
        task_to_test = test_dataset["task"]
        return self.get_single_task_dataloader(task_to_test, test_dataset[task_to_test], description="test")


###pipeline.py

In [1]:
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import datasets
from datasets import load_dataset
from datasets import ClassLabel, load_metric

from transformers import Trainer
from transformers import is_datasets_available
from transformers import PreTrainedModel, PretrainedConfig
from transformers.trainer_pt_utils import IterableDatasetShard
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments

from load_data import TextDataset
import util

In [2]:
args = vars(util.get_args())

train_dataset = {
        "emotion": TextDataset(
            dataset="MELD",
            split="train",
            field="emotion",
            seed=args["seed"],
            directory=args["data_dir"],
            num_past_utterances=args["num_past_utterances"],
            num_future_utterances=args["num_future_utterances"],
        ),
        "speaker": TextDataset(
            dataset="MELD",
            split="train",
            field="speaker",
            seed=args["seed"],
            directory=args["data_dir"],
            num_past_utterances=args["num_past_utterances"],
            num_future_utterances=args["num_future_utterances"],
        )
    }
eval_dataset = {"emotion": TextDataset(
      dataset="MELD",
      split="dev",
      field="emotion",
      seed=args["seed"],
      directory=args["data_dir"],
      num_past_utterances=args["num_past_utterances"],
      num_future_utterances=args["num_future_utterances"],
  ), "speaker": TextDataset(
        dataset="MELD",
        split="dev",
        field="speaker",
        seed=args["seed"],
        directory=args["data_dir"],
        num_past_utterances=args["num_past_utterances"],
        num_future_utterances=args["num_future_utterances"],
    ), "task": args["evaluation"]}

checkpoint = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

labels = {
        "speaker": ClassLabel(num_classes=7,
                              names=["Chandler", "Joey", "Monica", "Rachel", "Ross", "Phoebe", "Others"]),
        "emotion": ClassLabel(num_classes=7,
                              names=["anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"])
    }

usage: ipykernel_launcher.py [-h] [--seed SEED]
                             [--num_past_utterances NUM_PAST_UTTERANCES]
                             [--num_future_utterances NUM_FUTURE_UTTERANCES]
                             [--data_dir DATA_DIR] [--do_train]
                             [--evaluation EVALUATION] [--train_dir TRAIN_DIR]
                             [--model_file MODEL_FILE] [--epoch EPOCH]
                             [--learning_rate LEARNING_RATE]
                             [--batch_size BATCH_SIZE]
ipykernel_launcher.py: error: unrecognized arguments: -f /Users/junyinchen/Library/Jupyter/runtime/kernel-6c614347-4ef4-4d55-894b-622fe3216e25.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
single_task_models = {
        task: AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=label.num_classes)
        for task, label in labels.items()
    }
multi_task_model = MultiTaskModel.from_single_task_models(single_task_models)


Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

In [None]:
def compute_metrics(eval_preds):
    metric = load_metric("f1")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average="weighted")

training_args = TrainingArguments(
    output_dir="../outputs/multi_task_model",
    overwrite_output_dir=True,
    label_names=["labels"],
    learning_rate=1e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,  
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    remove_unused_columns=False,
    load_best_model_at_end=True
)

trainer = MultiTaskTrainer(
    multi_task_model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

do_train = True

if do_train:
    trainer.train()

***** Running training *****
  Num examples = 19978
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 7494


Epoch,Training Loss,Validation Loss,F1
1,0.9307,1.194585,0.577457
2,0.6912,1.299061,0.571411
3,0.5617,1.338199,0.582055


***** Running Evaluation *****
  Num examples = 1109
  Batch size = 8


Downloading builder script:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Saving model checkpoint to ../outputs/multi_task_model/checkpoint-2498
Configuration saved in ../outputs/multi_task_model/checkpoint-2498/config.json
Model weights saved in ../outputs/multi_task_model/checkpoint-2498/pytorch_model.bin
tokenizer config file saved in ../outputs/multi_task_model/checkpoint-2498/tokenizer_config.json
Special tokens file saved in ../outputs/multi_task_model/checkpoint-2498/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1109
  Batch size = 8
Saving model checkpoint to ../outputs/multi_task_model/checkpoint-4996
Configuration saved in ../outputs/multi_task_model/checkpoint-4996/config.json
Model weights saved in ../outputs/multi_task_model/checkpoint-4996/pytorch_model.bin
tokenizer config file saved in ../outputs/multi_task_model/checkpoint-4996/tokenizer_config.json
Special tokens file saved in ../outputs/multi_task_model/checkpoint-4996/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1109
  Batch size = 8
Sa

In [None]:
f1 = trainer.predict(test_dataset).metrics['test_f1']
print("Weighted F1:", f1)

***** Running Prediction *****
  Num examples = 2610
  Batch size = 8


Weighted F1: 0.6040721346150377
