In [1]:
import os
import fire
import argparse

import numpy as np
import pandas as pd
import random

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import (
    Dataset, 
    DataLoader
)

import torch.nn.functional as F
from tqdm import tqdm, trange

from transformers import (
    RobertaTokenizer,
    set_seed
)

# Config

In [2]:
# Names of the dataframe columns. Only the literal 'text' / 'category' / 'label'
# column names are visible in the helpers below; presumably these constants are
# consumed further down the notebook -- TODO confirm.
INPUT_COLUMN = 'text'
DATA_COLUMN = 'category'
OUTPUT_COLUMN = 'label'

# Training hyper-parameters.
NUM_EPOCHS = 2
MAX_LEN = 256
BATCH_SIZE = 32

# Mixup settings, read as module-level globals by the sequence-classification model.
LAMBDA = 0.5      # weight of the first example when mixing representations and losses
FLAG = False      # when True the model adds a dense/layernorm/dropout block on the mixed output
MIXUP_START = 10  # not referenced in the visible model code; presumably the step/epoch at which mixup begins -- TODO confirm

# Echo the active configuration, one "NAME:  value" line per setting.
print("\n".join(
    f"{setting}:  {current}"
    for setting, current in (
        ("INPUT_COLUMN", INPUT_COLUMN),
        ("DATA_COLUMN", DATA_COLUMN),
        ("OUTPUT_COLUMN", OUTPUT_COLUMN),
        ("NUM_EPOCHS", NUM_EPOCHS),
        ("MAX_LEN", MAX_LEN),
        ("BATCH_SIZE", BATCH_SIZE),
        ("LAMBDA", LAMBDA),
        ("FLAG", FLAG),
        ("MIXUP_START", MIXUP_START),
    )
))

INPUT_COLUMN:  text
DATA_COLUMN:  category
OUTPUT_COLUMN:  label
NUM_EPOCHS:  2
MAX_LEN:  256
BATCH_SIZE:  32
LAMBDA:  0.5
FLAG:  False
MIXUP_START:  10


# Mixup Roberta

In [3]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import List, Optional, Tuple, Union

from transformers.models.roberta.modeling_roberta import (
    RobertaPreTrainedModel,
    RobertaLayer,
    RobertaEmbeddings,
    RobertaPooler,
    RobertaClassificationHead
)

from transformers.modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    SequenceClassifierOutput
)

# from config import *


class RobertaMixerEncoder(nn.Module):
    """Sequential stack of ``config.num_hidden_layers`` ``RobertaLayer`` modules.

    Each layer receives the hidden states produced by the previous one. When
    ``self.gradient_checkpointing`` is enabled and the module is in training
    mode, every layer call is routed through ``torch.utils.checkpoint``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([RobertaLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        """Run ``hidden_states`` through every layer in order.

        Args:
            hidden_states: input to the first layer.
            attention_mask: forwarded unchanged to every layer.
            head_mask: indexed per layer (``head_mask[i]``) when not ``None``.
            encoder_hidden_states / encoder_attention_mask: forwarded unchanged
                to every layer.
            past_key_values: indexed per layer (``past_key_values[i]``) when
                not ``None``.
            use_cache: when truthy, the last element of each layer's output is
                collected and returned as ``past_key_values``. Forced to
                ``False`` when gradient checkpointing is active.
            output_attentions: collect element 1 of each layer's output (and
                element 2 when ``config.add_cross_attention`` is set).
            output_hidden_states: collect the input of every layer plus the
                final output.
            return_dict: return a ``BaseModelOutputWithPastAndCrossAttentions``
                instead of a tuple of the non-``None`` values.

        Returns:
            The model output object, or a tuple of its non-``None`` fields in
            the order (last hidden state, cache, hidden states, self
            attentions, cross attentions).
        """
        checkpointing = self.gradient_checkpointing and self.training
        if checkpointing:
            # Imported locally so this block does not depend on `torch.utils.checkpoint`
            # having been loaded as a side effect of some other import.
            from torch.utils.checkpoint import checkpoint

            if use_cache:
                # The original code referenced an undefined `logger` here, which raised
                # NameError instead of warning. The check also runs before the cache
                # tuple is created so no empty `()` cache leaks into the output.
                import logging

                logging.getLogger(__name__).warning(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        next_decoder_cache = () if use_cache else None
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            if checkpointing:

                def create_custom_forward(module):
                    # Non-tensor arguments are bound through the closure; only the
                    # tensors are passed positionally through `checkpoint`.
                    def custom_forward(*inputs):
                        return module(*inputs, past_key_value, output_attentions)

                    return custom_forward

                layer_outputs = checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


class RobertaMixerModel(RobertaPreTrainedModel):
    """
    RoBERTa backbone built from `RobertaEmbeddings`, a `RobertaMixerEncoder` layer stack and an optional
    `RobertaPooler`.

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.
    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
    """

    _keys_to_ignore_on_load_missing = [r"position_ids"]

    # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Roberta
    def __init__(self, config, add_pooling_layer=True):
        """Build embeddings, the mixer encoder and (unless `add_pooling_layer` is False) the pooler."""
        super().__init__(config)
        self.config = config

        self.embeddings = RobertaEmbeddings(config)
        self.encoder = RobertaMixerEncoder(config)

        # `self.pooler` is None when pooling is disabled; `forward` then returns `pooler_output=None`.
        self.pooler = RobertaPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the word-embedding module of the embedding layer."""
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        """Replace the word-embedding module of the embedding layer with `value`."""
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)



    # Copied from transformers.models.bert.modeling_bert.BertModel.forward
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        r"""
        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        """
        # Arguments left as None fall back to the values stored on the config.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Caching is only honoured in decoder mode; otherwise it is forced off.
        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        # Exactly one of `input_ids` / `inputs_embeds` must be given; it determines the input shape.
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # past_key_values_length
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        # Default mask: all ones over the current tokens plus any cached positions.
        if attention_mask is None:
            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)

        # Default token types: reuse the embedding layer's `token_type_ids` attribute when it has one,
        # otherwise use zeros of the input shape.
        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Element 0 of the encoder output is the final hidden state in both tuple and dict mode.
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        # Tuple mode: (sequence_output, pooled_output, *remaining encoder outputs).
        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )



class RobertaMixerForSequenceClassification(RobertaPreTrainedModel):
    """Sequence classifier that can mix the encoder outputs of two inputs (mixup).

    When a second input (`input_ids_2`, `attention_mask_2`, `labels_2`) is
    supplied, both inputs are encoded by the same `RobertaMixerModel`, their
    sequence outputs are interpolated with the module-level `LAMBDA`, and the
    loss is the same `LAMBDA`-weighted combination of the two cross-entropy
    losses. Otherwise the model classifies the first input alone.

    The module-level `FLAG` decides at construction time whether an extra
    dense -> layer-norm -> dropout block is created for the mixed
    representation, and at call time whether it is applied.
    """

    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.roberta = RobertaMixerModel(config, add_pooling_layer=False)

        # Extra projection for the mixed representation; only exists when FLAG is set.
        if FLAG:
            self.mixup_dense = nn.Linear(config.hidden_size, config.hidden_size)
            self.mixup_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
            self.mixup_dropout = nn.Dropout(config.hidden_dropout_prob)

        self.classifier = RobertaClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids_1: Optional[torch.LongTensor] = None,
        attention_mask_1: Optional[torch.FloatTensor] = None,
        input_ids_2: Optional[torch.LongTensor] = None,
        attention_mask_2: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels_1: Optional[torch.LongTensor] = None,
        labels_2: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        Args:
            input_ids_1, attention_mask_1: first (or only) input.
            input_ids_2, attention_mask_2: optional second input to mix with the first.
            token_type_ids, position_ids, head_mask, inputs_embeds: forwarded unchanged to every encoder pass.
            labels_1: class indices for the first input; flattened with `view(-1)` before the cross-entropy loss.
                Required when mixup is active.
            labels_2: class indices for the second input. Mixup is active only when `input_ids_2`,
                `attention_mask_2` and `labels_2` are all given.
            output_attentions, output_hidden_states, return_dict: forwarded to the encoder.

        Returns:
            `SequenceClassifierOutput` (or a tuple when `return_dict` is false). `hidden_states` and `attentions`
            always come from the encoder pass over the first input. The loss is always a cross-entropy loss;
            it is `None` when no labels are given outside mixup mode.

        Raises:
            ValueError: if the mixup inputs are given without `labels_1`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        use_mixup = (input_ids_2 is not None) and (attention_mask_2 is not None) and (labels_2 is not None)
        if use_mixup and labels_1 is None:
            # Previously this surfaced as an AttributeError on `None.view` after two full encoder passes.
            raise ValueError("labels_1 must be provided when input_ids_2, attention_mask_2 and labels_2 are given")

        # Arguments shared by both encoder passes. `token_type_ids` used to be accepted
        # by this method but was never forwarded to the encoder.
        shared_kwargs = dict(
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        outputs_1 = self.roberta(input_ids_1, attention_mask=attention_mask_1, **shared_kwargs)
        sequence_output_1 = outputs_1[0]

        loss = None
        if use_mixup:
            # Mixup train: encode the second input and interpolate the two sequence outputs.
            outputs_2 = self.roberta(input_ids_2, attention_mask=attention_mask_2, **shared_kwargs)
            sequence_output_2 = outputs_2[0]

            sequence_output = (LAMBDA * sequence_output_1) + ((1 - LAMBDA) * sequence_output_2)

            if FLAG:
                sequence_output = self.mixup_dense(sequence_output)
                sequence_output = self.mixup_layernorm(sequence_output)
                sequence_output = self.mixup_dropout(sequence_output)

            logits = self.classifier(sequence_output)

            # The loss is interpolated with the same weight as the representations.
            loss_fct = nn.CrossEntropyLoss()
            flat_logits = logits.view(-1, self.num_labels)
            loss = (LAMBDA * loss_fct(flat_logits, labels_1.view(-1))) + (
                (1 - LAMBDA) * loss_fct(flat_logits, labels_2.view(-1))
            )
        else:
            # Mixup eval / plain classification on the first input only.
            logits = self.classifier(sequence_output_1)
            if labels_1 is not None:
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels_1.view(-1))

        # Return logits, loss, and hidden states
        if not return_dict:
            # The backbone's tuple output is (sequence_output, pooled_output, ...); skip the first two.
            output = (logits,) + outputs_1[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs_1.hidden_states,
            attentions=outputs_1.attentions,
        )

## MixupDataset

In [4]:
# Load the tokenizer for the 'roberta-base' checkpoint.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [5]:
# def prepare_dataset(data, info_data=None, mixup=False):
    
#     # Remove none and hard examples
#     data = data[(data['category'] != 'none') & (data['category'] != 'hard')].reset_index(drop=True)

#     # Add softmax and entropy info
#     if (info_data is not None) and mixup:
#         data = pd.merge(data, info_data, on='idx')[['idx', 'text', 'label', 'category', 'softmax', 'entropy']]
#         mixup_size = len(info_data) - len(data)

#         # --------------------------------------- Same class mixup ---------------------------------------  

#         # Easy-Easy Mixup
#         easy_data = data[data['category'] == 'easy']
#         easy_low_ent_idx = easy_data.sort_values('entropy', ascending=True).head(mixup_size//3)['idx'].tolist()
#         easy_high_ent_idx = easy_data.sort_values('entropy', ascending=False).head(mixup_size//3)['idx'].tolist()
        
#         easy_mixup_data = easy_data[easy_data['idx'].isin(easy_low_ent_idx)].reset_index(drop=True)
#         random.shuffle(easy_high_ent_idx)
#         easy_mixup_data['idx_2'] = easy_high_ent_idx
#         easy_mixup_data['text_2'] = easy_mixup_data['idx_2'].apply(lambda x: easy_data[easy_data['idx'] == x]['text'].values[0])
#         easy_mixup_data['label_2'] = easy_mixup_data['idx_2'].apply(lambda x: easy_data[easy_data['idx'] == x]['label'].values[0])
#         easy_mixup_data['category_2'] = easy_mixup_data['idx_2'].apply(lambda x: easy_data[easy_data['idx'] == x]['category'].values[0])
#         easy_mixup_data['mixup_type'] = 'same_easy'
        
#         # Ambi-Ambi Mixup
#         ambiguous_data = data[data['category'] == 'ambiguous']
#         ambiguous_low_ent_idx = ambiguous_data.sort_values('entropy', ascending=True).head(mixup_size//3)['idx'].tolist()
#         ambiguous_high_ent_idx = ambiguous_data.sort_values('entropy', ascending=False).head(mixup_size//3)['idx'].tolist()
        
#         ambiguous_mixup_data = ambiguous_data[ambiguous_data['idx'].isin(ambiguous_low_ent_idx)].reset_index(drop=True)
#         random.shuffle(ambiguous_high_ent_idx)
#         ambiguous_mixup_data['idx_2'] = ambiguous_high_ent_idx
#         ambiguous_mixup_data['text_2'] = ambiguous_mixup_data['idx_2'].apply(lambda x: ambiguous_data[ambiguous_data['idx'] == x]['text'].values[0])
#         ambiguous_mixup_data['label_2'] = ambiguous_mixup_data['idx_2'].apply(lambda x: ambiguous_data[ambiguous_data['idx'] == x]['label'].values[0])
#         ambiguous_mixup_data['category_2'] = ambiguous_mixup_data['idx_2'].apply(lambda x: ambiguous_data[ambiguous_data['idx'] == x]['category'].values[0])
#         ambiguous_mixup_data['mixup_type'] = 'same_ambiguous'
        
#         same_mixup_data = pd.concat([easy_mixup_data, ambiguous_mixup_data])
        
#         # --------------------------------------- Different class mixup ---------------------------------------  
        
#         # Random easy-ambi mixup
#         different_samples = mixup_size - len(same_mixup_data)
#         easy_tuple = list(zip(easy_data['idx'].tolist(), easy_data['text'].tolist(), easy_data['label'].tolist(), easy_data['category'].tolist()))
#         ambiguous_tuple = list(zip(ambiguous_data['idx'].tolist(), ambiguous_data['text'].tolist(), ambiguous_data['label'].tolist(), ambiguous_data['category'].tolist()))
        
#         easy_data = easy_data.sample(n=different_samples//2).reset_index(drop=True)
#         ambiguous4easy = random.choices(ambiguous_tuple, weights=np.ones(len(ambiguous_tuple)), k=different_samples//2)
#         ambiguous4easy = pd.DataFrame(ambiguous4easy, columns=['idx_2', 'text_2', 'label_2', 'category_2'])
#         ambiguous4easy = pd.concat([easy_data, ambiguous4easy], axis=1).reset_index(drop=True)
#         ambiguous4easy['mixup_type'] = 'ambiguous_easy'
        
#         ambiguous_data = ambiguous_data.sample(n=different_samples//2).reset_index(drop=True)
#         easy4ambiguous = random.choices(easy_tuple, weights=np.ones(len(easy_tuple)), k=different_samples//2)
#         easy4ambiguous = pd.DataFrame(easy4ambiguous, columns=['idx_2', 'text_2', 'label_2', 'category_2'])
#         easy4ambiguous = pd.concat([ambiguous_data, easy4ambiguous], axis=1).reset_index(drop=True)
#         easy4ambiguous['mixup_type'] = 'easy_ambiguous'
        
#         return pd.concat([same_mixup_data, easy4ambiguous, ambiguous4easy]).sample(frac=1).reset_index(drop=True)

#     return data

In [6]:
def prepare_dataset_original(data, include_none=False):
    """Filter the dataset by its 'category' column, without building mixup pairs.

    Rows whose category is 'hard' are always dropped. With `include_none=True`
    the remaining rows are returned in their original order; otherwise 'none'
    rows are dropped as well and the result is shuffled with `sample(frac=1)`.
    The returned frame always has a fresh 0..n-1 index.
    """
    category = data['category']
    not_hard = category != 'hard'

    if include_none:
        return data[not_hard].reset_index(drop=True)

    not_none = category != 'none'
    kept = data[not_none & not_hard]
    return kept.sample(frac=1).reset_index(drop=True)

In [7]:
def get_label_data(df, label=0, use_entropy=False):
    """Pair every row of one label with another row of the same label.

    The rows of `df` whose 'label' equals `label` are placed side by side with
    a reordered copy of themselves whose columns carry a `_2` suffix.

    - `use_entropy=True`: the left side is sorted by ascending 'entropy' and
      the right side by descending 'entropy', so the lowest-entropy row is
      paired with the highest-entropy one. The 'softmax' and 'entropy'
      columns are suffixed too.
    - `use_entropy=False`: the right side is a random shuffle of the left
      side; only 'idx', 'text', 'label' and 'category' are suffixed.

    Returns the column-wise concatenation with a fresh 0..n-1 index.
    """
    suffixed = ['idx', 'text', 'label', 'category']
    primary = df[df['label'] == label].reset_index(drop=True)

    if use_entropy:
        suffixed = suffixed + ['softmax', 'entropy']
        primary = primary.sort_values('entropy', ascending=True).reset_index(drop=True)
        partner = primary.sort_values('entropy', ascending=False).reset_index(drop=True)
    else:
        partner = primary.sample(frac=1).reset_index(drop=True)

    partner = partner.rename(columns={name: name + '_2' for name in suffixed})
    return pd.concat([primary, partner], axis=1).reset_index(drop=True)

In [8]:
def prepare_dataset_random_mixup(data, info_data=None, include_none=False, use_label=False):
    """Build a shuffled frame of randomly paired rows for mixup.

    'hard' rows are always removed, and 'none' rows too unless `include_none`
    is set. Every remaining row is then paired with a partner whose columns
    carry a `_2` suffix:

    - `use_label=True`: partners are drawn within label 0 and within label 1
      separately via `get_label_data`.
    - `use_label=False`: partners are a random shuffle of the whole filtered
      frame.

    If the paired frame ends up shorter than the filtered data, the gap is
    filled with additional randomly sampled pairs. `info_data` is accepted
    but not used by this function.

    Returns the pairs in random order with a fresh 0..n-1 index.
    """
    second_names = {'idx': 'idx_2', 'text': 'text_2', 'label': 'label_2', 'category': 'category_2'}

    category = data['category']
    keep = category != 'hard'
    if not include_none:
        keep = (category != 'none') & keep
    data = data[keep].reset_index(drop=True)

    data_len = len(data)

    if use_label:
        per_label = [get_label_data(data, label=0), get_label_data(data, label=1)]
        final_data = pd.concat(per_label).reset_index(drop=True)
    else:
        partners = data.copy().sample(frac=1).reset_index(drop=True)
        final_data = pd.concat([data, partners.rename(columns=second_names)], axis=1)

    # Top up with extra random pairs so the result has as many rows as the filtered data.
    shortfall = data_len - len(final_data)
    left = data.sample(n=shortfall).reset_index(drop=True)
    right = data.sample(n=shortfall).reset_index(drop=True).rename(columns=second_names)
    random_subset = pd.concat([left, right], axis=1).reset_index(drop=True)

    combined = pd.concat([final_data, random_subset])
    return combined.sample(frac=1).reset_index(drop=True)

In [9]:
def _pair_within_group(group, use_label, use_entropy):
    """Pair every row of ``group`` with a partner row drawn from the same group."""
    if use_label:
        # Pairing is delegated per label; `use_entropy=True` is forwarded only
        # when requested so the helper's own default applies otherwise.
        extra = {'use_entropy': True} if use_entropy else {}
        per_label = [get_label_data(group, label=value, **extra) for value in (0, 1)]
        return pd.concat(per_label).reset_index(drop=True)

    partner_columns = {'idx': 'idx_2', 'text': 'text_2', 'label': 'label_2', 'category': 'category_2'}
    if use_entropy:
        # Lowest-entropy rows are matched with highest-entropy rows:
        # ascending order on the left, descending order on the right.
        partner_columns.update({'softmax': 'softmax_2', 'entropy': 'entropy_2'})
        group = group.sort_values('entropy', ascending=True).reset_index(drop=True)
        partners = group.sort_values('entropy', ascending=False).reset_index(drop=True)
    else:
        # Partners are a random permutation of the group.
        partners = group.sample(frac=1).reset_index(drop=True)

    partners = partners.rename(columns=partner_columns)
    return pd.concat([group, partners], axis=1).reset_index(drop=True)


def prepare_dataset_category_mixup(data, info_data=None, include_none=False, use_label=False, use_entropy=False):
    """Build mixup pairs in which both samples share the same category.

    Rows are split by ``category`` ('easy', 'ambiguous' and, optionally,
    'none'); inside each split every row is placed next to a partner whose
    columns carry a ``_2`` suffix. Each pair is tagged in ``mixup_type`` with
    ``'same_easy'``, ``'same_ambiguous'`` or ``'same_none'``. A cross-category
    easy/ambiguous mixup existed in earlier revisions of this cell but was
    disabled; only same-category pairs are produced.

    Args:
        data: Frame with ``idx``, ``text``, ``label`` and ``category`` columns.
        info_data: Frame with ``idx``, ``softmax`` and ``entropy`` columns;
            merged on ``idx`` and required when ``use_entropy`` is True.
        include_none: When True only 'hard' rows are dropped and 'none' rows
            get their own same-category pairs; otherwise 'none' and 'hard'
            rows are both dropped.
        use_label: When True pairs are built per label (0 and 1) through
            ``get_label_data``.
        use_entropy: When True partners are chosen by entropy order instead
            of at random.

    Returns:
        The shuffled pair frame with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``use_entropy`` is True but ``info_data`` is None.
    """
    if use_entropy and info_data is None:
        raise ValueError("use_entropy=True requires info_data with 'idx', 'softmax' and 'entropy' columns")

    keep_mask = data['category'] != 'hard'
    if not include_none:
        keep_mask = (data['category'] != 'none') & keep_mask
    data = data[keep_mask].reset_index(drop=True)

    if use_entropy:
        data = pd.merge(data, info_data, on='idx')[['idx', 'text', 'label', 'category', 'softmax', 'entropy']]

    def build_pairs(category):
        group = data[data['category'] == category].reset_index(drop=True)
        pairs = _pair_within_group(group, use_label, use_entropy)
        pairs['mixup_type'] = f'same_{category}'
        return pairs

    easy_pairs = build_pairs('easy')
    ambiguous_pairs = build_pairs('ambiguous')
    final_data = pd.concat([easy_pairs, ambiguous_pairs]).sample(frac=1).reset_index(drop=True)

    if include_none:
        none_pairs = build_pairs('none')
        final_data = pd.concat([final_data, none_pairs]).sample(frac=1).reset_index(drop=True)

    return final_data
        
#             # Easy-Ambi Mixup
#             easy_ambiguous_len = data_len - len(final_data)
#             easy_ambiguous_data = easy_data.sample(n=min(easy_ambiguous_len, len(easy_data)))[['idx', 'text', 'label', 'category']].reset_index(drop=True)
#             easy_ambiguous_temp = ambiguous_data.sample(n=min(easy_ambiguous_len, len(ambiguous_data)))[['idx', 'text', 'label', 'category']].reset_index(drop=True)
#             easy_ambiguous_temp = easy_ambiguous_temp.rename(columns={'idx': 'idx_2', 'text': 'text_2', 'label': 'label_2', 'category': 'category_2'})
#             easy_ambiguous_data = pd.concat([easy_ambiguous_data, easy_ambiguous_temp], axis=1)
#             easy_ambiguous_data['mixup_type'] = 'easy_ambiguous'
            
#             return pd.concat([final_data, easy_ambiguous_data]).sample(frac=1).reset_index(drop=True)


In [10]:
# def prepare_dataset(data, info_data=None, mixup=False):
    
#     # Remove none and hard examples
#     data = data[(data['category'] != 'none') & (data['category'] != 'hard')].reset_index(drop=True)

#     # Add softmax and entropy info
#     if (info_data is not None) and mixup:
#         data = pd.merge(data, info_data, on='idx')[['idx', 'text', 'label', 'category', 'softmax', 'entropy']]
#         mixup_size = len(info_data) - len(data)

#         # --------------------------------------- Same class mixup ---------------------------------------  

#         # Easy-Easy Mixup
#         easy_data = data[data['category'] == 'easy']
#         easy_low_ent_idx = easy_data.sort_values('entropy', ascending=True).head(mixup_size//2)['idx'].tolist()
#         easy_high_ent_idx = easy_data.sort_values('entropy', ascending=False).head(mixup_size//2)['idx'].tolist()
        
#         easy_mixup_data = easy_data[easy_data['idx'].isin(easy_low_ent_idx)].reset_index(drop=True)
#         random.shuffle(easy_high_ent_idx)
#         easy_mixup_data['idx_2'] = easy_high_ent_idx
#         easy_mixup_data['text_2'] = easy_mixup_data['idx_2'].apply(lambda x: easy_data[easy_data['idx'] == x]['text'].values[0])
#         easy_mixup_data['label_2'] = easy_mixup_data['idx_2'].apply(lambda x: easy_data[easy_data['idx'] == x]['label'].values[0])
#         easy_mixup_data['category_2'] = easy_mixup_data['idx_2'].apply(lambda x: easy_data[easy_data['idx'] == x]['category'].values[0])
#         easy_mixup_data['mixup_type'] = 'same_easy'
        
#         # Ambi-Ambi Mixup
#         ambiguous_data = data[data['category'] == 'ambiguous']
#         ambiguous_low_ent_idx = ambiguous_data.sort_values('entropy', ascending=True).head(mixup_size//2)['idx'].tolist()
#         ambiguous_high_ent_idx = ambiguous_data.sort_values('entropy', ascending=False).head(mixup_size//2)['idx'].tolist()
        
#         ambiguous_mixup_data = ambiguous_data[ambiguous_data['idx'].isin(ambiguous_low_ent_idx)].reset_index(drop=True)
#         random.shuffle(ambiguous_high_ent_idx)
#         ambiguous_mixup_data['idx_2'] = ambiguous_high_ent_idx
#         ambiguous_mixup_data['text_2'] = ambiguous_mixup_data['idx_2'].apply(lambda x: ambiguous_data[ambiguous_data['idx'] == x]['text'].values[0])
#         ambiguous_mixup_data['label_2'] = ambiguous_mixup_data['idx_2'].apply(lambda x: ambiguous_data[ambiguous_data['idx'] == x]['label'].values[0])
#         ambiguous_mixup_data['category_2'] = ambiguous_mixup_data['idx_2'].apply(lambda x: ambiguous_data[ambiguous_data['idx'] == x]['category'].values[0])
#         ambiguous_mixup_data['mixup_type'] = 'same_ambiguous'
        
#         return pd.concat([easy_mixup_data, ambiguous_mixup_data]).sample(frac=1).reset_index(drop=True)
    
#     return data

In [11]:
def get_count(df, label1, label2):
    """Print the fraction of pairs whose two samples agree on a column.

    For each of the two given column names, the column is compared row by row
    with its ``<name>_2`` counterpart. The results are always printed under
    the headings "same label" (for ``label1``) and "same category" (for
    ``label2``), whatever column names are actually passed.

    The comparison is vectorized, so it does not depend on the frame having
    a default 0..n-1 index. For an empty frame the fractions are undefined
    and ``nan`` is printed.

    Args:
        df: Pair frame containing ``label1``, ``label1 + '_2'``, ``label2``
            and ``label2 + '_2'`` columns.
        label1: Name of the first column to compare.
        label2: Name of the second column to compare.
    """
    total = len(df)

    def matching_fraction(column):
        if total == 0:
            return float('nan')
        matches = df[column].eq(df[f"{column}_2"])
        # int / int keeps the printed value a plain Python float.
        return int(matches.sum()) / total

    print("same label: ", matching_fraction(label1))
    print("same category: ", matching_fraction(label2))

In [12]:
# Dataset name; interpolated into the CSV paths read by the loading cells below.
data_name = 'sarcasm'

In [13]:
# Load the categorized samples for `data_name`.
# NOTE(review): absolute, machine-specific path — this cell only runs where that
# directory exists; consider a configurable data directory.
df = pd.read_csv(f'/projects/metis2/atharvak/Data_Cartography/datasets/{data_name}/{data_name}_categorized.csv')
df

Unnamed: 0,idx,text,label,category
0,0,The only thing I got from college is a caffein...,1,hard
1,1,I love it when professors draw a big question ...,1,ambiguous
2,2,Remember the hundred emails from companies whe...,1,ambiguous
3,3,Today my pop-pop told me I was not “forced” to...,1,ambiguous
4,4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,ambiguous
...,...,...,...,...
3462,3462,The population spike in Chicago in 9 months is...,0,none
3463,3463,You'd think in the second to last English clas...,0,none
3464,3464,I’m finally surfacing after a holiday to Scotl...,0,easy
3465,3465,Couldn't be prouder today. Well done to every ...,0,easy


In [14]:
# Number of rows per `category` value.
df['category'].value_counts()

ambiguous    1144
easy         1144
none          978
hard          201
Name: category, dtype: int64

In [15]:
# Load the per-sample info file for `data_name` and normalise its column names
# ('guid' -> 'idx', 'sm' -> 'softmax', 'en' -> 'entropy') so it can be merged
# with `df` on 'idx'; only idx, gold, softmax and entropy are kept.
# NOTE(review): absolute, machine-specific path, and 'final_4.csv' is hard-coded
# (presumably a specific epoch/checkpoint — confirm).
info_df = pd.read_csv(f'/projects/metis2/atharvak/Data_Cartography/dy_log/{data_name}/roberta-base/training_dynamics/final_4.csv')
info_df = info_df.rename(columns={'guid': 'idx', 'sm': 'softmax', 'en': 'entropy'})
info_df = info_df[['idx', 'gold', 'softmax', 'entropy']]
info_df

Unnamed: 0,idx,gold,softmax,entropy
0,3061,0,[0.96356308 0.03643692],0.156450
1,326,1,[0.22491207 0.77508793],0.533055
2,2802,0,[0.99220995 0.00779005],0.045580
3,365,1,[0.00963458 0.99036542],0.054316
4,2770,0,[0.99644967 0.00355033],0.023570
...,...,...,...,...
3462,2640,0,[0.96515393 0.03484607],0.151204
3463,1204,0,[0.99699631 0.00300369],0.020444
3464,1370,0,[0.99511012 0.00488988],0.030895
3465,2704,0,[0.99134994 0.00865006],0.049702


In [16]:
# Non-mixup dataset built by prepare_dataset_original (defined in an earlier cell).
# Presumably it drops the 'hard' rows like the mixup builders do — the recorded
# output no longer starts at idx 0, which is a 'hard' row; confirm against its definition.
normal_dataset = prepare_dataset_original(df, include_none=True)
normal_dataset

Unnamed: 0,idx,text,label,category
0,1,I love it when professors draw a big question ...,1,ambiguous
1,2,Remember the hundred emails from companies whe...,1,ambiguous
2,3,Today my pop-pop told me I was not “forced” to...,1,ambiguous
3,4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,ambiguous
4,5,"@jimrossignol I choose to interpret it as ""XD""...",1,ambiguous
...,...,...,...,...
3261,3462,The population spike in Chicago in 9 months is...,0,none
3262,3463,You'd think in the second to last English clas...,0,none
3263,3464,I’m finally surfacing after a holiday to Scotl...,0,easy
3264,3465,Couldn't be prouder today. Well done to every ...,0,easy


In [18]:
# Random pairing that ignores labels ('none' rows kept, 'hard' rows dropped).
# get_count prints the fraction of pairs that share a label / a category.
# `info_data` is passed but prepare_dataset_random_mixup does not use it.
random_mixup_dataset = prepare_dataset_random_mixup(df, info_data=info_df, include_none=True, use_label=False)
get_count(random_mixup_dataset, 'label', 'category')
random_mixup_dataset

same label:  0.6270667483159829
same category:  0.3303735456215554


Unnamed: 0,idx,text,label,category,idx_2,text_2,label_2,category_2
0,1514,i want :( a corn dog :(( so bad wtf :(((,0,none,2377,i keep thinking about the time that @becelliso...,0,easy
1,2108,"Hindsight is a wonderful thing, but surely the...",0,ambiguous,636,No longer a female as I refuse to wear heels e...,1,ambiguous
2,1617,Casually looking at the 06Z forecast it seems ...,0,none,2190,HAPPY TWENTIETH @ocodom24!Eat lots of Poptarts...,0,easy
3,1329,@sonofsama1 @BoqorofCeel U got more than me🥺,0,ambiguous,413,why is the weather having a mid life crisis,1,ambiguous
4,3021,"okay ""deja vu"" is doing something for me that ...",0,none,3140,"went to the bathroom at about 5am yesterday, m...",0,none
...,...,...,...,...,...,...,...,...
3261,1731,have to somehow become fluent in Spanish in th...,0,none,1103,i just sewed through my finger and my brain is...,0,ambiguous
3262,2399,"But by far, my favorite part was a very drunk ...",0,none,2125,I hired a housekeeper...I didn’t want to but I...,0,easy
3263,2700,can u believe shaz hung up on me to see if her...,0,ambiguous,2184,Nothing pisses me off more than people with do...,0,easy
3264,1038,Just came online to remind y'all that Davido i...,0,none,1524,The nice thing about being back in school is t...,0,easy


In [21]:
# Random pairing built per label (use_label=True); overwrites the previous
# `random_mixup_dataset`. The recorded "same label" fraction is 1.0.
random_mixup_dataset = prepare_dataset_random_mixup(df, info_data=info_df, include_none=True, use_label=True)
get_count(random_mixup_dataset, 'label', 'category')
random_mixup_dataset

same label:  1.0
same category:  0.535517452541335


Unnamed: 0,idx,text,label,category,idx_2,text_2,label_2,category_2
0,1043,The @theAJpub is open again @gpollakis 👀,0,none,1679,This second jab has made me feel off the box t...,0,easy
1,837,@majornelson Sounds like a strong contender fo...,1,ambiguous,613,Matt Hancock is a top shagger,1,ambiguous
2,3386,All I needed was some Lady Gaga and some Nicki...,0,none,2711,10 years today since I left school. I’m in sho...,0,easy
3,1728,i just overheard this bunch of children behind...,0,none,2386,#LoveIsland been back for one night and I'm al...,0,none
4,1598,"to be completely honest, my ego is MASSIVE and...",0,easy,3443,Only want someone who loves dancing just as mu...,0,easy
...,...,...,...,...,...,...,...,...
3261,814,tex-mex restaurants that stop serving breakfas...,1,ambiguous,446,Love that someone broke into my car this morning,1,ambiguous
3262,1827,Reminding myself that things take time &amp; I...,0,easy,926,Nothing can ruin a summer like hayfever,0,easy
3263,2312,Anyone else remember Twix Tops and M&amp;M's B...,0,none,2156,The end of of S3E2 money heist makes me v happy,0,none
3264,106,@DonaldJTrumpJr If only he had a press Corp in...,1,ambiguous,764,If anyone wants to know how my nights going I ...,1,ambiguous


In [23]:
# Same-category pairing with random partners, labels ignored.
category_mixup_dataset = prepare_dataset_category_mixup(df, info_data=info_df, include_none=True, use_label=False, use_entropy=False)
get_count(category_mixup_dataset, 'label', 'category')
category_mixup_dataset

same label:  0.8560930802204532
same category:  1.0


Unnamed: 0,idx,text,label,category,idx_2,text_2,label_2,category_2,mixup_type
0,1223,HOW DO THEY GET THEIR HORSES TO JAPAN,0,easy,3109,Give the #FarmersProtests at least half the ti...,0,easy,same_easy
1,1311,Can anyone enlighten me as to what is near Imp...,0,none,3318,every time someone asks me about getting a pie...,0,none,same_none
2,3326,Why am I sad for no reason ALL THE TIME,0,easy,2555,IM GETTING MY NAILS DONE TODAY!!!!!,0,easy,same_easy
3,2558,@RachaelSmarty Honestly cannot wait to give th...,0,none,1416,@Naughty_Dog Remake them like Spyro and Crash ...,0,none,same_none
4,517,I love when I learn about stuff I've apparentl...,1,ambiguous,617,Just watched a man smoke crack on the R train ...,1,ambiguous,same_ambiguous
...,...,...,...,...,...,...,...,...,...
3261,670,@AmazonUK why does Alexa UK not get improvemen...,1,ambiguous,350,Yay for being locked out of your own house be...,1,ambiguous,same_ambiguous
3262,2393,I despise 2021,0,none,1073,No but the worst type of boy is the one that w...,0,none,same_none
3263,2533,Resisting homemade cheesecake at work for 3 da...,0,ambiguous,578,Can’t wait to see Ed sheeran at tramlines tomoz x,1,ambiguous,same_ambiguous
3264,92,@rits_meg @freedomsenpai hot loli,1,ambiguous,220,day 5 of people being mad about ellen sitting ...,1,ambiguous,same_ambiguous


In [25]:
# Same-category pairing built per label (use_label=True), random partners;
# overwrites the previous `category_mixup_dataset`.
category_mixup_dataset = prepare_dataset_category_mixup(df, info_data=info_df, include_none=True, use_label=True, use_entropy=False)
get_count(category_mixup_dataset, 'label', 'category')
category_mixup_dataset

same label:  1.0
same category:  1.0


Unnamed: 0,idx,text,label,category,idx_2,text_2,label_2,category_2,mixup_type
0,3459,"sorry folks,,, but new t swift album bangs",0,easy,3147,i’d like to share that i am the happiest i’ve ...,0,easy,same_easy
1,1020,In terms of usability is there a more frustrat...,0,easy,1035,missed this the first time round but a convers...,0,easy,same_easy
2,1957,My mother is going to Bonnaroo today I love her,0,easy,2164,seonghwa is still one of the most gorgeous men...,0,easy,same_easy
3,412,Anyone else hear some like thunder or something?,1,ambiguous,162,no more instagram. we must all return to scrap...,1,ambiguous,same_ambiguous
4,2231,Reina when we first put on her Halloween costu...,0,easy,3154,Just a casual day at Oklahoma state university...,0,easy,same_easy
...,...,...,...,...,...,...,...,...,...
3261,1847,Finding out my internship keeps Coke products ...,0,easy,2273,I've started an etsy shop selling handmade cla...,0,easy,same_easy
3262,3430,Chloe and Toby just minding their business 🤣,0,none,1777,days before rodeo is Travis Scott's best project,0,none,same_none
3263,828,People who think cancel culture goes too far n...,1,ambiguous,5,"@jimrossignol I choose to interpret it as ""XD""...",1,ambiguous,same_ambiguous
3264,5,"@jimrossignol I choose to interpret it as ""XD""...",1,ambiguous,73,You know the wolves match is boring when you'r...,1,ambiguous,same_ambiguous


In [27]:
# Same-category pairing with partners chosen by entropy order (lowest with
# highest), labels ignored; the result also carries the softmax/entropy columns.
category_mixup_dataset = prepare_dataset_category_mixup(df, info_data=info_df, include_none=True, use_label=False, use_entropy=True)
get_count(category_mixup_dataset, 'label', 'category')
category_mixup_dataset

same label:  0.8677281077770974
same category:  1.0


Unnamed: 0,idx,text,label,category,softmax,entropy,idx_2,text_2,label_2,category_2,softmax_2,entropy_2,mixup_type
0,2356,u know those days where you just need to stare...,0,easy,[0.99150319 0.00849681],0.048974,3201,Manage your personal budget and work out how b...,0,easy,[0.99621554 0.00378446],0.024883,same_easy
1,1556,@coffeedreamer1 @kingssnacks Omg me too I have...,0,none,[0.98205954 0.01794046],0.089912,1949,"Teenage daughters and I learned how to play ""C...",0,none,[0.99273302 0.00726698],0.043026,same_none
2,1261,"bye Obama, luv u",0,none,[0.99536345 0.00463655],0.029542,1904,I have more fun with my parents and boyfriend ...,0,none,[0.9510251 0.0489749],0.195486,same_none
3,2154,Well that was humiliating for India. No clouds...,0,none,[0.99223256 0.00776744],0.045470,2957,"me: *finishes book 100 of the year, gets excit...",0,none,[0.98345804 0.01654196],0.084257,same_none
4,2259,how close i am to losing it. https://t.co/iQ5o...,0,easy,[0.99434516 0.00565484],0.034904,2697,Dating apps are like trying to find a snack in...,0,easy,[0.99468073 0.00531927],0.033159,same_easy
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3261,2343,"the media really likes the word 'allegedly', i...",0,none,[0.96134311 0.03865689],0.163652,2517,If the COVID variants make its way to the lett...,0,none,[0.9950097 0.0049903],0.031428,same_none
3262,3317,craving hot sauce,0,easy,[0.98899906 0.01100094],0.060552,1488,Excited to share the latest addition to my #et...,0,easy,[0.99682422 0.00317578],0.021438,same_easy
3263,2056,I don't follow the Women's Super League becaus...,0,easy,[0.99318085 0.00681915],0.040810,1287,A message to all Muslims and Refugees: I'm sor...,0,easy,[0.99565069 0.00434931],0.027990,same_easy
3264,2264,@humorandanimals There is nothing better on th...,0,easy,[0.99400428 0.00599572],0.036656,2033,Chris Krebs: There is no foreign power that is...,0,easy,[0.99495881 0.00504119],0.031697,same_easy


In [22]:
# Entropy-based pairing built per label; include_none is left at its default
# (False), so 'none' rows are dropped here.
# NOTE(review): the recorded output below contains 'easy_ambiguous' rows and a
# "same category" fraction below 1.0, which the current definition of
# prepare_dataset_category_mixup does not produce — the output looks stale
# (the execution count is also out of order); re-run to refresh.
category_mixup_dataset = prepare_dataset_category_mixup(df, info_data=info_df, use_label=True, use_entropy=True)
get_count(category_mixup_dataset, 'label', 'category')
category_mixup_dataset

same label:  1.0
same category:  0.8706240487062404


Unnamed: 0,idx,text,label,category,softmax,entropy,idx_2,text_2,label_2,category_2,softmax_2,entropy_2,mixup_type
0,1114,i want thai food Right Now :(,0,easy,[0.99444999 0.00555001],0.034361,1372,“there is no greater joy than to have an endle...,0,easy,[0.99459495 0.00540505],0.033607,same_easy
1,2725,my new glasses prescription is messing with my...,0,easy,[0.99613885 0.00386115],0.025309,2501,My adventure this week was going to a park and...,0,easy,[0.99174245 0.00825755],0.047832,same_easy
2,1529,Has anyone written about the complexity of sic...,0,ambiguous,[0.98996828 0.01003172],0.056147,2073,hey @phoebe_bridgers i’m free tomorrow btw. Th...,0,ambiguous,[0.98022248 0.01977752],0.097172,same_ambiguous
3,428,Just disposed of a dead opossum. Good morning!,1,ambiguous,[0.00816608 0.99183392],0.047393,797,Afghanistan just completely crumbling after sp...,1,ambiguous,[0.08395871 0.91604129],0.288333,same_ambiguous
4,2134,Entering my third year of consistent yoga prac...,0,easy,,,1538,I feel like most teens/women are not told that...,0,ambiguous,,,easy_ambiguous
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2623,3317,craving hot sauce,0,easy,[0.98899906 0.01100094],0.060552,1488,Excited to share the latest addition to my #et...,0,easy,[0.99682422 0.00317578],0.021438,same_easy
2624,2872,This bunch of players is something special. Re...,0,easy,,,2549,"if Covid didn’t happen, I’d have free travel r...",0,ambiguous,,,easy_ambiguous
2625,2287,80% of #LittleHouseonthePrairie episodes invol...,0,easy,[0.98806428 0.01193572],0.064718,3027,horseshoe crabs are so sweet and gentle i love...,0,easy,[0.99697631 0.00302369],0.020560,same_easy
2626,502,Shaving in the shower without your glasses/con...,1,ambiguous,[0.017154 0.982846],0.086746,297,The fact I nearly froze on my way to work this...,1,ambiguous,[0.01566353 0.98433647],0.080644,same_ambiguous


In [7]:
class MixupDataset(Dataset):
    """Map-style dataset that yields a tokenized sample together with its mixup partner.

    Each row of ``data`` describes one pair: the sample itself (``idx``, the
    text column, the label column) and the identifier of its partner
    (``idx_2``). Only the primary text column is tokenized; the partner's
    tokens and label are taken from the row whose ``idx`` equals this row's
    ``idx_2``, so every ``idx_2`` value must also occur in the ``idx`` column.
    The ``text_2`` / ``label_2`` columns are not read.
    """

    def __init__(
        self,
        tokenizer,
        data: pd.DataFrame,
        sampling_type: str
    ):
        """Tokenize the primary text column and index rows by ``idx``.

        Args:
            tokenizer: Object exposing ``batch_encode_plus``; the result must
                support ``['input_ids']`` and ``['attention_mask']`` lookups
                that can be indexed by row position.
            data: Pair frame with at least ``idx``, ``idx_2``, the text column
                and the label column; ``mixup_type`` and ``category`` are also
                required when ``sampling_type == 'sequential'``.
            sampling_type: ``'sequential'`` reorders rows by data type; any
                other value keeps the given row order.
        """
        # Work on a private copy so the helper column added below never leaks
        # into the caller's frame.
        self.data = data.copy()
        self.sampling_type = sampling_type

        if self.sampling_type == 'sequential':
            sorting_dict = {
                'non_mixup_easy': 0,
                'non_mixup_ambiguous': 1,
                'same_easy': 2,
                'different_easy': 3,
                'same_ambiguous': 4,
                'different_ambiguous': 5
            }
            # NOTE(review): prepare_dataset_category_mixup in this notebook emits
            # mixup_type values such as 'same_easy', which joined with the category
            # give e.g. 'same_easy_easy' — not a key above, so map() yields NaN for
            # those rows. Confirm which producer this ordering was written for.
            self.data['data_type'] = self.data['mixup_type'] + '_' + self.data['category']
            self.data = self.data.iloc[self.data.data_type.map(sorting_dict).argsort()].reset_index(drop=True)

        self.tokenizer = tokenizer
        self.tokenized_data = tokenizer.batch_encode_plus(
            self.data[INPUT_COLUMN].tolist(),
            max_length=MAX_LEN,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )

        # Row position of each sample id, built once so partner lookups are O(1)
        # and independent of the frame's index labels. setdefault keeps the first
        # occurrence when an id appears more than once.
        self._position_by_idx = {}
        for position, sample_id in enumerate(self.data['idx'].tolist()):
            self._position_by_idx.setdefault(sample_id, position)

    def __len__(
        self
    ):
        """Return the number of pair rows."""
        return len(self.data)

    def __getitem__(
        self,
        index: int
    ):
        """Return token ids, attention mask and label for a sample and its partner.

        Keys ending in ``_1`` belong to the row at ``index``; keys ending in
        ``_2`` belong to the row whose ``idx`` equals that row's ``idx_2``.

        Raises:
            KeyError: If the partner id does not occur in the ``idx`` column.
        """
        row = self.data.iloc[index]
        data = {
            'input_ids_1': self.tokenized_data['input_ids'][index].flatten(),
            'attention_mask_1': self.tokenized_data['attention_mask'][index].flatten(),
            'labels_1': torch.tensor(row[OUTPUT_COLUMN], dtype=torch.long),
        }

        partner_id = row['idx_2']
        if partner_id not in self._position_by_idx:
            raise KeyError(f"idx_2 value {partner_id!r} (row {index}) does not occur in the 'idx' column")
        index2 = self._position_by_idx[partner_id]

        data['input_ids_2'] = self.tokenized_data['input_ids'][index2].flatten()
        data['attention_mask_2'] = self.tokenized_data['attention_mask'][index2].flatten()
        data['labels_2'] = torch.tensor(self.data.iloc[index2][OUTPUT_COLUMN], dtype=torch.long)

        return data

In [8]:
# Wrap the pair frame in a MixupDataset and iterate it in its given order
# (shuffle=False) with a fixed batch size of 4.
# NOTE(review): `tokenizer` and `dataset` are not assigned in the cells shown
# here — they must already exist in the kernel; confirm this survives
# Restart & Run All.
train_dataset = MixupDataset(tokenizer=tokenizer, data=dataset, sampling_type='random')
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=False)

## Training loop

In [6]:
def train(model, tokenizer, optimizer, device, train_loader, num_epochs, output_dir):
    """Train ``model`` for ``num_epochs`` epochs, then save model and tokenizer.

    Every batch is a dict; tensor values are moved to ``device`` and list
    values are replaced by ``None`` before the dict is passed to the model as
    keyword arguments. The model output must expose the training loss under
    ``'loss'``.

    Args:
        model: Model called as ``model(**inputs)``; must provide
            ``save_pretrained``.
        tokenizer: Saved next to the model through ``save_pretrained``.
        optimizer: Optimizer stepped once per batch.
        device: Target device for the batch tensors.
        train_loader: Iterable of batch dicts.
        num_epochs: Number of passes over ``train_loader``.
        output_dir: Directory handed to both ``save_pretrained`` calls.

    Returns:
        List with the mean training loss of each epoch.

    Raises:
        ValueError: If ``train_loader`` yields no batches, since no mean loss
            can be computed.
    """
    losses = []
    train_iterator = trange(int(num_epochs), desc='Epoch')
    for _ in train_iterator:
        tr_loss = 0.0
        num_batches = 0
        model.train()
        epoch_iterator = tqdm(train_loader, desc='Training')
        for batch in epoch_iterator:
            # Non-tensor (list) entries cannot be moved to the device; the model
            # receives None for them instead.
            inputs = {
                key: (None if isinstance(value, list) else value.to(device))
                for key, value in batch.items()
            }

            optimizer.zero_grad()

            outputs = model(**inputs)
            loss = outputs['loss']

            loss.backward()
            optimizer.step()

            tr_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError('train_loader yielded no batches; cannot compute a mean training loss')

        epoch_loss = tr_loss / num_batches
        losses.append(epoch_loss)
        print('train loss: {}'.format(epoch_loss))

    # save model and tokenizer
    print('Saving model and tokenizer')

    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    return losses


In [7]:
def eval(model, eval_loader, device, with_labels=True):
    """Run ``model`` over ``eval_loader`` and return softmax probabilities.

    Each batch is expected to be a dict. List-valued entries are replaced by
    ``None`` and all other values are moved to ``device``. ``labels_1`` and
    ``labels_2`` are removed before the forward pass, so the model never
    receives them. The model output is indexed with ``['logits']``; the logits
    are cast to double and soft-maxed over dimension 1.

    When ``with_labels`` is true the mean loss and the accuracy against
    ``labels_1`` are printed. The loss is taken from ``outputs['loss']`` when
    the model provides one; otherwise it falls back to cross-entropy between
    the logits and ``labels_1``.

    NOTE(review): this function shadows the built-in ``eval``; the name is
    kept because callers depend on it.

    Args:
        model: Callable model exposing ``eval()``.
        eval_loader: Iterable of batch dicts.
        device: Target passed to ``.to(...)`` for each non-list batch value.
        with_labels: If true, ``labels_1`` must be present in every batch and
            loss/accuracy are reported.

    Returns:
        numpy.ndarray | None: Probabilities of all batches concatenated along
        axis 0, or ``None`` when ``eval_loader`` yields no batches.

    Raises:
        KeyError: If ``with_labels`` is true and a batch lacks ``labels_1``.
    """
    model.eval()

    prob_chunks = []
    label_chunks = []
    eval_loss = 0.0
    num_steps = 0

    with torch.no_grad():
        for batch in tqdm(eval_loader, desc='Evaluating'):
            # Same batch handling as the training loop: lists become None.
            inputs = {
                key: None if isinstance(value, list) else value.to(device)
                for key, value in batch.items()
            }

            # Labels are withheld from the forward pass. They are mandatory
            # only when metrics are requested.
            if with_labels:
                labels = inputs.pop('labels_1')
            else:
                labels = inputs.pop('labels_1', None)
            inputs.pop('labels_2', None)

            outputs = model(**inputs)

            logits = outputs['logits'].double().to(device)
            prob_chunks.append(F.softmax(logits, dim=1).detach().cpu().numpy())
            num_steps += 1

            if with_labels:
                label_chunks.append(labels.detach().cpu().numpy())

                # The model was called without labels, so it may not return
                # a loss; in that case score the logits directly.
                loss = outputs['loss'] if 'loss' in outputs else None
                if loss is None:
                    loss = F.cross_entropy(logits, labels)
                eval_loss += loss.item()

    if num_steps == 0:
        # Nothing was evaluated: there are no probabilities or metrics.
        return None

    # A single concatenate avoids re-copying the growing array every batch.
    probs = np.concatenate(prob_chunks, axis=0)

    if with_labels:
        gold_labels = np.concatenate(label_chunks, axis=0)

        eval_loss /= num_steps
        print('eval loss: {}'.format(eval_loss))

        # compute accuracy
        preds = np.argmax(probs, axis=1)
        accuracy = np.sum(preds == gold_labels)/len(preds)
        print('eval accuracy: {}'.format(accuracy))

    return probs

In [1]:
import json
import pandas as pd

In [2]:
# Load the IMDB region file (top-level keys are printed below). The `with`
# block closes the file on exit, so no explicit close() call is needed.
with open('../dy_log/imdb/roberta-base/three_regions_data_indices.json', 'r') as f:
    data_imdb = json.load(f)
print(data_imdb.keys())

df_imdb = pd.read_csv('/projects/ogma3/atharvak/Data_Cartography/datasets/imdb/train.csv')
# (train rows, #easy, #ambiguous, #hard, easy share of the train set)
len(df_imdb), len(data_imdb['easy']), len(data_imdb['ambiguous']), len(data_imdb['hard']), len(data_imdb['easy'])/len(df_imdb)

dict_keys(['hard', 'easy', 'ambiguous'])


(25000, 8250, 8250, 8250, 0.33)

In [6]:
# Pairwise overlap of the IMDB regions, in the order
# (easy & ambiguous, easy & hard, hard & ambiguous).
(
    set(data_imdb['easy']) & set(data_imdb['ambiguous']),
    set(data_imdb['easy']) & set(data_imdb['hard']),
    set(data_imdb['hard']) & set(data_imdb['ambiguous']),
)

(set(),
 set(),
 {0,
  13,
  14,
  20,
  23,
  32,
  39,
  48,
  50,
  52,
  53,
  55,
  61,
  65,
  69,
  70,
  74,
  77,
  79,
  80,
  81,
  84,
  87,
  89,
  90,
  92,
  97,
  98,
  105,
  108,
  109,
  122,
  123,
  124,
  130,
  132,
  133,
  136,
  138,
  142,
  156,
  157,
  158,
  163,
  167,
  168,
  171,
  173,
  179,
  180,
  188,
  190,
  198,
  199,
  200,
  224,
  228,
  233,
  236,
  245,
  246,
  252,
  255,
  257,
  260,
  263,
  264,
  267,
  271,
  278,
  279,
  280,
  281,
  285,
  289,
  292,
  299,
  301,
  304,
  306,
  313,
  318,
  336,
  339,
  340,
  342,
  345,
  346,
  348,
  354,
  362,
  366,
  367,
  371,
  373,
  375,
  377,
  378,
  384,
  387,
  394,
  397,
  403,
  405,
  406,
  408,
  410,
  413,
  418,
  419,
  421,
  427,
  430,
  440,
  441,
  446,
  455,
  457,
  459,
  461,
  464,
  469,
  471,
  474,
  475,
  476,
  481,
  483,
  488,
  492,
  497,
  498,
  499,
  501,
  504,
  509,
  511,
  515,
  516,
  518,
  522,
  526,
  527,
  528,
  532

In [3]:
# Load the SST-2 region file (top-level keys are printed below). The `with`
# block closes the file on exit, so no explicit close() call is needed.
with open('../dy_log/sst2/roberta-base/three_regions_data_indices.json', 'r') as f:
    data_sst2 = json.load(f)
print(data_sst2.keys())
df_sst2 = pd.read_csv('/projects/ogma3/atharvak/Data_Cartography/datasets/sst2/train.csv')
# (train rows, #easy, #ambiguous, #hard, easy share of the train set)
len(df_sst2), len(data_sst2['easy']), len(data_sst2['ambiguous']), len(data_sst2['hard']), len(data_sst2['easy'])/len(df_sst2)

dict_keys(['hard', 'easy', 'ambiguous'])


(67349, 22225, 22225, 22225, 0.3299974758348305)

In [4]:
# Load the Yelp Polarity region file (top-level keys are printed below). The
# `with` block closes the file on exit, so no explicit close() call is needed.
with open('../dy_log/yelp_polarity/roberta-base/three_regions_data_indices.json', 'r') as f:
    data_yelp_polarity = json.load(f)
print(data_yelp_polarity.keys())
df_yelp_polarity = pd.read_csv('/projects/ogma3/atharvak/Data_Cartography/datasets/yelp_polarity/train.csv')
# (train rows, #easy, #ambiguous, #hard, easy share of the train set)
len(df_yelp_polarity), len(data_yelp_polarity['easy']), len(data_yelp_polarity['ambiguous']), len(data_yelp_polarity['hard']), len(data_yelp_polarity['easy'])/len(df_yelp_polarity)

dict_keys(['hard', 'easy', 'ambiguous'])


(560000, 184800, 184800, 184800, 0.33)