In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from data.constants import ALL_LABELS_SORTED
import ast
from transformers import AutoTokenizer, AutoModelForSequenceClassification,TrainingArguments,EvalPrediction
import torch
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

# Preprocess input

In [2]:
# Path to csv files
WD = '../'
data_path = WD + 'data/'
save_path=WD +'data/milestone2'
train_df = pd.read_csv(data_path + 'train_data.csv')
valid_df = pd.read_csv(data_path + 'valid_data.csv')
test_df = pd.read_csv(data_path + 'test_data.csv')

In [3]:
def preprocess_labels(df: pd.DataFrame):
    labels =  df.iloc[:, [0, 2]].copy()
    labels.Labels = labels.Labels.apply(ast.literal_eval)
    mlb = MultiLabelBinarizer(classes=list(ALL_LABELS_SORTED.keys()))
    labels_transformed = mlb.fit_transform(labels['Labels'])
    labels[mlb.classes_] = labels_transformed
    return  labels


In [4]:
train_labels=preprocess_labels(train_df)
valid_labels=preprocess_labels(valid_df)
test_labels=preprocess_labels(test_df)

train_sent=train_df['Text'].tolist()
valid_sent=valid_df["Text"].tolist()
test_sent=test_df["Text"].tolist()

model_ckpt = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt,problem_type="multi_label_classification")

train_encodings = tokenizer(train_sent,truncation=True,padding=True,return_tensors='pt')
valid_encodings = tokenizer(valid_sent, truncation=True, padding=True, return_tensors='pt')
test_encodings=tokenizer(test_sent, truncation=True, padding=True, return_tensors='pt')

In [5]:
class BRISEDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        #item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = torch.FloatTensor(self.labels.iloc[idx,2:])
        return item

    def __len__(self):
        return len(self.labels)

In [6]:
train_dataset = BRISEDataset(train_encodings, train_labels)
valid_dataset=BRISEDataset(valid_encodings,valid_labels)
test_dataset=BRISEDataset(test_encodings,test_labels)

In [7]:
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt,num_labels=len(ALL_LABELS_SORTED),problem_type="multi_label_classification").to("cuda")

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

In [8]:
model

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0): XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=Tr

In [10]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average='micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

In [11]:
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
                                           tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

In [None]:
batch_size=32
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch",
                                  num_train_epochs=6,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  optim="adamw_torch",
                                  metric_for_best_model = "f1",
                                  save_strategy = "epoch")

In [12]:
from transformers import  Trainer,ProgressCallback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    callbacks=[ProgressCallback]
)


In [13]:
trainer.train()

***** Running training *****
  Num examples = 5368
  Num Epochs = 6
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1008
  Number of trainable parameters = 278115165


  0%|          | 0/1008 [00:00<?, ?it/s]

Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 801
  Batch size = 32


{'loss': 0.1706, 'learning_rate': 4.166666666666667e-05, 'epoch': 1.0}


  0%|          | 0/26 [00:00<?, ?it/s]

{'eval_loss': 0.0674014613032341, 'eval_f1': 0.0, 'eval_roc_auc': 0.5, 'eval_accuracy': 0.3570536828963795, 'eval_runtime': 5.6425, 'eval_samples_per_second': 141.958, 'eval_steps_per_second': 4.608, 'epoch': 1.0}


Saving model checkpoint to test_trainer\checkpoint-168
Configuration saved in test_trainer\checkpoint-168\config.json
Model weights saved in test_trainer\checkpoint-168\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 801
  Batch size = 32


{'loss': 0.0692, 'learning_rate': 3.3333333333333335e-05, 'epoch': 2.0}


  0%|          | 0/26 [00:00<?, ?it/s]

{'eval_loss': 0.0633467584848404, 'eval_f1': 0.0, 'eval_roc_auc': 0.5, 'eval_accuracy': 0.3570536828963795, 'eval_runtime': 5.6609, 'eval_samples_per_second': 141.498, 'eval_steps_per_second': 4.593, 'epoch': 2.0}


Saving model checkpoint to test_trainer\checkpoint-336
Configuration saved in test_trainer\checkpoint-336\config.json
Model weights saved in test_trainer\checkpoint-336\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 801
  Batch size = 32


{'loss': 0.0672, 'learning_rate': 2.5e-05, 'epoch': 3.0}


  0%|          | 0/26 [00:00<?, ?it/s]

{'eval_loss': 0.06232898309826851, 'eval_f1': 0.0, 'eval_roc_auc': 0.5, 'eval_accuracy': 0.3570536828963795, 'eval_runtime': 5.6657, 'eval_samples_per_second': 141.378, 'eval_steps_per_second': 4.589, 'epoch': 3.0}


Saving model checkpoint to test_trainer\checkpoint-504
Configuration saved in test_trainer\checkpoint-504\config.json
Model weights saved in test_trainer\checkpoint-504\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 801
  Batch size = 32


{'loss': 0.0648, 'learning_rate': 1.6666666666666667e-05, 'epoch': 4.0}


  0%|          | 0/26 [00:00<?, ?it/s]

{'eval_loss': 0.058998193591833115, 'eval_f1': 0.0018674136321195143, 'eval_roc_auc': 0.5004290690193597, 'eval_accuracy': 0.3570536828963795, 'eval_runtime': 5.643, 'eval_samples_per_second': 141.946, 'eval_steps_per_second': 4.607, 'epoch': 4.0}


Saving model checkpoint to test_trainer\checkpoint-672
Configuration saved in test_trainer\checkpoint-672\config.json
Model weights saved in test_trainer\checkpoint-672\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 801
  Batch size = 32


{'loss': 0.0612, 'learning_rate': 8.333333333333334e-06, 'epoch': 5.0}


  0%|          | 0/26 [00:00<?, ?it/s]

{'eval_loss': 0.05636738985776901, 'eval_f1': 0.2936202920830131, 'eval_roc_auc': 0.5894424113539374, 'eval_accuracy': 0.39325842696629215, 'eval_runtime': 5.605, 'eval_samples_per_second': 142.908, 'eval_steps_per_second': 4.639, 'epoch': 5.0}


Saving model checkpoint to test_trainer\checkpoint-840
Configuration saved in test_trainer\checkpoint-840\config.json
Model weights saved in test_trainer\checkpoint-840\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 801
  Batch size = 32


{'loss': 0.0578, 'learning_rate': 0.0, 'epoch': 6.0}


  0%|          | 0/26 [00:00<?, ?it/s]

{'eval_loss': 0.05407628044486046, 'eval_f1': 0.3390357698289269, 'eval_roc_auc': 0.6024163718274428, 'eval_accuracy': 0.4232209737827715, 'eval_runtime': 5.7171, 'eval_samples_per_second': 140.106, 'eval_steps_per_second': 4.548, 'epoch': 6.0}


Saving model checkpoint to test_trainer\checkpoint-1008
Configuration saved in test_trainer\checkpoint-1008\config.json
Model weights saved in test_trainer\checkpoint-1008\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 811.7795, 'train_samples_per_second': 39.676, 'train_steps_per_second': 1.242, 'train_loss': 0.08179041317531041, 'epoch': 6.0}


TrainOutput(global_step=1008, training_loss=0.08179041317531041, metrics={'train_runtime': 811.7795, 'train_samples_per_second': 39.676, 'train_steps_per_second': 1.242, 'train_loss': 0.08179041317531041, 'epoch': 6.0})

In [14]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 801
  Batch size = 32


  0%|          | 0/26 [00:00<?, ?it/s]

{'eval_loss': 0.05407628044486046,
 'eval_f1': 0.3390357698289269,
 'eval_roc_auc': 0.6024163718274428,
 'eval_accuracy': 0.4232209737827715,
 'eval_runtime': 5.8876,
 'eval_samples_per_second': 136.048,
 'eval_steps_per_second': 4.416,
 'epoch': 6.0}

In [61]:
## tof flush cuda memory
import gc
gc.collect()
torch.cuda.empty_cache()

In [15]:
trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 665
  Batch size = 32


  0%|          | 0/21 [00:00<?, ?it/s]

PredictionOutput(predictions=array([[-2.227153  , -3.183923  , -2.8014703 , ..., -6.3214087 ,
        -6.1909866 , -6.4097013 ],
       [-2.226734  , -3.1806812 , -2.7976348 , ..., -6.3163877 ,
        -6.1874366 , -6.4073997 ],
       [-2.228924  , -3.1837628 , -2.8006928 , ..., -6.320426  ,
        -6.191559  , -6.4103484 ],
       ...,
       [ 1.0460296 , -1.3106427 , -2.706685  , ..., -5.158093  ,
        -5.1092906 , -5.2438703 ],
       [ 0.87409073, -1.4322498 , -2.7244415 , ..., -5.251674  ,
        -5.2123957 , -5.339251  ],
       [-2.226411  , -3.1841822 , -2.8010347 , ..., -6.319638  ,
        -6.1912184 , -6.409416  ]], dtype=float32), label_ids=array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), metrics={'test_loss': 0.05353706702589989, 'test_f1': 0.3233082706766917, '