# Load Dataset from HF
Loading from Quesmed organization

In [1]:
from datasets import load_dataset

ds = load_dataset('quesmed/comment_sentiment', token=True)
ds

DatasetDict({
    test: Dataset({
        features: ['id', 'createdAt', 'userId', 'userCreatedAt', 'classYear', 'universityId', 'country', 'universityName', 'parentId', 'questionId', 'comment', 'review', 'negative', 'neutral', 'positive', 'tone', 'sadness', 'joy', 'love', 'anger', 'fear', 'surprise', 'emotion', 'educational', 'giving feedback', 'asking a question', 'insulting', 'supporting', 'humour', 'frustration', 'theme'],
        num_rows: 15
    })
    train: Dataset({
        features: ['id', 'createdAt', 'userId', 'userCreatedAt', 'classYear', 'universityId', 'country', 'universityName', 'parentId', 'questionId', 'comment', 'review', 'negative', 'neutral', 'positive', 'tone', 'sadness', 'joy', 'love', 'anger', 'fear', 'surprise', 'emotion', 'educational', 'giving feedback', 'asking a question', 'insulting', 'supporting', 'humour', 'frustration', 'theme'],
        num_rows: 120
    })
    validate: Dataset({
        features: ['id', 'createdAt', 'userId', 'userCreatedAt', 'classY

In [2]:
from datasets import Dataset
def isolate_dataset(ds: Dataset, feature: str):
    cols = ds.column_names['train']
    col_keep = {'comment', feature}
    
    ds_filter = ds.remove_columns(col_keep.symmetric_difference(cols))
    ds_filter = ds_filter.rename_column(feature, 'label')
    ds_filter = ds_filter.class_encode_column('label')

    return ds_filter

## Setup model and trainer

In [3]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import torch

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

def init_model(model_path: str):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    config = AutoConfig.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    return (tokenizer, config, model)

In [4]:
def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")

In [5]:
from transformers import Trainer, TrainingArguments, logging
from datasets import Dataset
import numpy as np
import evaluate

metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

logging.set_verbosity_error()

def setup_trainer(name: str, dataset: Dataset, model, tokenizer):
    model_name = f"fine-tuning-chkp/{name}"

    training_args = TrainingArguments(
        output_dir=model_name,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_steps=0.2,
        num_train_epochs=5,
        learning_rate=2e-5,
        weight_decay=0.01,
        metric_for_best_model="accuracy",
        load_best_model_at_end=True,
        disable_tqdm=False,
        use_mps_device=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['validate'],
        compute_metrics=compute_metrics,
        tokenizer=tokenizer
    )

    return trainer

# Fine-tuning Tone

In [6]:
tone_tokenizer, tone_config, tone_model = init_model("cardiffnlp/twitter-roberta-base-sentiment-latest")

In [7]:
ds_tone = isolate_dataset(ds, 'tone')

ds_tone = ds_tone.map(
  lambda row: tone_tokenizer(row['comment'], max_length=512, padding='max_length', truncation=True, return_tensors='pt'), 
  batched=True,
  remove_columns=['comment']
)

ds_tone['train'].features

Casting to class labels:   0%|          | 0/15 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/120 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

{'label': ClassLabel(names=['negative', 'neutral', 'positive'], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [8]:
tone_trainer = setup_trainer('tone', dataset=ds_tone, model=tone_model, tokenizer=tone_tokenizer)



In [9]:
result = tone_trainer.train()
print_summary(result)



  0%|          | 0/75 [00:00<?, ?it/s]

{'loss': 0.7455, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.514501690864563, 'eval_accuracy': 0.7333333333333333, 'eval_runtime': 0.525, 'eval_samples_per_second': 28.572, 'eval_steps_per_second': 3.81, 'epoch': 1.0}
{'loss': 0.3538, 'learning_rate': 1.2e-05, 'epoch': 2.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.5528842806816101, 'eval_accuracy': 0.7333333333333333, 'eval_runtime': 0.3137, 'eval_samples_per_second': 47.823, 'eval_steps_per_second': 6.376, 'epoch': 2.0}
{'loss': 0.1785, 'learning_rate': 8.000000000000001e-06, 'epoch': 3.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.6121124625205994, 'eval_accuracy': 0.8, 'eval_runtime': 0.3222, 'eval_samples_per_second': 46.561, 'eval_steps_per_second': 6.208, 'epoch': 3.0}
{'loss': 0.0672, 'learning_rate': 4.000000000000001e-06, 'epoch': 4.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.673171877861023, 'eval_accuracy': 0.8666666666666667, 'eval_runtime': 0.3303, 'eval_samples_per_second': 45.409, 'eval_steps_per_second': 6.054, 'epoch': 4.0}
{'loss': 0.0508, 'learning_rate': 0.0, 'epoch': 5.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.6994945406913757, 'eval_accuracy': 0.8666666666666667, 'eval_runtime': 0.3382, 'eval_samples_per_second': 44.357, 'eval_steps_per_second': 5.914, 'epoch': 5.0}
{'train_runtime': 70.4994, 'train_samples_per_second': 8.511, 'train_steps_per_second': 1.064, 'train_loss': 0.27914314905802406, 'epoch': 5.0}
Time: 70.50
Samples/second: 8.51


In [11]:
print(tone_trainer.state.best_model_checkpoint)
tone_trainer.save_model('fine-tuning-final/tone')

'fine-tuning-chkp/tone/checkpoint-60'

# Emotion fine-tuning

In [12]:
emotion_tokenizer, emotion_config, emotion_model = init_model("bhadresh-savani/distilbert-base-uncased-emotion")


In [13]:
# emotion_labels = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

ds_emotion = isolate_dataset(ds, 'emotion')

ds_emotion = ds_emotion.map(
  lambda row: emotion_tokenizer(row['comment'], max_length=512, padding='max_length', truncation=True, return_tensors='pt'), 
  batched=True,
  remove_columns=['comment']
)

ds_emotion['train'].features

Casting to class labels:   0%|          | 0/15 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/120 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

{'label': ClassLabel(names=['?puzzled', 'anger', 'fear', 'joy', 'sadness', 'surprise'], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [14]:
emotion_trainer = setup_trainer('emotion', dataset=ds_emotion, model=emotion_model, tokenizer=emotion_tokenizer)



In [15]:
result = emotion_trainer.train()
print_summary(result)



  0%|          | 0/75 [00:00<?, ?it/s]

{'loss': 2.2664, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 1.4387818574905396, 'eval_accuracy': 0.4, 'eval_runtime': 0.2204, 'eval_samples_per_second': 68.062, 'eval_steps_per_second': 9.075, 'epoch': 1.0}
{'loss': 1.4542, 'learning_rate': 1.2e-05, 'epoch': 2.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 1.3942002058029175, 'eval_accuracy': 0.4, 'eval_runtime': 0.1956, 'eval_samples_per_second': 76.675, 'eval_steps_per_second': 10.223, 'epoch': 2.0}
{'loss': 1.2952, 'learning_rate': 8.000000000000001e-06, 'epoch': 3.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 1.343334436416626, 'eval_accuracy': 0.4, 'eval_runtime': 0.1937, 'eval_samples_per_second': 77.431, 'eval_steps_per_second': 10.324, 'epoch': 3.0}
{'loss': 1.1394, 'learning_rate': 4.000000000000001e-06, 'epoch': 4.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 1.330267071723938, 'eval_accuracy': 0.4666666666666667, 'eval_runtime': 0.1954, 'eval_samples_per_second': 76.78, 'eval_steps_per_second': 10.237, 'epoch': 4.0}
{'loss': 1.0582, 'learning_rate': 0.0, 'epoch': 5.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 1.3357529640197754, 'eval_accuracy': 0.4666666666666667, 'eval_runtime': 0.1959, 'eval_samples_per_second': 76.58, 'eval_steps_per_second': 10.211, 'epoch': 5.0}
{'train_runtime': 36.6196, 'train_samples_per_second': 16.385, 'train_steps_per_second': 2.048, 'train_loss': 1.4426724370320638, 'epoch': 5.0}
Time: 36.62
Samples/second: 16.39


In [17]:
print(emotion_trainer.state.best_model_checkpoint)
emotion_trainer.save_model('fine-tuning-final/emotion')

fine-tuning-chkp/emotion/checkpoint-60


# Theme fine-tuning

In [53]:
feature = 'theme'
cols = ds.column_names['train']
col_keep = {'text', feature}

ds_theme = ds.remove_columns(col_keep.symmetric_difference(cols))
ds_theme = ds_theme.rename_column(feature, 'label')

In [60]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import random


theme_tokenizer, theme_config, theme_model = init_model("facebook/bart-large-mnli")
# Linear(in_features=1024, out_features=3, bias=True)
# {0: 'contradiction', 1: 'neutral', 2: 'entailment'}

theme_labels = ['clinical update', 'community', 'question', 'education', 'advocating', 'dissuading', 'other']
num_labels = len(theme_labels)
template="This example is {}."

def create_input_sequence(sample):
    text = sample['text']
    label = sample['label'][0]
    contradiction_labels = theme_labels[:]
    label_idx = contradiction_labels.index(label)
    contradiction_labels.pop(label_idx)

    encoded_sequence = theme_tokenizer(
        text,
        [template.format(label)],
        # max_length=512,
        # padding='max_length', 
        truncation=True, 
        return_tensors='pt'
    )
    encoded_sequence['labels'] = [2]
    encoded_sequence['input_sentence'] = theme_tokenizer.batch_decode(encoded_sequence.input_ids)
    return encoded_sequence

ds_theme_encoded = ds_theme.map(
    create_input_sequence, 
    batched=True, 
    batch_size=1,
    remove_columns=["label", "text"]
)

ds_theme_encoded


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'input_sentence'],
        num_rows: 20
    })
    validate: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'input_sentence'],
        num_rows: 3
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'input_sentence'],
        num_rows: 3
    })
})

In [61]:
ds_theme_encoded['train'][0]

{'input_ids': [0,
  500,
  1949,
  213,
  449,
  2013,
  116,
  50118,
  100,
  56,
  10,
  21431,
  24904,
  626,
  11,
  1824,
  4,
  1308,
  284,
  8,
  38,
  439,
  7,
  5,
  213,
  449,
  2013,
  1349,
  147,
  51,
  56,
  41,
  33638,
  8,
  28445,
  9668,
  4,
  2041,
  137,
  94,
  38,
  4024,
  10,
  213,
  449,
  2013,
  8,
  38,
  2145,
  38,
  1705,
  17,
  27,
  90,
  269,
  2842,
  5,
  1123,
  26965,
  8,
  20789,
  142,
  9,
  141,
  1359,
  127,
  124,
  16,
  6,
  53,
  961,
  1493,
  198,
  162,
  115,
  2842,
  24,
  95,
  2051,
  19,
  49,
  15145,
  18822,
  4,
  85,
  938,
  17,
  27,
  90,
  14,
  38,
  21,
  765,
  6,
  24,
  21,
  14,
  38,
  1705,
  17,
  27,
  90,
  20789,
  4,
  6233,
  1268,
  1493,
  655,
  2984,
  42,
  116,
  7698,
  47,
  3068,
  213,
  449,
  7870,
  114,
  47,
  17,
  27,
  548,
  56,
  42,
  1907,
  9,
  3012,
  116,
  2,
  2,
  713,
  1246,
  16,
  864,
  4,
  2],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,


In [62]:
premise = ds_theme['train'][0]['text']
template= "This example is {}."
hypothesis = template.format(ds_theme['train'][0]['label'])

# run through model pre-trained on MNLI
x = theme_tokenizer(premise, hypothesis, 
                           truncation_strategy='only_first',
        return_tensors='pt')
x
# logits = theme_model(x.to(device))[0]

# # we throw away "neutral" (dim 1) and take the probability of
# # "entailment" (2) as the probability of the label being true 
# entail_contradiction_logits = logits[:,[0,2]]
# probs = entail_contradiction_logits.softmax(dim=1)
# prob_label_is_true = probs[:,1]
# prob_label_is_true



{'input_ids': tensor([[    0,   500,  1949,   213,   449,  2013,   116, 50118,   100,    56,
            10, 21431, 24904,   626,    11,  1824,     4,  1308,   284,     8,
            38,   439,     7,     5,   213,   449,  2013,  1349,   147,    51,
            56,    41, 33638,     8, 28445,  9668,     4,  2041,   137,    94,
            38,  4024,    10,   213,   449,  2013,     8,    38,  2145,    38,
          1705,    17,    27,    90,   269,  2842,     5,  1123, 26965,     8,
         20789,   142,     9,   141,  1359,   127,   124,    16,     6,    53,
           961,  1493,   198,   162,   115,  2842,    24,    95,  2051,    19,
            49, 15145, 18822,     4,    85,   938,    17,    27,    90,    14,
            38,    21,   765,     6,    24,    21,    14,    38,  1705,    17,
            27,    90, 20789,     4,  6233,  1268,  1493,   655,  2984,    42,
           116,  7698,    47,  3068,   213,   449,  7870,   114,    47,    17,
            27,   548,    56,    42,  

In [63]:
ds_theme_encoded['train'][0]

{'input_ids': [0,
  500,
  1949,
  213,
  449,
  2013,
  116,
  50118,
  100,
  56,
  10,
  21431,
  24904,
  626,
  11,
  1824,
  4,
  1308,
  284,
  8,
  38,
  439,
  7,
  5,
  213,
  449,
  2013,
  1349,
  147,
  51,
  56,
  41,
  33638,
  8,
  28445,
  9668,
  4,
  2041,
  137,
  94,
  38,
  4024,
  10,
  213,
  449,
  2013,
  8,
  38,
  2145,
  38,
  1705,
  17,
  27,
  90,
  269,
  2842,
  5,
  1123,
  26965,
  8,
  20789,
  142,
  9,
  141,
  1359,
  127,
  124,
  16,
  6,
  53,
  961,
  1493,
  198,
  162,
  115,
  2842,
  24,
  95,
  2051,
  19,
  49,
  15145,
  18822,
  4,
  85,
  938,
  17,
  27,
  90,
  14,
  38,
  21,
  765,
  6,
  24,
  21,
  14,
  38,
  1705,
  17,
  27,
  90,
  20789,
  4,
  6233,
  1268,
  1493,
  655,
  2984,
  42,
  116,
  7698,
  47,
  3068,
  213,
  449,
  7870,
  114,
  47,
  17,
  27,
  548,
  56,
  42,
  1907,
  9,
  3012,
  116,
  2,
  2,
  713,
  1246,
  16,
  864,
  4,
  2],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,


In [64]:
theme_trainer = setup_trainer('theme', dataset=ds_theme_encoded, model=theme_model, tokenizer=theme_tokenizer)

In [65]:
result = theme_trainer.train()
print_summary(result)

theme_trainer.save_model('fine-tuning-final/theme')



  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (2, 3) + inhomogeneous part.