# Preparing the dataset

Pulling manual data classification and placing it into train-validate datasets

In [1]:
import pandas as pd

df_chi = pd.read_csv('data/chi.csv')

In [2]:
df_chi['selftext'] = df_chi['selftext'].fillna('')
df_chi['text'] = df_chi['title'] + '\n' + df_chi['selftext']

In [3]:
manual_bool = ~df_chi.loc[:, 'negative'].str.startswith('0.')
df_manual = df_chi.loc[manual_bool]

In [4]:
df_manual.loc[:, ['tone', 'emotion', 'theme']].describe()

Unnamed: 0,tone,emotion,theme
count,26,26,26
unique,3,5,5
top,neutral,fear,question
freq,16,19,16


In [5]:
tone_labels = ['negative', 'neutral', 'positive']
emotion_labels = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
theme_labels = ['clinical update', 'community', 'question', 'education', 'advocating', 'dissuading', 'other']

In [6]:
from sklearn.model_selection import train_test_split

def train_test_val_split(df, train_size, val_size, test_size, random_state=42):
    """
    Splits a pandas dataframe into training, validation, and test sets.

    Args:
    - df: pandas dataframe to split.
    - train_size: float between 0 and 1 indicating the proportion of the dataframe to include in the training set.
    - val_size: float between 0 and 1 indicating the proportion of the dataframe to include in the validation set.
    - test_size: float between 0 and 1 indicating the proportion of the dataframe to include in the test set.
    - random_state: int or None, optional (default=42). The seed used by the random number generator.

    Returns:
    - train_df: pandas dataframe containing the training set.
    - val_df: pandas dataframe containing the validation set.
    - test_df: pandas dataframe containing the test set.

    Raises:
    - AssertionError: if the sum of train_size, val_size, and test_size is not equal to 1.
    """

    assert train_size + val_size + test_size == 1, "Train, validation, and test sizes must add up to 1."
    
    # Split the dataframe into training and test sets
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)
    
    # Calculate the size of the validation set relative to the original dataframe
    val_ratio = val_size / (1 - test_size)
    
    # Split the training set into training and validation sets
    train_df, val_df = train_test_split(train_df, test_size=val_ratio, random_state=random_state)
    
    return train_df, val_df, test_df

In [7]:
cols = ['id', 'created_utc', 'text', 'tone', 'emotion', 'theme']
train_df, val_df, test_df = train_test_val_split(df_manual.loc[:, cols], 0.8, 0.1, 0.1)

In [8]:
manual_path_train = "data/fine-tune/chi_train.jsonl"
manual_path_validate = "data/fine-tune/chi_validate.jsonl"
manual_path_test = "data/fine-tune/chi_test.jsonl"

train_df.to_json(manual_path_train, orient="records", lines=True)
val_df.to_json(manual_path_validate, orient="records", lines=True)
test_df.to_json(manual_path_test, orient="records", lines=True)

In [9]:
from datasets import load_dataset

data_files = {
    'train': manual_path_train,
    'validate': manual_path_validate,
    'test': manual_path_test
}

ds = load_dataset("json", data_files=data_files)

Downloading and preparing dataset json/default to /Users/stefan/.cache/huggingface/datasets/json/default-51b231f7ef3f4635/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validate split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /Users/stefan/.cache/huggingface/datasets/json/default-51b231f7ef3f4635/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'created_utc', 'text', 'tone', 'emotion', 'theme'],
        num_rows: 20
    })
    validate: Dataset({
        features: ['id', 'created_utc', 'text', 'tone', 'emotion', 'theme'],
        num_rows: 3
    })
    test: Dataset({
        features: ['id', 'created_utc', 'text', 'tone', 'emotion', 'theme'],
        num_rows: 3
    })
})

In [17]:
ds['train'][0]

{'id': '13vs887',
 'created_utc': '5/30/23 14:44',
 'text': 'Ride go kart?\nI had a spinal fusion done in 2010. My family and I went to the go kart track where they had an arcade and amusement rides. Year before last I drove a go kart and I remember I couldn’t really touch the gas pedal and bend because of how straight my back is, but everyone else around me could touch it just fine with their knees bent. It wasn’t that I was short, it was that I couldn’t bend. Has anyone else ever experienced this? Should you ride go karts if you’ve had this type of surgery?',
 'tone': 'neutral',
 'emotion': 'surprise',
 'theme': 'question'}

In [106]:
from datasets import Dataset, ClassLabel

theme_labels = ['clinical update', 'community', 'question', 'education', 'advocating', 'dissuading', 'other']
num_labels = len(theme_labels)
class_labels = ClassLabel(num_classes=num_labels, names=theme_labels)
template="This example is {}."

def preprocess_func(row):
    text = row['text']
    themes = row['theme']
    return {'premise': text, 'hypothesis': [template.format(theme) for theme in themes]}

def isolate_dataset(ds: Dataset, feature: str):
    cols = ds.column_names['train']
    col_keep = {'text', feature}
    
    ds_filter = ds.remove_columns(col_keep.symmetric_difference(cols))
    ds_filter = ds_filter.map(preprocess_func, batched=True, remove_columns=['text'])
    ds_filter = ds_filter.cast_column(feature, class_labels)
    ds_filter = ds_filter.rename_column(feature, 'label')

    return ds_filter

feature = "theme"
ds_theme = isolate_dataset(ds, feature)
ds_theme

Loading cached processed dataset at /Users/stefan/.cache/huggingface/datasets/json/default-51b231f7ef3f4635/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-84485d320d408117.arrow
Loading cached processed dataset at /Users/stefan/.cache/huggingface/datasets/json/default-51b231f7ef3f4635/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-b7a755d0ff6a290d.arrow
Loading cached processed dataset at /Users/stefan/.cache/huggingface/datasets/json/default-51b231f7ef3f4635/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-191d3ab6eea49650.arrow
Loading cached processed dataset at /Users/stefan/.cache/huggingface/datasets/json/default-51b231f7ef3f4635/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-8d210e1044049960.arrow
Loading cached processed dataset at /Users/stefan/.cache/huggingface/datasets/json/default-51b231f7ef3f4635/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f

DatasetDict({
    train: Dataset({
        features: ['label', 'premise', 'hypothesis'],
        num_rows: 20
    })
    validate: Dataset({
        features: ['label', 'premise', 'hypothesis'],
        num_rows: 3
    })
    test: Dataset({
        features: ['label', 'premise', 'hypothesis'],
        num_rows: 3
    })
})

In [107]:
print(ds_theme['train'].features['theme'])
print(ds_theme['train'][0])

KeyError: 'theme'

In [108]:
from transformers import AutoTokenizer
import os

os.environ["TOKENIZERS_PARALLELISM"]="True"

model_checkpoint = "facebook/bart-large-mnli"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

def preprocess_func(row):
    return tokenizer(row['premise'], row['hypothesis'], truncation=True, padding=True)

encoded_dataset = ds_theme.map(preprocess_func, batched=True)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [109]:
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'premise', 'hypothesis', 'input_ids', 'attention_mask'],
        num_rows: 20
    })
    validate: Dataset({
        features: ['label', 'premise', 'hypothesis', 'input_ids', 'attention_mask'],
        num_rows: 3
    })
    test: Dataset({
        features: ['label', 'premise', 'hypothesis', 'input_ids', 'attention_mask'],
        num_rows: 3
    })
})

In [110]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate

metric = evaluate.load('accuracy')

model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"{model_name}-finetuned-{feature}",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    # per_device_train_batch_size=batch_size,
    # per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    use_mps_device=True
)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset['validate'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [111]:
trainer.train()



  0%|          | 0/15 [00:00<?, ?it/s]

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/1 [00:00<?, ?it/s]

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (2, 3) + inhomogeneous part.

## Setup model and trainer

In [57]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import torch

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

def init_model(model_path: str):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    config = AutoConfig.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    return (tokenizer, config, model)

In [46]:
import numpy as np
import evaluate

metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [47]:
def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")

In [48]:
from transformers import Trainer, TrainingArguments, logging
from datasets import Dataset

logging.set_verbosity_error()

def setup_trainer(name: str, dataset: Dataset, model, tokenizer):
    logging_steps = len(dataset['train'])
    model_name = f"fine-tuning-chkp/{name}"

    training_args = TrainingArguments(
        output_dir=model_name,
        num_train_epochs=2,
        learning_rate=2e-5,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        disable_tqdm=False,
        logging_steps=logging_steps,
        log_level="error",
        use_mps_device=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['validate'],
        compute_metrics=compute_metrics,
        tokenizer=tokenizer
    )

    return trainer

# Theme fine-tuning

In [53]:
feature = 'theme'
cols = ds.column_names['train']
col_keep = {'text', feature}

ds_theme = ds.remove_columns(col_keep.symmetric_difference(cols))
ds_theme = ds_theme.rename_column(feature, 'label')

In [60]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import random


theme_tokenizer, theme_config, theme_model = init_model("facebook/bart-large-mnli")
# Linear(in_features=1024, out_features=3, bias=True)
# {0: 'contradiction', 1: 'neutral', 2: 'entailment'}

theme_labels = ['clinical update', 'community', 'question', 'education', 'advocating', 'dissuading', 'other']
num_labels = len(theme_labels)
template="This example is {}."

def create_input_sequence(sample):
    text = sample['text']
    label = sample['label'][0]
    contradiction_labels = theme_labels[:]
    label_idx = contradiction_labels.index(label)
    contradiction_labels.pop(label_idx)

    encoded_sequence = theme_tokenizer(
        text,
        [template.format(label)],
        # max_length=512,
        # padding='max_length', 
        truncation=True, 
        return_tensors='pt'
    )
    encoded_sequence['labels'] = [2]
    encoded_sequence['input_sentence'] = theme_tokenizer.batch_decode(encoded_sequence.input_ids)
    return encoded_sequence

ds_theme_encoded = ds_theme.map(
    create_input_sequence, 
    batched=True, 
    batch_size=1,
    remove_columns=["label", "text"]
)

ds_theme_encoded


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'input_sentence'],
        num_rows: 20
    })
    validate: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'input_sentence'],
        num_rows: 3
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'input_sentence'],
        num_rows: 3
    })
})

In [61]:
ds_theme_encoded['train'][0]

{'input_ids': [0,
  500,
  1949,
  213,
  449,
  2013,
  116,
  50118,
  100,
  56,
  10,
  21431,
  24904,
  626,
  11,
  1824,
  4,
  1308,
  284,
  8,
  38,
  439,
  7,
  5,
  213,
  449,
  2013,
  1349,
  147,
  51,
  56,
  41,
  33638,
  8,
  28445,
  9668,
  4,
  2041,
  137,
  94,
  38,
  4024,
  10,
  213,
  449,
  2013,
  8,
  38,
  2145,
  38,
  1705,
  17,
  27,
  90,
  269,
  2842,
  5,
  1123,
  26965,
  8,
  20789,
  142,
  9,
  141,
  1359,
  127,
  124,
  16,
  6,
  53,
  961,
  1493,
  198,
  162,
  115,
  2842,
  24,
  95,
  2051,
  19,
  49,
  15145,
  18822,
  4,
  85,
  938,
  17,
  27,
  90,
  14,
  38,
  21,
  765,
  6,
  24,
  21,
  14,
  38,
  1705,
  17,
  27,
  90,
  20789,
  4,
  6233,
  1268,
  1493,
  655,
  2984,
  42,
  116,
  7698,
  47,
  3068,
  213,
  449,
  7870,
  114,
  47,
  17,
  27,
  548,
  56,
  42,
  1907,
  9,
  3012,
  116,
  2,
  2,
  713,
  1246,
  16,
  864,
  4,
  2],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,


In [62]:
premise = ds_theme['train'][0]['text']
template= "This example is {}."
hypothesis = template.format(ds_theme['train'][0]['label'])

# run through model pre-trained on MNLI
x = theme_tokenizer(premise, hypothesis, 
                           truncation_strategy='only_first',
        return_tensors='pt')
x
# logits = theme_model(x.to(device))[0]

# # we throw away "neutral" (dim 1) and take the probability of
# # "entailment" (2) as the probability of the label being true 
# entail_contradiction_logits = logits[:,[0,2]]
# probs = entail_contradiction_logits.softmax(dim=1)
# prob_label_is_true = probs[:,1]
# prob_label_is_true



{'input_ids': tensor([[    0,   500,  1949,   213,   449,  2013,   116, 50118,   100,    56,
            10, 21431, 24904,   626,    11,  1824,     4,  1308,   284,     8,
            38,   439,     7,     5,   213,   449,  2013,  1349,   147,    51,
            56,    41, 33638,     8, 28445,  9668,     4,  2041,   137,    94,
            38,  4024,    10,   213,   449,  2013,     8,    38,  2145,    38,
          1705,    17,    27,    90,   269,  2842,     5,  1123, 26965,     8,
         20789,   142,     9,   141,  1359,   127,   124,    16,     6,    53,
           961,  1493,   198,   162,   115,  2842,    24,    95,  2051,    19,
            49, 15145, 18822,     4,    85,   938,    17,    27,    90,    14,
            38,    21,   765,     6,    24,    21,    14,    38,  1705,    17,
            27,    90, 20789,     4,  6233,  1268,  1493,   655,  2984,    42,
           116,  7698,    47,  3068,   213,   449,  7870,   114,    47,    17,
            27,   548,    56,    42,  

In [63]:
ds_theme_encoded['train'][0]

{'input_ids': [0,
  500,
  1949,
  213,
  449,
  2013,
  116,
  50118,
  100,
  56,
  10,
  21431,
  24904,
  626,
  11,
  1824,
  4,
  1308,
  284,
  8,
  38,
  439,
  7,
  5,
  213,
  449,
  2013,
  1349,
  147,
  51,
  56,
  41,
  33638,
  8,
  28445,
  9668,
  4,
  2041,
  137,
  94,
  38,
  4024,
  10,
  213,
  449,
  2013,
  8,
  38,
  2145,
  38,
  1705,
  17,
  27,
  90,
  269,
  2842,
  5,
  1123,
  26965,
  8,
  20789,
  142,
  9,
  141,
  1359,
  127,
  124,
  16,
  6,
  53,
  961,
  1493,
  198,
  162,
  115,
  2842,
  24,
  95,
  2051,
  19,
  49,
  15145,
  18822,
  4,
  85,
  938,
  17,
  27,
  90,
  14,
  38,
  21,
  765,
  6,
  24,
  21,
  14,
  38,
  1705,
  17,
  27,
  90,
  20789,
  4,
  6233,
  1268,
  1493,
  655,
  2984,
  42,
  116,
  7698,
  47,
  3068,
  213,
  449,
  7870,
  114,
  47,
  17,
  27,
  548,
  56,
  42,
  1907,
  9,
  3012,
  116,
  2,
  2,
  713,
  1246,
  16,
  864,
  4,
  2],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,


In [64]:
theme_trainer = setup_trainer('theme', dataset=ds_theme_encoded, model=theme_model, tokenizer=theme_tokenizer)

In [65]:
result = theme_trainer.train()
print_summary(result)

theme_trainer.save_model('fine-tuning-final/theme')



  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (2, 3) + inhomogeneous part.