# A full supervised fine-tuning using local data

## Load dataset

In [40]:
from datasets import load_dataset
import pandas as pd


## Split the dataset into train/validation/test

In [55]:
df = pd.read_csv("./output.csv")
# shuffle the dataset
df = df.sample(frac=1).reset_index(drop=True)
train_ratio, validation_ratio, test_raio = 0.8, 0.1, 0.1
total_size = len(df)
train_df = df[: int(total_size * train_ratio)]
validation_df = df[int(total_size * train_ratio) : int(total_size * (train_ratio + validation_ratio))]
test_df = df[int(total_size * (train_ratio + validation_ratio)) :]

train_df = pd.DataFrame(train_df)
validation_df = pd.DataFrame(validation_df)
test_df = pd.DataFrame(test_df)

train_df.to_csv("./data/train.csv", index=False)
validation_df.to_csv("./data/validation.csv", index=False)
test_df.to_csv("./data/test.csv", index=False)


raw_dataset = load_dataset("csv", data_files={"train": "./data/train.csv", "validation": "./data/validation.csv", "test": "./data/test.csv"})
raw_dataset


Downloading and preparing dataset csv/default to /tmp/jinh/huggingface/datasets/csv/default-6e4f0556d6300670/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /tmp/jinh/huggingface/datasets/csv/default-6e4f0556d6300670/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 38469
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 4809
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 4809
    })
})

## Another option is to use train_test_split for Dataset

In [41]:
_raw_dataset = load_dataset("csv", data_files="./output.csv")
print(_raw_dataset)
raw_dataset = _raw_dataset['train'].train_test_split(test_size=0.2, seed=42)
raw_dataset["train"] = raw_dataset.pop("train")
raw_dataset['validation'] = raw_dataset.pop("test")
_ds = raw_dataset['validation'].train_test_split(test_size=0.5, seed=42)
raw_dataset['test'] = _ds.pop("train")
raw_dataset['validation'] = _ds.pop("test")
print(raw_dataset)

Downloading and preparing dataset csv/default to /tmp/jinh/huggingface/datasets/csv/default-102b9fb8b70ba79c/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /tmp/jinh/huggingface/datasets/csv/default-102b9fb8b70ba79c/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 48087
    })
})
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 38469
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 4809
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 4809
    })
})


In [42]:
from transformers import AutoTokenizer, DataCollatorWithPadding


ckp = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(ckp)

def tokenize_function(data):
    return tokenizer(data["text"], truncation=True)

tokenized_datasets = raw_dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/38469 [00:00<?, ? examples/s]

Map:   0%|          | 0/4809 [00:00<?, ? examples/s]

Map:   0%|          | 0/4809 [00:00<?, ? examples/s]

In [43]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 38469
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4809
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4809
    })
})

In [44]:
# prepare the columns
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names


['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [45]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=64, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=64, collate_fn=data_collator
)
test_dataloader = DataLoader(
    tokenized_datasets["test"], batch_size=64, collate_fn=data_collator
)


In [46]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([64]),
 'input_ids': torch.Size([64, 10]),
 'token_type_ids': torch.Size([64, 10]),
 'attention_mask': torch.Size([64, 10])}

In [47]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(ckp, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.5622, grad_fn=<NllLossBackward0>) torch.Size([64, 2])


In [49]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)




In [50]:
from transformers import get_scheduler

num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)


602


In [51]:
import torch

DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(DEVICE)
print(DEVICE)


cuda


In [52]:
from tqdm.auto import tqdm
# reload the model
model = AutoModelForSequenceClassification.from_pretrained(ckp, num_labels=2)
model.to(DEVICE)
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/602 [00:00<?, ?it/s]

In [53]:
import evaluate

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(DEVICE) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    clf_metrics.add_batch(predictions=predictions, references=batch["labels"])

clf_metrics.compute()


  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 0.2561863173216885, 'f1': 0.0, 'precision': 0.0, 'recall': 0.0}

In [None]:
# model.push_to_hub("bert-base-uncased-poseidon")


In [None]:
from evaluate import evaluator

task_eval = evaluator("text-classification")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
results = task_eval.compute(model_or_pipeline=model, 
                            data=raw_dataset['test'], tokenizer=tokenizer, 
                            label_mapping={"LABEL_0": 0, "LABEL_1": 1},
                            metric=clf_metrics,)    
results


In [None]:
# now go over a couple of different ckps
ckps = ["bert-base-uncased", 
        # "bert-base-cased", 
        # "distilbert-base-cased",
        # "distilbert-base-uncased",
        # "roberta-base",
        # "roberta-large",
        # "albert-base-v2",
        # "albert-large-v2",
        # "albert-xlarge-v2",
        # "albert-xxlarge-v2",
        # "distilbert-base-uncased-finetuned-sst-2-english",
        # "distilbert-base-cased-finetuned-mrpc",
        # "roberta-base-openai-detector",
        ]
ckps_res = {}
for ckp in ckps:
    model = AutoModelForSequenceClassification.from_pretrained(ckp, num_labels=2)
    results = task_eval.compute(model_or_pipeline=model, 
                                data=raw_dataset['test'], tokenizer=tokenizer, 
                                label_mapping={"LABEL_0": 0, "LABEL_1": 1},
                                metric=clf_metrics,)    
    ckps_res[ckp] = results

In [27]:
model.save_pretrained("./test-trainer-sft/")