In [1]:
import pandas as pd

# Sub-directory name (under ../../data/models/) that receives the checkpoints.
finetuned_dirname = "bert-base-finetuned-liarpantsfire"
# Name of the CSV column that assigns each row to a split.
tvt_set = "tvt2_3"

# Only the statement text, the split column and the label are used downstream.
selected_columns = ['statement', tvt_set, 'label']
data = pd.read_csv(
    "../../data/processed/liarpantsfire_dataset.csv",
    lineterminator="\n",
)[selected_columns]

print(data.shape)
data.head()

(12791, 3)


Unnamed: 0,statement,tvt2_3,label
0,Says the Annies List political group supports ...,training,false
1,When did the decline of coal start? It started...,training,half-true
2,"Hillary Clinton agrees with John McCain ""by vo...",validation,mostly-true
3,Health care reform legislation is likely to ma...,training,false
4,The economic turnaround started at the end of ...,training,half-true


In [2]:
import torch

class CustomTextDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs raw texts with their labels.

    Tokenizer outputs are not computed at construction time; call
    :meth:`tokenize` once to populate them. Until then the tokenizer-derived
    fields of every sample are ``None``.
    """

    def __init__(self, texts, labels):
        """Store the parallel ``texts`` / ``labels`` sequences.

        Args:
            texts: Sized, indexable sequence of input strings.
            labels: Sized, indexable sequence of labels aligned with ``texts``.

        Raises:
            ValueError: If ``texts`` and ``labels`` differ in length.
        """
        if len(texts) != len(labels):
            raise ValueError(
                "texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.labels = labels
        self.texts = texts
        # Filled by tokenize(); None means "not tokenized yet".
        self.attention_mask = None
        self.input_ids = None
        self.token_type_ids = None

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict.

        The dict always has the keys ``"text"``, ``"label"``,
        ``"attention_mask"`` and ``"input_ids"``; the last two are ``None``
        until :meth:`tokenize` has run. ``"token_type_ids"`` is added only
        when the tokenizer supplied it for every text.
        """
        # Compare against None explicitly so the check does not depend on the
        # truthiness of whatever container holds the cached encodings.
        sample = {
            "text": self.texts[idx],
            "label": self.labels[idx],
            "attention_mask": self.attention_mask[idx] if self.attention_mask is not None else None,
            "input_ids": self.input_ids[idx] if self.input_ids is not None else None,
        }
        if self.token_type_ids:
            sample["token_type_ids"] = self.token_type_ids[idx]
        return sample

    def tokenize(self, tokenizer):
        """Encode every text with ``tokenizer`` and cache the results.

        Each text is encoded on its own with ``padding="max_length"`` and
        ``truncation=True``. Calling this again replaces the previous cache.

        Args:
            tokenizer: Callable returning a mapping with at least the keys
                ``"attention_mask"`` and ``"input_ids"``; ``"token_type_ids"``
                is cached as well when present.
        """
        attention_mask = []
        input_ids = []
        token_type_ids = []

        for text in self.texts:
            token = tokenizer(text, padding="max_length", truncation=True)

            attention_mask.append(token['attention_mask'])
            input_ids.append(token['input_ids'])
            if 'token_type_ids' in token:
                token_type_ids.append(token['token_type_ids'])

        # Keep segment ids only if every text produced them; a partial list
        # would no longer be aligned with the examples.
        if len(token_type_ids) != len(input_ids):
            token_type_ids = []

        # Assign at the end so a tokenizer failure leaves the old cache intact.
        self.attention_mask = attention_mask
        self.input_ids = input_ids
        self.token_type_ids = token_type_ids

In [3]:
# Distinct label names in order of first appearance in the CSV; a name's
# position in this list is its integer class id.
labels_str = data['label'].unique().tolist()

# One dict lookup per row instead of a linear list.index() scan per row.
label_to_id = {name: class_id for class_id, name in enumerate(labels_str)}
labels = [label_to_id[name] for name in data['label'].tolist()]

print(len(labels))
labels[:10]

12791


[0, 1, 2, 0, 1, 3, 4, 1, 1, 2]

In [4]:
def _build_split(split_name):
    """Return a CustomTextDataset holding the rows whose split column equals ``split_name``."""
    in_split = (data[tvt_set] == split_name).to_numpy()
    texts = data.loc[in_split, 'statement'].tolist()
    # `labels` is a plain list aligned with the row order of `data`, so it is
    # filtered by position rather than by DataFrame index label.
    split_labels = [label for label, keep in zip(labels, in_split) if keep]
    return CustomTextDataset(texts, split_labels)


train_dataset = _build_split('training')
# NOTE: despite its name, test_dataset holds the 'validation' rows.
test_dataset = _build_split('validation')
# train_dataset[0]

In [5]:
from transformers import AutoTokenizer

# Load the tokenizer published with the "bert-base-cased" checkpoint; the
# model loaded later in this notebook uses the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [6]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook-level tokenizer.

    Padding to the maximum length and truncation are both enabled.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Pre-compute and cache the tokenizer outputs of both splits (training first).
for split_dataset in (train_dataset, test_dataset):
    split_dataset.tokenize(tokenizer)

In [7]:
# Number of examples per split, one per line: training first, then validation.
print(len(train_dataset), len(test_dataset), sep="\n")

8598
2926


### Fine Tuning

In [8]:
from transformers import AutoModelForSequenceClassification

# The classification head needs one output per distinct label. Derive the
# count from the label vocabulary (`labels_str`) instead of hard-coding it,
# so the head always covers every class id produced from that list.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    output_hidden_states=False,
    num_labels=len(labels_str),
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [9]:
from transformers import TrainingArguments

epochs = 10
batch_size = 8

# Batches per epoch = ceil(examples / batch size). Integer ceiling division is
# exact, unlike round(x + 0.49), which under-counts when the fractional part
# of x is below 0.01 (possible with larger batch sizes).
# NOTE(review): assumes a single device and no gradient accumulation - confirm
# if the run configuration changes.
steps_per_epoch = -(-len(train_dataset) // batch_size)

# Checkpoint interval equal to the total number of steps, i.e. a single
# checkpoint at the very end of training.
save_steps = steps_per_epoch * epochs
# Alternative kept from the original notebook (an interval larger than the run):
# save_steps = 1_000_000

training_args = TrainingArguments(
    output_dir=f"../../data/models/{finetuned_dirname}",
    num_train_epochs=epochs,
    save_steps=save_steps,
    logging_steps=1000,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size
)

print(f"Save Steps : {save_steps}")

Save Steps : 10750


In [10]:
import numpy as np
from datasets import load_metric

# Load the "accuracy" metric object through datasets.load_metric.
# NOTE(review): load_metric is reported as deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package - confirm against the
# version pinned for this project.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Accuracy is computed directly with numpy, so the callback does not rely on
    a separately loaded metric object.

    Args:
        eval_pred: Pair ``(logits, labels)``. The class axis of ``logits`` is
            its last axis; ``labels`` holds the reference class ids.

    Returns:
        dict: ``{"accuracy": <float>}``, the fraction of predictions equal to
        their reference label.

    Raises:
        ValueError: If the predicted classes and ``labels`` differ in shape.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    references = np.asarray(labels)
    # Guard against silent broadcasting in the element-wise comparison below.
    if predictions.shape != references.shape:
        raise ValueError(
            f"predictions have shape {predictions.shape} "
            f"but labels have shape {references.shape}"
        )
    return {"accuracy": float(np.mean(predictions == references))}

In [11]:
from transformers import Trainer

# Bundle the model, the hyper-parameters, both splits and the metric callback.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,  # built from the rows marked 'validation'
    train_dataset=train_dataset,
)

In [12]:
import time

# Wall-clock timing of the complete fine-tuning run.
start = time.time()
trainer.train()
elapsed_seconds = round(time.time() - start)

print(f"Execution Time : {elapsed_seconds} seconds")

***** Running training *****
  Num examples = 8598
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10750


Step,Training Loss
1000,1.7581
2000,1.6904
3000,1.4808
4000,1.0776
5000,0.679
6000,0.393
7000,0.2634
8000,0.1746
9000,0.1004
10000,0.0628


Saving model checkpoint to ../../data/models/bert-base-finetuned-liarpantsfire/checkpoint-10750
Configuration saved in ../../data/models/bert-base-finetuned-liarpantsfire/checkpoint-10750/config.json
Model weights saved in ../../data/models/bert-base-finetuned-liarpantsfire/checkpoint-10750/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




Execution Time : 3461 seconds


In [13]:
# Score the fine-tuned model on the eval_dataset given to the Trainer (the
# rows marked 'validation'); the returned metrics dict is the cell output.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2926
  Batch size = 8


{'eval_loss': 6.442332744598389,
 'eval_accuracy': 0.24948735475051265,
 'eval_runtime': 35.8542,
 'eval_samples_per_second': 81.608,
 'eval_steps_per_second': 10.208,
 'epoch': 10.0}