In [1]:
import pandas as pd
import pickle
from datasets import Dataset
import torch
from datasets import DatasetDict
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import DataCollatorWithPadding
import evaluate
from transformers import TrainingArguments, Trainer
import numpy as np

In [2]:
train_df = pd.read_pickle("train_dataset")
test_df = pd.read_pickle("test_dataset")
val_df = pd.read_pickle("val_dataset")

In [3]:
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'test': Dataset.from_pandas(test_df),
    'unsupervised': Dataset.from_pandas(val_df)
})

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 427
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 92
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 92
    })
})

In [5]:
# define preprocess function
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [6]:
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [7]:
tokenized_data = dataset.map(preprocess_function, batched=True, batch_size=100,  load_from_cache_file=True)

Map:   0%|          | 0/427 [00:00<?, ? examples/s]

Map:   0%|          | 0/92 [00:00<?, ? examples/s]

Map:   0%|          | 0/92 [00:00<?, ? examples/s]

In [8]:
tokenized_data['train'][0].keys()

dict_keys(['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])

In [9]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
accuracy = evaluate.load("accuracy")

In [11]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [12]:
train_df['label'].unique()

array([4, 5, 6, 0, 3, 1, 2])

In [13]:
model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=7)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initi

In [14]:
training_args = TrainingArguments(
    output_dir="./bert_runs_2",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [16]:
torch.cuda.empty_cache()

In [17]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.058416,0.663043
2,No log,0.900002,0.728261
3,No log,0.9019,0.73913


TrainOutput(global_step=321, training_loss=0.9947591526114681, metrics={'train_runtime': 90.7826, 'train_samples_per_second': 14.111, 'train_steps_per_second': 3.536, 'total_flos': 37917444116700.0, 'train_loss': 0.9947591526114681, 'epoch': 3.0})

In [18]:
trainer.evaluate()

{'eval_loss': 0.9000019431114197,
 'eval_accuracy': 0.7282608695652174,
 'eval_runtime': 1.298,
 'eval_samples_per_second': 70.878,
 'eval_steps_per_second': 17.72,
 'epoch': 3.0}

In [19]:
trainer.predict(tokenized_data['unsupervised'])

PredictionOutput(predictions=array([[ 0.7878391 , -1.258767  , -1.5960697 ,  0.36126748,  0.7405478 ,
         1.1216418 , -0.6225681 ],
       [ 1.2508624 , -0.7931418 , -1.1653047 ,  0.72245973, -0.07325275,
         0.42333713,  0.15723583],
       [-0.8321402 , -0.8643484 , -1.0326939 , -1.441635  ,  3.9641166 ,
        -0.8170152 , -1.3922852 ],
       [ 1.182149  , -0.7081512 , -1.0928639 ,  0.7637488 , -0.16759628,
         0.41758573,  0.24975197],
       [ 0.34631094, -1.2349015 , -1.7398167 ,  0.06866408,  1.369632  ,
         1.2471882 , -0.7942375 ],
       [ 0.01018636, -0.79878426, -1.9373285 , -0.6963433 ,  3.3698947 ,
        -0.8620731 , -0.59399843],
       [-0.6739929 , -0.779163  , -1.3461268 , -1.2756752 ,  4.1001415 ,
        -0.87025493, -1.3476105 ],
       [ 0.6565807 , -1.1635976 , -1.0676225 ,  0.4871364 , -0.320235  ,
         1.8921331 , -0.6963069 ],
       [-0.901357  , -0.66050756, -1.2174878 , -1.3868093 ,  4.0716453 ,
        -0.853609  , -1.3524983 ],

In [20]:
training_args = TrainingArguments(
    output_dir="./bert_runs_2",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [21]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [22]:
torch.cuda.empty_cache()

In [23]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.87489,0.75
2,No log,0.911776,0.717391
3,No log,0.932546,0.717391


TrainOutput(global_step=162, training_loss=0.4996317168812693, metrics={'train_runtime': 79.2057, 'train_samples_per_second': 16.173, 'train_steps_per_second': 2.045, 'total_flos': 52204193546700.0, 'train_loss': 0.4996317168812693, 'epoch': 3.0})

In [24]:
trainer.evaluate()

{'eval_loss': 0.8748903870582581,
 'eval_accuracy': 0.75,
 'eval_runtime': 1.498,
 'eval_samples_per_second': 61.415,
 'eval_steps_per_second': 8.011,
 'epoch': 3.0}

In [25]:
# trainer.save_model('./baseline_bert')

In [26]:
trainer.predict(tokenized_data['unsupervised'])

PredictionOutput(predictions=array([[ 0.34760496, -1.5118971 , -1.6656618 ,  0.24111393,  0.25556564,
         2.2733667 , -0.8934638 ],
       [ 1.2248296 , -0.9374818 , -1.2601331 ,  0.6985131 , -0.5656925 ,
         0.4706643 ,  0.56793666],
       [-0.91883755, -0.17711115, -1.5839896 , -1.5359768 ,  4.4879656 ,
        -1.1382575 , -1.2468565 ],
       [ 1.3300622 , -0.83188516, -1.2226257 ,  0.8510202 , -0.732101  ,
         0.30388665,  0.7443369 ],
       [-0.16029456, -1.2254049 , -1.8191245 , -0.27321234,  1.4364688 ,
         2.0361614 , -1.116059  ],
       [-0.30644032, -0.30896565, -2.0040507 , -1.1752982 ,  4.0050015 ,
        -1.08024   , -0.802896  ],
       [-0.7669351 , -0.23882994, -1.6607765 , -1.4821094 ,  4.435495  ,
        -1.1536027 , -1.2831417 ],
       [ 0.47533256, -1.3302677 , -1.2625556 ,  0.5623014 , -0.74744153,
         2.447754  , -0.5985521 ],
       [-1.0198313 , -0.06109704, -1.5908053 , -1.4989396 ,  4.3645525 ,
        -1.097253  , -1.2201146 ],