In [8]:
import pandas as pd
import pickle
from datasets import Dataset
import torch
from datasets import DatasetDict
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from transformers import DataCollatorWithPadding
import evaluate
from transformers import TrainingArguments, Trainer
import numpy as np

In [9]:
train_df = pd.read_pickle("train_dataset")
test_df = pd.read_pickle("test_dataset")
val_df = pd.read_pickle("val_dataset")

In [10]:
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'test': Dataset.from_pandas(test_df),
    'unsupervised': Dataset.from_pandas(val_df)
})

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 427
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 92
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 92
    })
})

In [12]:
# define preprocess function
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [13]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

In [14]:
tokenized_data = dataset.map(preprocess_function, batched=True, batch_size=100,  load_from_cache_file=True)

Map:   0%|          | 0/427 [00:00<?, ? examples/s]

Map:   0%|          | 0/92 [00:00<?, ? examples/s]

Map:   0%|          | 0/92 [00:00<?, ? examples/s]

In [15]:
tokenized_data['train'][0].keys()

dict_keys(['text', 'label', 'input_ids', 'attention_mask'])

In [16]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
from datasets import load_metric

In [18]:
accuracy = evaluate.load("accuracy")

In [31]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [20]:
train_df['label'].unique()

array([4, 5, 6, 0, 3, 1, 2])

In [21]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=7)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias', 'classifier.

In [32]:
training_args = TrainingArguments(
    output_dir="./distilbert_runs_2",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [33]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [34]:
torch.cuda.empty_cache()

In [35]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.883033,0.728261
2,No log,0.914399,0.771739
3,No log,0.900245,0.695652
4,No log,1.013257,0.73913
5,0.332200,1.045408,0.75
6,0.332200,1.0023,0.706522
7,0.332200,1.060403,0.728261
8,0.332200,1.090765,0.706522
9,0.332200,1.086466,0.728261
10,0.045100,1.08844,0.73913


TrainOutput(global_step=1070, training_loss=0.1775391665574546, metrics={'train_runtime': 650.9264, 'train_samples_per_second': 6.56, 'train_steps_per_second': 1.644, 'total_flos': 62980409671032.0, 'train_loss': 0.1775391665574546, 'epoch': 10.0})

In [36]:
trainer.evaluate()

{'eval_loss': 0.8830333948135376,
 'eval_accuracy': 0.7282608695652174,
 'eval_runtime': 1.8903,
 'eval_samples_per_second': 48.669,
 'eval_steps_per_second': 12.167,
 'epoch': 10.0}

In [37]:
trainer.predict(tokenized_data['unsupervised'])

PredictionOutput(predictions=array([[ 8.19581628e-01, -1.10499191e+00, -1.73721528e+00,
        -9.95667756e-01,  1.13719308e+00,  6.58469141e-01,
        -7.98225164e-01],
       [ 7.65956998e-01, -1.18076384e+00, -1.61069608e+00,
         6.05445921e-01, -1.44119704e+00,  7.81081438e-01,
         1.90633252e-01],
       [-1.71018815e+00, -8.98615241e-01, -1.92363560e+00,
        -1.63533616e+00,  5.23409653e+00, -1.00817466e+00,
        -1.26077616e+00],
       [ 6.84688449e-01, -1.39234877e+00, -1.78660166e+00,
         8.42889428e-01, -1.65783477e+00, -1.86614558e-01,
         1.42182934e+00],
       [-1.38735676e+00, -1.08925402e+00, -2.07891822e+00,
        -1.56481481e+00,  4.58104038e+00, -9.55543935e-01,
        -1.17164946e+00],
       [-1.04891586e+00, -8.47654760e-01, -1.94182754e+00,
        -1.63626528e+00,  4.91216326e+00, -1.40147328e+00,
        -1.25044954e+00],
       [-1.63912427e+00, -9.02970791e-01, -1.94918656e+00,
        -1.63139808e+00,  5.24019098e+00, -1.221

In [42]:
training_args = TrainingArguments(
    output_dir="./distil_bert_runs_2",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [43]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [44]:
torch.cuda.empty_cache()

In [45]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.026504,0.76087
2,No log,1.019721,0.728261
3,No log,1.171335,0.73913
4,No log,1.159282,0.75
5,No log,1.117578,0.76087
6,No log,1.163649,0.76087
7,No log,1.151528,0.771739
8,No log,1.152599,0.771739
9,No log,1.169202,0.76087
10,0.032100,1.17549,0.771739


TrainOutput(global_step=540, training_loss=0.03005291309069704, metrics={'train_runtime': 1088.7276, 'train_samples_per_second': 3.922, 'train_steps_per_second': 0.496, 'total_flos': 87477678680640.0, 'train_loss': 0.03005291309069704, 'epoch': 10.0})

In [46]:
trainer.evaluate()

{'eval_loss': 1.0197209119796753,
 'eval_accuracy': 0.7282608695652174,
 'eval_runtime': 2.821,
 'eval_samples_per_second': 32.613,
 'eval_steps_per_second': 4.254,
 'epoch': 10.0}

In [47]:
# trainer.save_model('./baseline_bert')

In [48]:
trainer.predict(tokenized_data['unsupervised'])

PredictionOutput(predictions=array([[-1.1097195 , -0.05226975, -2.020295  , -2.8036165 ,  4.4589553 ,
        -1.2261573 , -2.2038498 ],
       [ 2.8640869 , -1.3433659 , -1.986861  , -1.214355  , -1.8009548 ,
        -1.4943225 ,  1.4023286 ],
       [-2.5152786 , -0.41850334, -2.677857  , -2.5062737 ,  6.405673  ,
        -1.8712572 , -2.1137247 ],
       [-0.64856005, -1.929003  , -2.5537186 , -0.22607194, -1.7464392 ,
        -1.7971905 ,  4.531675  ],
       [-2.3273907 , -0.49252874, -2.6263008 , -2.4554267 ,  5.9628143 ,
        -1.8877883 , -2.0911787 ],
       [-1.9706107 , -0.22839545, -2.6122904 , -2.5579965 ,  6.1359496 ,
        -2.2168515 , -2.2352462 ],
       [-2.428191  , -0.36177745, -2.6605713 , -2.5255802 ,  6.3705664 ,
        -2.175966  , -1.9930347 ],
       [ 4.911891  , -1.147173  , -2.4363792 , -1.5436752 , -1.322907  ,
        -0.42364118, -1.7679462 ],
       [-2.5291698 , -0.5835171 , -2.748518  , -2.475103  ,  6.456353  ,
        -1.9081637 , -2.065145  ],

In [49]:
training_args = TrainingArguments(
    output_dir="./distil_bert_runs_2",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    num_train_epochs=50,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


torch.cuda.empty_cache()

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.016338,0.771739
2,No log,1.055798,0.75
3,No log,1.07952,0.75
4,No log,1.153642,0.76087
5,No log,1.12583,0.771739
6,No log,1.165309,0.76087
7,No log,1.165533,0.76087
8,No log,1.236823,0.76087
9,No log,1.255918,0.76087
10,No log,1.263773,0.76087


TrainOutput(global_step=1350, training_loss=0.003303463641140196, metrics={'train_runtime': 7629.5758, 'train_samples_per_second': 2.798, 'train_steps_per_second': 0.177, 'total_flos': 567836946000816.0, 'train_loss': 0.003303463641140196, 'epoch': 50.0})

In [50]:
trainer.evaluate()

{'eval_loss': 1.0163379907608032,
 'eval_accuracy': 0.7717391304347826,
 'eval_runtime': 1.415,
 'eval_samples_per_second': 65.018,
 'eval_steps_per_second': 8.481,
 'epoch': 50.0}

In [51]:
trainer.predict(tokenized_data['unsupervised'])

PredictionOutput(predictions=array([[-1.71418095e+00,  1.45288467e-01, -2.17820764e+00,
        -2.75425887e+00,  4.65305138e+00, -1.10951364e+00,
        -2.50145149e+00],
       [ 3.13558722e+00, -4.94920611e-01, -1.35527885e+00,
        -1.44257116e+00, -1.72042227e+00,  1.74847841e-02,
        -1.69353175e+00],
       [-2.63407540e+00, -6.25762641e-01, -2.86274624e+00,
        -2.61225533e+00,  6.69915581e+00, -1.90969884e+00,
        -2.15960217e+00],
       [ 1.02311730e+00, -2.18394279e+00, -3.18445253e+00,
         3.66851240e-01, -2.55460644e+00, -1.49540055e+00,
         2.78967214e+00],
       [-2.53571868e+00, -6.52096331e-01, -2.78568172e+00,
        -2.61654115e+00,  6.29765511e+00, -1.86655366e+00,
        -2.20532155e+00],
       [-2.16542006e+00, -2.88713604e-01, -2.79513288e+00,
        -2.67595482e+00,  6.46748829e+00, -2.34992743e+00,
        -2.33542991e+00],
       [-2.51889706e+00, -5.16637862e-01, -2.86141968e+00,
        -2.63834548e+00,  6.71627951e+00, -2.273