In [1]:
!pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: seaborn
Successfully installed seaborn-0.13.2


In [2]:
!pip install -r requirements.txt

Collecting scikit-learn==1.4.0 (from -r requirements.txt (line 1))
  Downloading scikit_learn-1.4.0-1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting nltk==3.8.1 (from -r requirements.txt (line 2))
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting matplotlib==3.8.3 (from -r requirements.txt (line 3))
  Downloading matplotlib-3.8.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.8 kB)
Collecting bs4==0.0.2 (from -r requirements.txt (line 4))
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting classla (from -r requirements.txt (line 5))
  Downloading classla-2.1.1-py3-none-any.whl.metadata (20 kB)
Collecting ipykernel==6.29.3 (from -r requirements.txt (line 6))
  Downloading ipykernel-6.29.3-py3-none-any.whl.metadata (6.3 kB)
Collecting pandas==2.2.1 (from -r requirements.txt (line 7))
  Downloading pandas-2.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1

In [1]:
import pandas as pd
import pickle
from datasets import Dataset
import torch
from datasets import DatasetDict
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import DataCollatorWithPadding
import evaluate
from transformers import TrainingArguments, Trainer
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
train_df = pd.read_pickle("train_dataset")
test_df = pd.read_pickle("test_dataset")
val_df = pd.read_pickle("val_dataset")

In [6]:
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'test': Dataset.from_pandas(test_df),
    'unsupervised': Dataset.from_pandas(val_df)
})

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 427
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 92
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 92
    })
})

In [8]:
# define preprocess function
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [26]:
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")



In [27]:
tokenized_data = dataset.map(preprocess_function, batched=True, batch_size=100,  load_from_cache_file=True)

Map: 100%|██████████| 427/427 [00:00<00:00, 2661.76 examples/s]
Map: 100%|██████████| 92/92 [00:00<00:00, 1734.53 examples/s]
Map: 100%|██████████| 92/92 [00:00<00:00, 2916.89 examples/s]


In [28]:
tokenized_data['train'][0].keys()

dict_keys(['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])

In [29]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [30]:
accuracy = evaluate.load("accuracy")

In [31]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [32]:
train_df['label'].unique()

array([4, 5, 6, 0, 3, 1, 2], dtype=int32)

In [33]:
model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=7)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
BATCH_SIZE = 8
EPOCHS = 3
training_args = TrainingArguments(
    output_dir="./bert_runs_2",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [35]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [36]:
torch.cuda.empty_cache()

In [37]:
trainer.train()

 33%|███▎      | 54/162 [01:43<02:50,  1.58s/it]
 33%|███▎      | 54/162 [01:52<02:50,  1.58s/it]

{'eval_loss': 1.084995150566101, 'eval_accuracy': 0.6630434782608695, 'eval_runtime': 8.7663, 'eval_samples_per_second': 10.495, 'eval_steps_per_second': 1.369, 'epoch': 1.0}


 67%|██████▋   | 108/162 [03:55<01:00,  1.11s/it]
 67%|██████▋   | 108/162 [04:02<01:00,  1.11s/it]

{'eval_loss': 0.8796781301498413, 'eval_accuracy': 0.75, 'eval_runtime': 7.1179, 'eval_samples_per_second': 12.925, 'eval_steps_per_second': 1.686, 'epoch': 2.0}


100%|██████████| 162/162 [06:20<00:00,  1.81s/it]
100%|██████████| 162/162 [06:28<00:00,  1.81s/it]

{'eval_loss': 0.8326335549354553, 'eval_accuracy': 0.7608695652173914, 'eval_runtime': 7.5144, 'eval_samples_per_second': 12.243, 'eval_steps_per_second': 1.597, 'epoch': 3.0}


100%|██████████| 162/162 [06:30<00:00,  2.41s/it]

{'train_runtime': 390.5799, 'train_samples_per_second': 3.28, 'train_steps_per_second': 0.415, 'train_loss': 1.1228936277789834, 'epoch': 3.0}





TrainOutput(global_step=162, training_loss=1.1228936277789834, metrics={'train_runtime': 390.5799, 'train_samples_per_second': 3.28, 'train_steps_per_second': 0.415, 'total_flos': 50810978521350.0, 'train_loss': 1.1228936277789834, 'epoch': 3.0})

In [38]:
trainer.evaluate()

100%|██████████| 12/12 [00:07<00:00,  1.66it/s]


{'eval_loss': 0.8326335549354553,
 'eval_accuracy': 0.7608695652173914,
 'eval_runtime': 8.0426,
 'eval_samples_per_second': 11.439,
 'eval_steps_per_second': 1.492,
 'epoch': 3.0}

In [39]:
trainer.predict(tokenized_data['unsupervised'])

100%|██████████| 12/12 [00:06<00:00,  1.76it/s]


PredictionOutput(predictions=array([[ 1.67214051e-01, -1.33009374e-01, -1.81199908e+00,
         1.89175606e-02,  2.44136244e-01,  1.70999920e+00,
        -8.51679325e-01],
       [ 8.74340653e-01, -1.24298096e+00, -1.34070504e+00,
         9.01062340e-02,  7.50245810e-01,  4.83256429e-01,
        -1.93972498e-01],
       [-8.24218392e-01, -5.16457140e-01, -1.14895308e+00,
        -6.75278664e-01,  4.27939320e+00, -6.05731726e-01,
        -7.36461639e-01],
       [-3.17297801e-02, -7.38250732e-01, -1.34872317e+00,
         7.40337491e-01, -4.16370153e-01,  1.69975173e+00,
         3.23330909e-01],
       [ 1.03139445e-01, -6.49575531e-01, -2.02592158e+00,
         6.50595725e-02,  1.45530283e+00,  1.32096744e+00,
        -9.62691307e-01],
       [ 1.91293135e-01, -6.71108961e-01, -1.36205482e+00,
        -6.35987878e-01,  3.56109667e+00, -6.62697911e-01,
        -9.05435622e-01],
       [-6.31930351e-01, -3.81327540e-01, -1.31442297e+00,
        -7.21978188e-01,  4.21775818e+00, -7.123

In [20]:
training_args = TrainingArguments(
    output_dir="./bert_runs_2",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [21]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [22]:
torch.cuda.empty_cache()

In [23]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.87489,0.75
2,No log,0.911776,0.717391
3,No log,0.932546,0.717391


TrainOutput(global_step=162, training_loss=0.4996317168812693, metrics={'train_runtime': 79.2057, 'train_samples_per_second': 16.173, 'train_steps_per_second': 2.045, 'total_flos': 52204193546700.0, 'train_loss': 0.4996317168812693, 'epoch': 3.0})

In [24]:
trainer.evaluate()

{'eval_loss': 0.8748903870582581,
 'eval_accuracy': 0.75,
 'eval_runtime': 1.498,
 'eval_samples_per_second': 61.415,
 'eval_steps_per_second': 8.011,
 'epoch': 3.0}

In [25]:
trainer.save_model('./baseline_bert_3_8_base_cased')

In [26]:
trainer.predict(tokenized_data['unsupervised'])

PredictionOutput(predictions=array([[ 0.34760496, -1.5118971 , -1.6656618 ,  0.24111393,  0.25556564,
         2.2733667 , -0.8934638 ],
       [ 1.2248296 , -0.9374818 , -1.2601331 ,  0.6985131 , -0.5656925 ,
         0.4706643 ,  0.56793666],
       [-0.91883755, -0.17711115, -1.5839896 , -1.5359768 ,  4.4879656 ,
        -1.1382575 , -1.2468565 ],
       [ 1.3300622 , -0.83188516, -1.2226257 ,  0.8510202 , -0.732101  ,
         0.30388665,  0.7443369 ],
       [-0.16029456, -1.2254049 , -1.8191245 , -0.27321234,  1.4364688 ,
         2.0361614 , -1.116059  ],
       [-0.30644032, -0.30896565, -2.0040507 , -1.1752982 ,  4.0050015 ,
        -1.08024   , -0.802896  ],
       [-0.7669351 , -0.23882994, -1.6607765 , -1.4821094 ,  4.435495  ,
        -1.1536027 , -1.2831417 ],
       [ 0.47533256, -1.3302677 , -1.2625556 ,  0.5623014 , -0.74744153,
         2.447754  , -0.5985521 ],
       [-1.0198313 , -0.06109704, -1.5908053 , -1.4989396 ,  4.3645525 ,
        -1.097253  , -1.2201146 ],