BERT for text classification

In [None]:
! pip install datasets transformers[torch] evaluate

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/521.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.6/521.2 kB[0m [31m5.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from da

# Getting started

Downloading dataset

In [None]:
from datasets import load_dataset

# Load the corpus and rename its columns to "text" and "labels", the names the
# tokenization and training cells of this notebook read.
dataset = (
    load_dataset("gsarti/itacola")
    .rename_column("sentence", "text")
    .rename_column("acceptability", "labels")
)

Downloading builder script:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.11k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.05k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/146k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7801 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/975 [00:00<?, ? examples/s]

Downloading model and tokenizer

In [None]:
from transformers import AutoModel, AutoTokenizer
import torch

# Hugging Face checkpoint identifier shared by the tokenizer and the model.
model_name = "dbmdz/bert-base-italian-xxl-cased"

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the bare encoder, then move its weights to the selected device.
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/235k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

# Encoding dataset
Start by tokenizing the sentences.

In [None]:
def tokenize(batch):
    """Tokenize the ``"text"`` entries of a batch with the global tokenizer.

    Padding and truncation are both enabled, so all sequences produced for
    one batch share the same length.
    """
    texts = batch["text"]
    return tokenizer(texts, padding=True, truncation=True)

# batch_size=None passes each split to `tokenize` as one single batch.
dataset_encoded = dataset.map(
    tokenize,
    batched=True,
    batch_size=None,
)


Map:   0%|          | 0/7801 [00:00<?, ? examples/s]

Map:   0%|          | 0/975 [00:00<?, ? examples/s]

Then encode each sentence with the downloaded BERT model, averaging the hidden states of the non-padding tokens.

In [None]:
import numpy as np
def forward_pass(batch):
    """Add a mean-pooled encoder embedding to each example of a tokenized batch.

    The batch is run through the global ``model`` without gradient tracking,
    and the last hidden states are averaged over the positions whose
    attention-mask entry is non-zero. The pooled vectors are stored under
    ``"hidden_state"`` and the same (mutated) batch is returned.
    """
    attention = batch["attention_mask"]
    ids_tensor = torch.tensor(batch["input_ids"]).to(device)
    mask_tensor = torch.tensor(attention).to(device)

    with torch.no_grad():
        token_states = model(ids_tensor, mask_tensor).last_hidden_state
        token_states = token_states.cpu().numpy()

    # numpy masked arrays ignore entries whose mask is True, so flag the
    # padding positions and replicate that flag across the hidden dimension.
    hidden_size = token_states.shape[-1]
    is_padding = ~np.array(attention).astype(bool)
    hidden_mask = np.repeat(is_padding[:, :, np.newaxis], hidden_size, axis=2)

    pooled = np.ma.array(token_states, mask=hidden_mask).mean(axis=1)
    batch["hidden_state"] = pooled.data
    return batch


# Run the encoder over both splits in mini-batches of 128 examples.
dataset_encoded = dataset_encoded.map(
    forward_pass,
    batched=True,
    batch_size=128,
)


Map:   0%|          | 0/7801 [00:00<?, ? examples/s]

Map:   0%|          | 0/975 [00:00<?, ? examples/s]

# Option 1: Train a classification model on top of frozen BERT embeddings

In [None]:
import numpy as np
# Materialize the pooled embeddings (features) and the labels (targets) of
# each split as numpy arrays.
X_train, y_train = (
    np.array(dataset_encoded["train"][column])
    for column in ("hidden_state", "labels")
)
X_valid, y_valid = (
    np.array(dataset_encoded["test"][column])
    for column in ("hidden_state", "labels")
)
X_train.shape, X_valid.shape

((7801, 768), (975, 768))

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import xgboost as xgb
from sklearn.datasets import load_iris

# Gradient-boosted binary classifier fitted on the pooled BERT features.
xgb_classifier = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    use_label_encoder=False,
)
xgb_classifier.fit(X_train, y_train)

# Predict the held-out split and summarize precision / recall / F1 per class.
y_pred = xgb_classifier.predict(X_valid)
report = classification_report(y_valid, y_pred)
print("Classification Report:\n", report)


Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.18      0.29       154
           1       0.87      0.98      0.92       821

    accuracy                           0.86       975
   macro avg       0.77      0.58      0.60       975
weighted avg       0.84      0.86      0.82       975



# Option 2: Fine-tune BERT together with a classification head

Using the Hugging Face `AutoModelForSequenceClassification` wrapper

In [None]:
from transformers import AutoModelForSequenceClassification
# Two output classes. Note that this rebinds `model`, replacing the bare
# encoder loaded earlier in the notebook.
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-italian-xxl-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Defining metrics

In [None]:
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(pred):
    """Return accuracy and weighted F1 for a set of evaluation predictions.

    ``pred`` must expose ``predictions`` (per-class scores) and ``label_ids``
    (gold labels); the predicted class is the arg-max over the last axis.
    """
    gold = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted, average="weighted"),
    }

Defining training arguments and hyperparameters

In [None]:
from transformers import Trainer, TrainingArguments
# `metric_for_best_model` only takes effect together with
# `load_best_model_at_end`, which in turn requires checkpoints to be saved on
# the same schedule as evaluation (here: once per epoch). With both set, the
# checkpoint with the highest evaluation F1 is restored when training ends.
training_args = TrainingArguments(
    output_dir="results",
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    disable_tqdm=False,
)


Training

In [None]:
from transformers import Trainer

# Bundle the model, hyperparameters, data splits and metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model; the returned TrainOutput is shown as the cell result.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4368,0.338705,0.874872,0.849997
2,0.3029,0.372773,0.892308,0.880938
3,0.1868,0.522931,0.892308,0.876771
4,0.1269,0.573438,0.90359,0.894051
5,0.0677,0.622536,0.904615,0.896588


TrainOutput(global_step=4880, training_loss=0.21906336526401707, metrics={'train_runtime': 274.6008, 'train_samples_per_second': 142.043, 'train_steps_per_second': 17.771, 'total_flos': 601326955917000.0, 'train_loss': 0.21906336526401707, 'epoch': 5.0})

In [None]:
from sklearn.metrics import classification_report
# Take the gold labels from the prediction output itself, so this cell does not
# depend on `y_valid` from the Option 1 section having been executed.
test_output = trainer.predict(dataset_encoded["test"])
# Predicted class = index of the largest score for each example.
test_preds = test_output.predictions.argmax(axis=1)
print(classification_report(test_output.label_ids, test_preds))

              precision    recall  f1-score   support

           0       0.80      0.53      0.64       154
           1       0.92      0.97      0.95       821

    accuracy                           0.90       975
   macro avg       0.86      0.75      0.79       975
weighted avg       0.90      0.90      0.90       975

