In [2]:
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments,DataCollatorWithPadding
from datasets import load_dataset, load_metric,Dataset
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
max_input_length = 300

bert_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 20)
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [22]:
f1_score = load_metric("f1"); precision = load_metric("precision"); recall = load_metric("recall")

def tokenize(batch):
    return bert_tokenizer(batch['text'], truncation=True, max_length=max_input_length)

def compute_metrics(eval_pred):
    predictions, label = eval_pred
    predictions = np.argmax(predictions, axis = 1)
    return {"f1" :f1_score.compute(predictions = predictions, references = label, average = 'macro'),
            "precision" : precision.compute(predictions = predictions, references = label, average = 'macro'),
            "recall": recall.compute(predictions = predictions, references = label, average = 'macro')}

def hp_space(trial):
  return {"per_device_train_batch_size": trial.suggest_discrete_uniform("per_device_train_batch_size", 8,32,8)
        ,"learning_rate": trial.suggest_float("learning_rate", 0.00001,0.00005, log = True)
        ,"num_train_epochs": trial.suggest_int("num_train_epochs",1,10)}

def bert_init():
  return BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 20)

In [5]:
import json

In [6]:
id2labels_name = "data_dict/id2label.json"

id2labels = load_dataset('json', data_files = id2labels_name)
id2labels = id2labels['train'][0]

Using custom data configuration default-6e8a664e7b4ced06
Reusing dataset json (/Users/max/.cache/huggingface/datasets/json/default-6e8a664e7b4ced06/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5)
100%|██████████| 1/1 [00:00<00:00, 104.10it/s]


In [7]:
input_test = "data_dict/20news_test.json"
input_train = "data_dict/20news_train.json"

twenty_train = load_dataset('json', data_files = input_train)
twenty_test = load_dataset('json', data_files = input_test)

Using custom data configuration default-26d6ec63c10ffd87
Reusing dataset json (/Users/max/.cache/huggingface/datasets/json/default-26d6ec63c10ffd87/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5)
100%|██████████| 1/1 [00:00<00:00, 282.86it/s]
Using custom data configuration default-5233bec689288346
Reusing dataset json (/Users/max/.cache/huggingface/datasets/json/default-5233bec689288346/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5)
100%|██████████| 1/1 [00:00<00:00, 324.99it/s]


In [8]:
a = twenty_train['train'][0]
b = twenty_test['train'][0]

In [9]:
input_train = Dataset.from_dict(a)
input_test = Dataset.from_dict(b)

In [10]:
input_train = input_train.map(tokenize)
input_test = input_test.map(tokenize)

100%|██████████| 11314/11314 [00:47<00:00, 239.85ex/s]
100%|██████████| 11314/11314 [00:46<00:00, 243.81ex/s]


USING kfold FOR Cross validation

In [11]:
input_test = input_test.remove_columns(['text'])
input_train = input_train.remove_columns(['text'])

In [12]:
from sklearn.model_selection import KFold

fold = KFold(n_splits = 5)
this_set = fold.split(input_train)

In [13]:
import platform

platform.platform()

'macOS-12.4-arm64-arm-64bit'

In [24]:
data_collator = DataCollatorWithPadding(tokenizer=bert_tokenizer, max_length= 300)

#default training arguments 
training_args = TrainingArguments(output_dir="./results", learning_rate=2e-5,  per_device_train_batch_size=16, per_device_eval_batch_size=16, num_train_epochs=5,weight_decay=0.01)
results = []

for i,j in this_set:
    train_set = input_train.select(i)
    eval_set = input_train.select(j)
    
#fine-tune
trainer = Trainer(
                model = bert_model, 
                args = training_args,
                train_dataset= train_set,
                eval_dataset= eval_set,
                tokenizer= bert_tokenizer,
                data_collator=data_collator,
                compute_metrics= compute_metrics)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [1]:
trainer.train(resume_from_checkpoint= True)

NameError: name 'trainer' is not defined

In [32]:
#Obtaining best hyperparameter setting, then finetuning based on the number of files.

eval_results = trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2262
  Batch size = 16


In [None]:
eval_results