In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset, DatasetDict
import numpy as np
import evaluate

In [4]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [3]:
from google.colab import files
uploaded = files.upload()

Saving mbti_chunked_clean.csv to mbti_chunked_clean.csv


In [5]:
df = pd.read_csv('mbti_chunked_clean.csv', index_col=0)
df = df[['text', 'label']]
df.head()

Unnamed: 0_level_0,text,label
chunk_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,the pope is infallible this is a catholic dogm...,intj
2,martin said that george floyd was in heaven it...,intj
3,while supporting abortion lol abortion again i...,intj
4,views on predestination exist in the catholic ...,intj
5,a little kitty cat i mean the real dangerousne...,intj


In [6]:
labels = df['label'].unique().tolist()
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

df['label'] = df['label'].map(label2id)

In [7]:
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'test': Dataset.from_pandas(test_df)
})

In [8]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize(batch):
    return tokenizer(batch['text'], truncation=True,
    padding=True)

dataset = dataset.map(tokenize, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/26544 [00:00<?, ? examples/s]

Map:   0%|          | 0/6636 [00:00<?, ? examples/s]

In [9]:
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [11]:
training_args = TrainingArguments(
    output_dir="bert-mbti-output",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_dir="./logs",
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [12]:
trainer.train()

Step,Training Loss
500,2.5434
1000,2.5396
1500,2.5201
2000,2.5261
2500,2.532
3000,2.5142
3500,2.5253
4000,2.5206
4500,2.5175


TrainOutput(global_step=4977, training_loss=2.5258218567127537, metrics={'train_runtime': 7312.58, 'train_samples_per_second': 10.89, 'train_steps_per_second': 0.681, 'total_flos': 2.0954693241667584e+16, 'train_loss': 2.5258218567127537, 'epoch': 3.0})

In [13]:
results = trainer.evaluate()

In [14]:
predictions = trainer.predict(dataset['test'])

pred_labels = np.argmax(predictions.predictions, axis=1)

predicted_labels = [id2label[pred] for pred in pred_labels]

In [15]:
results

{'eval_loss': 2.517169952392578,
 'eval_accuracy': 0.15792646172393007,
 'eval_runtime': 181.9107,
 'eval_samples_per_second': 36.479,
 'eval_steps_per_second': 2.281,
 'epoch': 3.0}

In [16]:
predictions.metrics

{'test_loss': 2.517169952392578,
 'test_accuracy': 0.15792646172393007,
 'test_runtime': 181.6645,
 'test_samples_per_second': 36.529,
 'test_steps_per_second': 2.284}

In [19]:
pred_labels.mean()

np.float64(5.006027727546715)

In [20]:
pred_labels.sum()

np.int64(33220)

In [21]:
predicted_labels[:10]

['infp',
 'infp',
 'infp',
 'infp',
 'infp',
 'infp',
 'infp',
 'infp',
 'infp',
 'infp']

In [24]:
trainer.save_model("bert_mbticlassifier")
tokenizer.save_pretrained("bert_mbti_classifier")

('bert_mbti_classifier/tokenizer_config.json',
 'bert_mbti_classifier/special_tokens_map.json',
 'bert_mbti_classifier/vocab.txt',
 'bert_mbti_classifier/added_tokens.json',
 'bert_mbti_classifier/tokenizer.json')