In [8]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments
from datasets import Dataset
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [9]:
label_mapping = {
    "Thinking at the Margin": 0, 
    "Counterfactual": 1, 
    "General Equilibrium": 2
}
df = pd.read_csv("data/econ-concepts/econ-concepts-50.csv")

# label_mapping = {
#     "neutral": 0, 
#     "positive": 1, 
#     "negative": 2
# }
# df = pd.read_csv("data/train.csv")


df['label'] = df['label'].map(label_mapping)
df_train, df_test, = train_test_split(df, stratify=df['label'], test_size=0.1, random_state=0)
df_train, df_val = train_test_split(df_train, stratify=df_train['label'], test_size=0.1, random_state=0)

In [10]:
model_name_or_path = "bert-base-uncased"
model_name = 'bert'
device = 'mps'
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side="left", use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(
            model_name_or_path, num_labels=3, torch_dtype=torch.bfloat16, device_map=device
        )


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
dataset_train = Dataset.from_pandas(df_train)
dataset_val = Dataset.from_pandas(df_val)
dataset_test = Dataset.from_pandas(df_test)

dataset_train = dataset_train.map(lambda e: tokenizer(e['sentence'], truncation=True, padding='max_length', max_length=256), batched=True)
dataset_val = dataset_val.map(lambda e: tokenizer(e['sentence'], truncation=True, padding='max_length', max_length=256), batched=True)
dataset_test = dataset_test.map(lambda e: tokenizer(e['sentence'], truncation=True, padding='max_length' , max_length=256), batched=True)

dataset_train.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
dataset_val.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
dataset_test.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])

Map: 100%|██████████| 121/121 [00:00<00:00, 2576.55 examples/s]
Map: 100%|██████████| 14/14 [00:00<00:00, 2771.13 examples/s]
Map: 100%|██████████| 15/15 [00:00<00:00, 2523.14 examples/s]


In [12]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {'accuracy' : accuracy_score(predictions, labels)}

args = TrainingArguments(
        output_dir = 'output_models/',
        evaluation_strategy = 'epoch',
        save_strategy = 'epoch',
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=2,
        num_train_epochs=5,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
)

trainer = Trainer(
        model=model,                  
        args=args,          
        train_dataset=dataset_train,       
        eval_dataset=dataset_val,      
        compute_metrics=compute_metrics
)



In [13]:
trainer.train()

 20%|██        | 31/155 [00:28<01:38,  1.26it/s]
 20%|██        | 31/155 [00:29<01:38,  1.26it/s]

{'eval_loss': 0.9871651530265808, 'eval_accuracy': 0.42857142857142855, 'eval_runtime': 1.0166, 'eval_samples_per_second': 13.771, 'eval_steps_per_second': 6.885, 'epoch': 1.0}


 40%|████      | 62/155 [01:02<01:13,  1.27it/s]
 40%|████      | 62/155 [01:03<01:13,  1.27it/s]

{'eval_loss': 0.8392857313156128, 'eval_accuracy': 0.5, 'eval_runtime': 0.9596, 'eval_samples_per_second': 14.589, 'eval_steps_per_second': 7.295, 'epoch': 2.0}


 60%|██████    | 93/155 [01:34<00:49,  1.26it/s]
 60%|██████    | 93/155 [01:35<00:49,  1.26it/s]

{'eval_loss': 0.7979910969734192, 'eval_accuracy': 0.5714285714285714, 'eval_runtime': 1.2615, 'eval_samples_per_second': 11.098, 'eval_steps_per_second': 5.549, 'epoch': 3.0}


 80%|████████  | 124/155 [02:13<00:31,  1.02s/it]
 80%|████████  | 124/155 [02:14<00:31,  1.02s/it]

{'eval_loss': 0.7912946343421936, 'eval_accuracy': 0.6428571428571429, 'eval_runtime': 1.0525, 'eval_samples_per_second': 13.302, 'eval_steps_per_second': 6.651, 'epoch': 4.0}


100%|██████████| 155/155 [02:59<00:00,  1.24s/it]
100%|██████████| 155/155 [03:03<00:00,  1.24s/it]

{'eval_loss': 0.7912946343421936, 'eval_accuracy': 0.5714285714285714, 'eval_runtime': 1.8562, 'eval_samples_per_second': 7.542, 'eval_steps_per_second': 3.771, 'epoch': 5.0}


100%|██████████| 155/155 [03:05<00:00,  1.20s/it]

{'train_runtime': 185.6468, 'train_samples_per_second': 3.259, 'train_steps_per_second': 0.835, 'train_loss': 0.8922379032258064, 'epoch': 5.0}





TrainOutput(global_step=155, training_loss=0.8922379032258064, metrics={'train_runtime': 185.6468, 'train_samples_per_second': 3.259, 'train_steps_per_second': 0.835, 'total_flos': 79591808862720.0, 'train_loss': 0.8922379032258064, 'epoch': 5.0})

In [14]:
model.eval()
trainer.predict(dataset_test).metrics

100%|██████████| 8/8 [00:01<00:00,  5.96it/s]


{'test_loss': 0.9234374761581421,
 'test_accuracy': 0.3333333333333333,
 'test_runtime': 1.6782,
 'test_samples_per_second': 8.938,
 'test_steps_per_second': 4.767}