In [35]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable collection of raw input strings.
        labels: Indexable collection of labels aligned with ``texts`` by index;
            each entry must be accepted by ``torch.tensor``.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            max_length=..., truncation=True, return_tensors="pt")``. Its result
            must support ``.to(device)``, ``in`` and key lookup.
        max_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx, device="cpu"):
        """Tokenize the ``idx``-th text and return it together with its label.

        Args:
            idx: Position of the example in ``texts`` / ``labels``.
            device: Device the encoded tensors are moved to. Plain indexing
                (``dataset[idx]``) never supplies it, so the default applies.

        Returns:
            Dict with flattened ``input_ids`` and ``attention_mask`` tensors,
            ``token_type_ids`` when the tokenizer produced them, and ``label``
            as a tensor built from ``labels[idx]``.
        """
        encoding = self.tokenizer(self.texts[idx],
                                  padding='max_length',
                                  max_length=self.max_length,
                                  truncation=True,
                                  return_tensors="pt").to(device)
        # return_tensors="pt" yields a leading batch axis of size 1; flatten()
        # drops it so the DataLoader can stack items into (batch, max_length).
        item = {'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten()}
        # Tokenizers for models without segment embeddings may not return
        # token_type_ids; forward them only when present instead of raising
        # KeyError.
        if 'token_type_ids' in encoding:
            item['token_type_ids'] = encoding['token_type_ids'].flatten()
        item['label'] = torch.tensor(self.labels[idx])
        return item

In [39]:
# Class name (as it appears in the CSV 'label' column) -> integer class id.
label_mapping = {
    "Thinking at the Margin": 0, 
    "Counterfactual": 1, 
    "General Equilibrium": 2
}

dataset_path = "data/econ-concepts/econ-concepts-30.csv"
df = pd.read_csv(dataset_path)

texts = df['sentence'].tolist()
# Plain dict lookup on purpose: a label missing from label_mapping raises
# KeyError here instead of slipping into training as a bad value.
labels = [label_mapping[name] for name in df['label']]
model_name_or_path = "bert-base-uncased"
# Pad on the right: the dataset pads every example to max_length, and BERT's
# classification head pools the hidden state at position 0. With left padding
# that position holds a [PAD] token instead of [CLS], and the absolute
# position ids of the real tokens shift with the amount of padding.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side="right", use_fast=True)
train_dataset = TextClassificationDataset(texts, labels, tokenizer)
batch_size = 8
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [22]:
from utils import TestDataset, save_to_csv
from tqdm import tqdm
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score
# Pick the best available accelerator instead of assuming an Apple-silicon
# GPU, so the cell also runs on CUDA or CPU-only machines.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print("----- Load Tokenizer and Model -----")
model_name_or_path = "bert-base-uncased"
model_name = 'bert'
# num_labels=3 creates a freshly initialised 3-way classification head.
# NOTE(review): the weights are loaded in bfloat16; fine-tuning purely in bf16
# with a small learning rate can lose updates to rounding - confirm this is
# intended rather than float32.
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, 
                                                           num_labels=3, 
                                                           torch_dtype=torch.bfloat16, 
                                                           device_map=device)

----- Load Tokenizer and Model -----


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
learning_rate = 1e-5
epochs = 3
# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are pinned to the values the transformers implementation used
# by default (1e-6 and 0.0) so the optimisation itself is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
# One scheduler step per batch: the learning rate decays linearly to zero over
# every batch of every epoch, with no warmup phase.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [43]:
from torch import nn

def train(model, data_loader, optimizer, scheduler, device):
    """Run one pass over ``data_loader``, updating ``model`` after every batch.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
            It may return either a logits tensor or an object exposing the
            logits as a ``.logits`` attribute.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer over the model's parameters.
        scheduler: Learning-rate scheduler, stepped once per batch.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        None. The loss of every batch is printed.
    """
    loss_fn = nn.CrossEntropyLoss()  # stateless, so build it once, not per batch
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # A sequence-classification model returns an output object rather than
        # a tensor; the loss must be computed on its logits.
        logits = getattr(outputs, 'logits', outputs)
        # Compute the loss in float32 even when the model runs in a
        # reduced-precision dtype.
        loss = loss_fn(logits.float(), labels)
        print(loss.item())
        loss.backward()
        optimizer.step()
        scheduler.step()

In [None]:
import numpy as np
import pandas as pd
from transformers import BertTokenizer, Trainer, BertForSequenceClassification, TrainingArguments
from datasets import Dataset
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    """Compute accuracy for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the arg-max over axis 1 of
            ``predictions`` is taken as the predicted class of each row.

    Returns:
        Dict with the single key ``'accuracy'``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    accuracy = accuracy_score(predicted_classes, references)
    return {'accuracy': accuracy}

# Settings for the Trainer run, grouped by concern.
args = TrainingArguments(
    output_dir='output_models/',
    # Optimisation.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch; at the end, reload the
    # checkpoint selected by the 'accuracy' metric.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

# Wire the model, the training arguments, both datasets and the metric
# callback into a Trainer.
# NOTE(review): dataset_train / dataset_val are not defined in the visible
# cells - confirm they are created before this cell runs.
trainer = Trainer(
    model=model,                      # the instantiated 🤗 Transformers model to be trained
    args=args,                        # training arguments
    train_dataset=dataset_train,      # training split
    eval_dataset=dataset_val,         # evaluation split
    compute_metrics=compute_metrics,  # metric callback
)