In [1]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding, AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from tensorboard import notebook
from sklearn.metrics import accuracy_score
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# data

file_dict = {
  "train" : "data/coliee_train/coliee_2021.csv",
}
# for test also prepare same type of file as train

dataset = load_dataset(
  'csv',
  data_files=file_dict,
  delimiter=',',
  column_names=['label', 'premise', 'hypothesis', 'labels'],
  skiprows=1
)

print(dataset)
print(dataset['train'][0])

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def tokenize_function(examples):
    return tokenizer(examples["premise"], examples['hypothesis'], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

train_tokenized_datasets = tokenized_datasets['train'].select(range(750))
eval_tokenized_datasets = tokenized_datasets['train'].select(range(750, 805))

Found cached dataset csv (/Users/venkateshmurugadas/.cache/huggingface/datasets/csv/default-1077628d21b82835/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)
100%|██████████| 1/1 [00:00<00:00, 206.65it/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'premise', 'hypothesis', 'labels', '__index_level_0__'],
        num_rows: 805
    })
})
{'label': 0, 'premise': "Article 5\n(1) A minor must obtain the consent of the minor's legal representative to perform a juridical act;provided, however, that this does not apply to a juridical act for merely acquiring a right or being released from an obligation.\n(2) A juridical act in contravention of the provisions of the preceding paragraph is voidable.\n(3) Notwithstanding the provisions of paragraph (1), a minor may freely dispose of property that the legal representative has permitted the minor to dispose of for a specified purpose, to an extent that falls within the scope of that purpose. The same applies if the minor disposes of property that the legal representative has permitted the minor to dispose of without specifying a purpose..", 'hypothesis': 'Acceptance made by a minor that received an offer of gifts without burden wi

Loading cached processed dataset at /Users/venkateshmurugadas/.cache/huggingface/datasets/csv/default-1077628d21b82835/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-2c89c1a971c919ff.arrow


In [3]:
train_tokenized_datasets

Dataset({
    features: ['label', 'premise', 'hypothesis', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 750
})

In [4]:
eval_tokenized_datasets

Dataset({
    features: ['label', 'premise', 'hypothesis', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 55
})

In [5]:
class BertConfig:
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    max_length = 128
    batch_size = 8
    learning_rate = 1e-6
    num_epochs = 3
    num_labels = 2
    warmup_steps = 300
    weight_decay = 0.01
    log_steps = 10

config = BertConfig()

In [6]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(config.device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [7]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {'accuracy': accuracy_score(labels, predictions)}

In [8]:


# Training the model
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=config.num_epochs,
    per_device_train_batch_size=config.batch_size,
    per_device_eval_batch_size=config.batch_size,
    warmup_steps=config.warmup_steps,
    weight_decay=config.weight_decay,
    logging_dir='./logs',
    logging_steps=config.log_steps,  # Log every 10 steps
    evaluation_strategy='epoch',
    save_strategy='epoch',  # Save the model every epoch
    load_best_model_at_end=True,
    use_mps_device=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=eval_tokenized_datasets,
    compute_metrics=compute_metrics,
)

In [9]:
trainer.train()

  0%|          | 0/282 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
%tensorboard --logdir logs
