In [21]:
from datasets import load_dataset
import datasets
boolq=load_dataset('super_glue','boolq', trust_remote_code=True)
copa=load_dataset('super_glue','copa', trust_remote_code=True)

In [22]:
boolq

DatasetDict({
    train: Dataset({
        features: ['question', 'passage', 'idx', 'label'],
        num_rows: 9427
    })
    validation: Dataset({
        features: ['question', 'passage', 'idx', 'label'],
        num_rows: 3270
    })
    test: Dataset({
        features: ['question', 'passage', 'idx', 'label'],
        num_rows: 3245
    })
})

In [23]:
copa

DatasetDict({
    train: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'idx', 'label'],
        num_rows: 400
    })
    validation: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'idx', 'label'],
        num_rows: 100
    })
    test: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'idx', 'label'],
        num_rows: 500
    })
})

In [24]:
boolq=boolq.remove_columns(['idx','label'])
copa=copa.remove_columns(['idx','label'])
print(boolq,'\n',copa)

DatasetDict({
    train: Dataset({
        features: ['question', 'passage'],
        num_rows: 9427
    })
    validation: Dataset({
        features: ['question', 'passage'],
        num_rows: 3270
    })
    test: Dataset({
        features: ['question', 'passage'],
        num_rows: 3245
    })
}) 
 DatasetDict({
    train: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question'],
        num_rows: 400
    })
    validation: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question'],
        num_rows: 100
    })
    test: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question'],
        num_rows: 500
    })
})


In [25]:
boolq=boolq.map(lambda x:{'task':"Boolq"})
copa=copa.map(lambda x:{'task':"COPA"})
print(boolq,'\n',copa)

DatasetDict({
    train: Dataset({
        features: ['question', 'passage', 'task'],
        num_rows: 9427
    })
    validation: Dataset({
        features: ['question', 'passage', 'task'],
        num_rows: 3270
    })
    test: Dataset({
        features: ['question', 'passage', 'task'],
        num_rows: 3245
    })
}) 
 DatasetDict({
    train: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'task'],
        num_rows: 400
    })
    validation: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'task'],
        num_rows: 100
    })
    test: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'task'],
        num_rows: 500
    })
})


In [26]:
boolq_data = boolq['train'].map(lambda examples: {'input_text': f"answer yes or no: {examples['question']} passage: {examples['passage']}"})
copa_data = copa['train'].map(lambda examples: {'input_text': f"{examples['premise']} what is the {examples['question']}: {examples['choice1']} or {examples['choice2']}?"})

boolq_data = boolq_data.remove_columns(['question', 'passage','task'])
copa_data = copa_data.remove_columns(['premise', 'choice1', 'choice2', 'question', 'task'])


In [33]:
boolq_data.save_to_disk("Boolq")
copa_data.save_to_disk("copa")

Saving the dataset (0/1 shards):   0%|          | 0/9427 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/400 [00:00<?, ? examples/s]

In [27]:
print(boolq_data,'\n',copa_data)

Dataset({
    features: ['input_text'],
    num_rows: 9427
}) 
 Dataset({
    features: ['input_text'],
    num_rows: 400
})


In [29]:
dataset = datasets.concatenate_datasets([boolq_data, copa_data])


In [30]:
dataset.save_to_disk("Boolq_and_copq")

Saving the dataset (0/1 shards):   0%|          | 0/9827 [00:00<?, ? examples/s]

In [9]:
dataset['task'][1]

['boolq']

In [10]:
label2id = {'boolq': 0, 'copa': 1}


dataset = dataset.map(lambda examples: {**examples, 'labels': [label2id[task.lower()] for task in examples['task']]})

In [11]:
dataset['labels'][1]


[0]

In [12]:
dataset

Dataset({
    features: ['task', 'input_text', 'labels'],
    num_rows: 9827
})

In [13]:
dataset=dataset.train_test_split(test_size=0.3)

dataset

DatasetDict({
    train: Dataset({
        features: ['task', 'input_text', 'labels'],
        num_rows: 6878
    })
    test: Dataset({
        features: ['task', 'input_text', 'labels'],
        num_rows: 2949
    })
})

## training the classifier

In [22]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

device="cuda" if torch.cuda.is_available() else 'cpu'
model_name="bert-base-uncased"
model=AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    tokenized_inputs = tokenizer(examples['input_text'], truncation=True, padding=True)
    tokenized_inputs['labels'] = examples['labels']  # Include labels in the tokenized inputs
    return tokenized_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2949 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['task', 'input_text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6878
    })
    test: Dataset({
        features: ['task', 'input_text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2949
    })
})

In [15]:
import evaluate

metric=evaluate.load('accuracy')

def compute_metric(eval_pred):
    logits, label=eval_pred
    predictions=logits.argmax(axis=-1)
    return metric.compute(references=label, predictions=predictions)

In [16]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    save_steps=150,
    save_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_steps=150,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metric
)

In [32]:

trainer.train()

[2024-10-25 14:35:21,057] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/deon/miniconda3/envs/rakuten_project/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/deon/miniconda3/envs/rakuten_project/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/home/deon/miniconda3/envs/rakuten_project/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/home/deon/miniconda3/envs/rakuten_project/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/home/deon/miniconda3/envs/rakuten_project/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/home/deon/miniconda3/envs/rakuten_project/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsi

  0%|          | 0/1290 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [24]:
import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

model_name="./Bert_classifier"
model=AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [35]:
def classify(input_text):
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=-1).item()
    return predicted_class


classify(boolq_data[1]['input_text'])
    

0

In [20]:
print(boolq_data[1])

print(boolq_data[1]['task']) 

{'task': ['boolq'], 'input_text': 'do good samaritan laws protect those who help at an accident'}
['boolq']


In [28]:
classify(copa_data[1]['input_text'])

1

In [30]:
model.save_pretrained("./Bert_classifier")
tokenizer.save_pretrained("./Bert_classifier")

('./Bert_classifier/tokenizer_config.json',
 './Bert_classifier/special_tokens_map.json',
 './Bert_classifier/vocab.txt',
 './Bert_classifier/added_tokens.json',
 './Bert_classifier/tokenizer.json')