In [3]:
from transformers import AutoTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import pandas as pd
import numpy as np
from datasets import load_dataset, load_metric

# --- Configuration ---------------------------------------------------------
MODEL = "klue/roberta-base"        # checkpoint id handed to from_pretrained
INPUT = "data/sample_train.csv"    # training CSV, relative to the working directory
MAX_LEN = 256                      # max_length used when tokenizing a code pair

# --- Raw data and tokenizer ------------------------------------------------
# Load the CSV and keep the "train" entry of the returned splits.
csv_splits = load_dataset("csv", data_files=INPUT)
dataset = csv_splits['train']

tokenizer = AutoTokenizer.from_pretrained(MODEL)

def example_fn(examples):
    """Tokenize a (code1, code2) pair and attach its label when present.

    Intended as the callback for `Dataset.map`.

    Args:
        examples: mapping with 'code1' and 'code2' entries and, for labelled
            data, a 'similar' entry.

    Returns:
        The tokenizer output; when the input has a 'similar' key its value is
        copied into the output under "labels".
    """
    # Only truncate here. Padding is deliberately left to the
    # DataCollatorWithPadding used by the Trainer, which pads each batch to
    # its own longest sequence. Padding at map time is a no-op for a single
    # example and would over-pad if this function were mapped with
    # batched=True.
    outputs = tokenizer(
        examples['code1'],
        examples['code2'],
        padding=False,
        max_length=MAX_LEN,
        truncation=True,
    )
    if 'similar' in examples:
        outputs["labels"] = examples["similar"]
    return outputs

# Tokenize every row; the raw text and label columns are dropped because
# example_fn re-emits the label under "labels".
dataset = dataset.map(example_fn, remove_columns=['code1', 'code2', 'similar'])

# Hold out 10% for evaluation. The seed is fixed so the split (and therefore
# the reported metrics) is reproducible across kernel restarts.
SPLIT_SEED = 42
dataset = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

# Pads each batch at collation time using the tokenizer's padding settings.
_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package; kept here to avoid adding a
# dependency — confirm against the installed version.
_metric = load_metric("glue", "sst2")

def metric_fn(p):
    """Score one evaluation pass with the module-level `_metric`.

    Args:
        p: an object that unpacks into (predictions, labels).

    Returns:
        Whatever `_metric.compute` returns for the argmax (over the last
        axis) of the predictions against the labels.
    """
    scores, gold = p
    predicted = np.argmax(scores, axis=-1)
    return _metric.compute(predictions=predicted, references=gold)
    
# (Translated from the original author's note.) Unlike
# BertForSequenceClassification, RobertaForSequenceClassification has no
# pooler by default, so it can be used for sentence similarity.
model = RobertaForSequenceClassification.from_pretrained(MODEL)

args = TrainingArguments(
    output_dir='runs/',
    do_train=True,
    do_eval=True,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    # Log, evaluate and checkpoint on the same per-epoch schedule.
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=_collator,
    tokenizer=tokenizer,
    compute_metrics=metric_fn,
)

trainer.train()

FileNotFoundError: Unable to find '/Users/sihyun/Downloads/data/sample_train.csv' at /Users/sihyun/Downloads

In [13]:
import pandas as pd

import os

TEST = "data/test.csv"
SUB = "data/sample_submission.csv"
OUT = './submissions/submission.csv'

# Create the output directory up front: DataFrame.to_csv does not create
# missing parent directories, and discovering that only after inference
# would throw away the whole prediction run.
os.makedirs(os.path.dirname(OUT), exist_ok=True)

# Tokenize the unlabelled test pairs with the same function used for training
# (no 'similar' column here, so no "labels" entry is produced).
test_dataset = load_dataset("csv", data_files=TEST)['train']
test_dataset = test_dataset.map(example_fn, remove_columns=['code1', 'code2'])

# Read the submission template before predicting so a row-count mismatch is
# reported immediately instead of after the expensive predict() call.
df = pd.read_csv(SUB)
if len(df) != len(test_dataset):
    raise ValueError(
        f"{SUB} has {len(df)} rows but {TEST} has {len(test_dataset)} rows"
    )

predictions = trainer.predict(test_dataset)

# Predicted class = index of the largest value along the last axis.
df['similar'] = np.argmax(predictions.predictions, axis=-1)
df.to_csv(OUT, index=False)

Using custom data configuration default-119b339a9ccf4b98
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-119b339a9ccf4b98/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['pair_id', 'code1', 'code2'],
    num_rows: 179700
})


  0%|          | 0/179700 [00:00<?, ?ex/s]

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: pair_id. If pair_id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 179700
  Batch size = 8
