In [None]:
!pip install -U transformers datasets evaluate accelerate
!pip install scikit-learn
!pip install tensorboard

In [None]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)

import evaluate
import glob
import numpy as np

In [None]:
# --- Model checkpoint and output location ---------------------------------
MODEL = 'bert-base-uncased'   # passed to the tokenizer / model `from_pretrained` calls
OUT_DIR = 'arxiv_bert'        # used as the Trainer's `output_dir`

# --- Optimisation settings -------------------------------------------------
EPOCHS = 10
LR = 1e-4
BATCH_SIZE = 64               # reused for train/eval batches and for `Dataset.map` batches

# --- Preprocessing ---------------------------------------------------------
NUM_PROCS = 32                # worker count handed to `Dataset.map(num_proc=...)`

# --- Data files, keyed by split name --------------------------------------
PATH = dict(
    train='NLP2.csv',
    test='test.csv',
    validation='validation.csv',
)

In [None]:
# # prompt: how do I rename dataset columns

# dataset = load_dataset(PATH, split='train')
# dataset = dataset.rename_column("text", "sentence1")
# dataset = dataset.rename_column("label", "labels")


In [None]:
# Load every split named in PATH with a single `load_dataset` call (omitting
# `split=` returns a mapping keyed by split name) instead of re-running the
# loader once per split, and normalise the column names in one step so the
# three splits cannot drift apart.
dataset_splits = load_dataset("/content", data_files=PATH).rename_columns(
    {"Text": "text", "Labels": "label"}
)

# Keep the per-split names the rest of the notebook relies on.
train_dataset = dataset_splits["train"]
valid_dataset = dataset_splits["validation"]
test_dataset = dataset_splits["test"]

# Print each split's summary, in the same order as before.
for split_dataset in (train_dataset, valid_dataset, test_dataset):
    print(split_dataset)

In [None]:
# Show the (column-renamed) training split as this cell's output.
train_dataset

In [None]:
# Load the tokenizer that matches the checkpoint named by MODEL; it is used by
# `preprocess_function`, the padding collator, and the Trainer below.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook tokenizer.

    Truncation is switched on; no padding is requested here (the notebook
    builds a ``DataCollatorWithPadding`` for the Trainer separately).

    Args:
        examples: A mapping with a ``"text"`` key, e.g. a single dataset row
            or a batch supplied by ``Dataset.map(batched=True)``.

    Returns:
        The tokenizer's output for ``examples["text"]``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Every split is tokenized with exactly the same `Dataset.map` settings, so
# the options are written once and applied to train, validation and test in
# that order.
map_options = dict(batched=True, batch_size=BATCH_SIZE, num_proc=NUM_PROCS)

tokenized_train, tokenized_valid, tokenized_test = (
    split.map(preprocess_function, **map_options)
    for split in (train_dataset, valid_dataset, test_dataset)
)

In [None]:
# Collator built from the notebook tokenizer; it is handed to the Trainer as
# `data_collator` further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Sanity check: run the preprocessing function on the first training row and
# report how many token IDs / attention-mask entries it produced.
tokenized_sample = preprocess_function(train_dataset[0])
print(tokenized_sample)

ids_length = len(tokenized_sample.input_ids)
mask_length = len(tokenized_sample.attention_mask)
print(f"Length of tokenized IDs: {ids_length}")
print(f"Length of attention mask: {mask_length}")

In [None]:
# Tokenize the first training row again and print the resulting encoding
# (this repeats the call made in the preceding preview cell).
tokenized_sample = preprocess_function(train_dataset[0])
print(tokenized_sample)

In [None]:
# Metric object shared by every evaluation pass.
accuracy = evaluate.load('accuracy')


def compute_metrics(eval_pred):
    """Score one evaluation pass with the loaded accuracy metric.

    Args:
        eval_pred: A pair that unpacks into ``(predictions, labels)``. The
            predictions are reduced to a class index by taking the argmax
            along axis 1 before they are compared with the labels.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted classes and
        the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(
        predictions=predicted_classes,
        references=references,
    )

In [None]:
# Load the MODEL checkpoint with a sequence-classification head.
# NOTE(review): num_labels=2 assumes the "label" column is binary — confirm against the CSVs.
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=2)

In [None]:
# The per-epoch evaluation option was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, and the old keyword is
# rejected there. Because the install cell upgrades transformers without a
# pin, choose whichever field name the installed TrainingArguments declares
# so this cell works on both old and new versions.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=OUT_DIR,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.001,
    # Evaluate and checkpoint once per epoch; the two schedules are kept
    # identical because `load_best_model_at_end` is enabled below.
    **{eval_strategy_key: "epoch"},
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=3,
    report_to='tensorboard',
    # NOTE(review): fp16 presumably needs a CUDA device — confirm before running on CPU.
    fp16=True
)

In [None]:
# Assemble the Trainer from the pieces built in the cells above.
trainer = Trainer(
    # what to train and how
    args=training_args,
    model=model,
    # data: tokenized train split for fitting, validation split for evaluation
    eval_dataset=tokenized_valid,
    train_dataset=tokenized_train,
    # batching and scoring helpers
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Run the training loop and keep its return value for later inspection.
history = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.775742,0.470588
2,No log,0.717178,0.5
3,No log,0.763219,0.529412
4,No log,0.885006,0.470588
5,No log,0.831539,0.441176
6,No log,0.846514,0.441176
7,No log,0.889738,0.470588
8,No log,0.938234,0.352941
9,No log,0.955523,0.382353
10,No log,0.95771,0.411765


In [None]:
# Evaluate on the tokenized test split; the returned metrics dict is the cell output.
# NOTE(review): `load_best_model_at_end=True` is set in the training arguments, so this
# presumably scores the restored best checkpoint rather than the last epoch — confirm.
trainer.evaluate(tokenized_test)

{'eval_loss': 0.6953545808792114,
 'eval_accuracy': 0.5172413793103449,
 'eval_runtime': 0.0346,
 'eval_samples_per_second': 837.463,
 'eval_steps_per_second': 28.878,
 'epoch': 10.0}

In [None]:
# AutoModelForSequenceClassification.from_pretrained(f"arxiv_bert/checkpoint-4440")

# tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# classify = pipeline(task='text-classification', model=model, tokenizer=tokenizer)

# all_files = glob.glob('inference_data/*')
# for file_name in all_files:
#     file = open(file_name)
#     content = file.read()
#     print(content)
#     result = classify(content)
#     print('PRED: ', result)
#     print('GT: ', file_name.split('_')[-1].split('.txt')[0])
#     print('\n')