In [1]:
# Importing
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, AutoConfig, AdamW, get_scheduler
import numpy as np
import evaluate
import torch
from datasets import load_dataset
import flask
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import torch.nn as nn

In [10]:
# Base checkpoint shared by the tokenizer, the config and the model
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# One output with problem_type="regression", because the WA values in the dataset are continuous
config = AutoConfig.from_pretrained(
    checkpoint,
    num_labels=1,
    problem_type="regression",
)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, config=config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Read the argument-quality CSV with the generic "csv" loader and display the result
raw_dataset = load_dataset(
    "csv",
    data_files="arg_quality_rank_30k.csv",
)
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['argument', 'topic', 'set', 'WA', 'MACE-P', 'stance_WA', 'stance_WA_conf'],
        num_rows: 30497
    })
})

In [12]:
# The loaded data only has a "train" split, so hold out 20% of it as validation data
# to evaluate the fine-tuned model on.
# A fixed seed makes the shuffle-and-split identical after every kernel restart, so
# results from different runs are measured against the same validation rows.
from datasets import DatasetDict

train_test_split = raw_dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

# Re-key the held-out part as "validation"
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": eval_dataset,
})

dataset_dict

DatasetDict({
    train: Dataset({
        features: ['argument', 'topic', 'set', 'WA', 'MACE-P', 'stance_WA', 'stance_WA_conf'],
        num_rows: 24397
    })
    validation: Dataset({
        features: ['argument', 'topic', 'set', 'WA', 'MACE-P', 'stance_WA', 'stance_WA_conf'],
        num_rows: 6100
    })
})

In [13]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)

cuda


In [14]:
# Defining the tokenize function
def tokenize_function(dataset):
    """Tokenize the 'argument' texts of a batch of examples.

    Args:
        dataset: a mapping with an 'argument' entry holding the text(s) to
            tokenize (this is what `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for those texts, truncated to the tokenizer's
        maximum length and left unpadded.
    """
    # No padding here on purpose: padding inside a batched map pads to the longest
    # example of each map batch and stores all of that padding in the dataset.
    # DataCollatorWithPadding pads each training batch when it is assembled instead.
    return tokenizer(dataset["argument"], truncation=True)

In [15]:
# Tokenize both splits batch-wise, then print the first five training rows as a check
tokenized_dataset = dataset_dict.map(
    tokenize_function,
    batched=True,
)
print(tokenized_dataset["train"][:5])

Map: 100%|██████████| 24397/24397 [00:03<00:00, 6973.64 examples/s]
Map: 100%|██████████| 6100/6100 [00:00<00:00, 7792.19 examples/s]

{'argument': ['people are living healthier longer now and forcing retirement based on arbitrary guidelines like age is a bad idea.', 'factory farming causes cruelty to animals', 'it is reverse discrimination and only benefits certain minorities. asians do not benefit from it.', 'although we hope never to use them, we need to keep our nuclear weapons as a deterrent against others using them therefore we should not abolish them.', 'we should not abandon the use of school uniforms because it helps to create a fair and even playing field between students who are rich and poor alike.'], 'topic': ['We should end mandatory retirement', 'We should ban factory farming', 'We should end affirmative action', 'We should fight for the abolition of nuclear weapons', 'We should abandon the use of school uniform'], 'set': ['train', 'test', 'train', 'train', 'dev'], 'WA': [0.941212704, 1.0, 0.833479962, 0.943428971, 0.864049819], 'MACE-P': [0.886630134, 0.977731841, 0.432486809, 0.873463115, 0.785187507




In [16]:
# Collator used by the DataLoaders: nothing is padded here; each batch is padded
# to its own longest sequence at the moment the batch is assembled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
# Show the splits with their columns and row counts after tokenization
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['argument', 'topic', 'set', 'WA', 'MACE-P', 'stance_WA', 'stance_WA_conf', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 24397
    })
    validation: Dataset({
        features: ['argument', 'topic', 'set', 'WA', 'MACE-P', 'stance_WA', 'stance_WA_conf', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6100
    })
})


In [18]:
# Keep only what the model consumes: drop the text/metadata columns and expose the
# continuous WA score under the name "labels". MACE-P is also continuous and could
# be used as the target instead.
tokenized_dataset = (
    tokenized_dataset
    .remove_columns(["argument", "topic", "set", "stance_WA", "MACE-P", "stance_WA_conf"])
    .rename_column("WA", "labels")
)
# Return torch tensors when the dataset is indexed
tokenized_dataset.set_format("torch")
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 24397
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6100
    })
})

In [19]:
# Print the regression targets of each split
for split_name in ("train", "validation"):
    print(tokenized_dataset[split_name]["labels"])

tensor([0.9412, 1.0000, 0.8335,  ..., 1.0000, 0.7650, 0.9060])
tensor([0.8102, 0.6280, 0.8768,  ..., 0.4755, 0.7565, 0.7495])


In [22]:
# Build the loaders: only the training data is shuffled; both use the padding collator
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_dataset["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

# Sanity check: pull a single batch and display the shape of every tensor in it
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 59]),
 'token_type_ids': torch.Size([8, 59]),
 'attention_mask': torch.Size([8, 59])}

In [24]:
# Put every tensor of the inspected batch on the device the model lives on
batch = {name: tensor.to(device) for name, tensor in batch.items()}

In [26]:
# Training loop: fine-tune for `num_epochs` epochs with a linearly decaying learning rate
model.to(device)  # `device` was already selected in an earlier cell

# torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
# eps and weight_decay are spelled out to match the defaults of the transformers
# implementation, so the optimizer's hyperparameters stay the same as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Build the loss module once instead of on every step
loss_fn = nn.MSELoss()

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # squeeze(-1) removes only the trailing size-1 dimension, so predictions keep
        # one entry per example even when a batch holds a single example; a bare
        # squeeze() would collapse that case to a 0-d tensor that no longer matches
        # the shape of the labels.
        predictions = outputs.logits.squeeze(-1)
        # Cast the targets to the prediction dtype so MSELoss never sees mixed dtypes
        targets = batch["labels"].to(predictions.dtype)
        loss = loss_fn(predictions, targets)
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

100%|██████████| 9150/9150 [05:28<00:00, 28.91it/s]

In [29]:
# Store the fine-tuned model and the tokenizer files in the same directory
# NOTE(review): the output recorded for this cell lists files under 'NLP/To_Download/',
# not 'trained_model' - it looks stale; re-run the cell to confirm where files land.
save_dir = "trained_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('NLP/To_Download/tokenizer_config.json',
 'NLP/To_Download/special_tokens_map.json',
 'NLP/To_Download/vocab.txt',
 'NLP/To_Download/added_tokens.json',
 'NLP/To_Download/tokenizer.json')

In [27]:
# Helper for trying the fine-tuned model on new text
def evaluate_argument_quality(argument):
    """Score `argument` with the fine-tuned model.

    The text is tokenized (padded, truncated) into PyTorch tensors, moved to
    `device`, and run through `model` in eval mode with gradients disabled.

    Returns:
        The model's logits with size-1 dimensions squeezed out, as a NumPy
        array on the CPU.
    """
    model.eval()
    encoded = tokenizer(argument, padding=True, truncation=True, return_tensors="pt")
    encoded = encoded.to(device)
    with torch.no_grad():
        logits = model(**encoded).logits
    return logits.squeeze().cpu().numpy()

In [28]:
# A few hand-written inputs for a quick smoke test of the fine-tuned model
arguments = [
    "Your wife was seen in the market. So, I assumed your wife was not at home.",
    "Cows have four legs. Donkeys have four legs. So cows are donkeys.",
    "The sky is blue during the day.",
    "Water boild at 100 degrees celsius.",
]

# Score each input in order and print one line per result
for score in map(evaluate_argument_quality, arguments):
    print(f"Quality Score of argument: {score}")

Quality Score of argument: 0.36732128262519836
Quality Score of argument: 0.5132728815078735
Quality Score of argument: 0.5154016613960266
Quality Score of argument: 0.9191880226135254
