In [None]:
pip install datasets

In [None]:
!pip install PyTorch
!pip install accelerate

In [None]:
import random
import os
import torch
import torch.utils
from torch.utils.data import Dataset
from datasets import load_dataset, Dataset, concatenate_datasets
import pandas as pd
from transformers import XLMRobertaForSequenceClassification, AutoTokenizer, AutoModelForMaskedLM, TrainingArguments, Trainer, DataCollatorWithPadding, AdamW, get_linear_schedule_with_warmup, XLMRobertaForSequenceClassification, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.utils import resample

def tokenize(examples):
    """Tokenize the ``'text'`` field of ``examples`` with the module-level tokenizer.

    Special tokens are added and truncation is enabled; no padding is
    requested here. The tokenizer's output is returned unchanged.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, add_special_tokens=True)
    return encoded


def compute_metrics(pred):
    """Compute classification metrics from a prediction object.

    Reads ``pred.label_ids`` as the reference labels and takes the argmax
    over the last axis of ``pred.predictions`` as the predicted labels.

    Returns a dict with the keys 'accuracy', 'precision', 'recall', 'f1'
    and 'mcc'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        # F1 is the only metric here computed with weighted averaging.
        'f1': f1_score(y_true, y_pred, average='weighted'),
        'mcc': matthews_corrcoef(y_true, y_pred),
    }


def find_indices(dataset):
    """Return the positions of label-0 and label-1 rows in ``dataset``.

    ``dataset`` is iterated once; each row must support ``row['label']``.
    Rows whose label is neither 0 nor 1 are ignored.

    Returns a tuple ``(label_0_positions, label_1_positions)`` of lists, each
    in ascending order.
    """
    labels = [row['label'] for row in dataset]
    zero_positions = [pos for pos, label in enumerate(labels) if label == 0]
    one_positions = [pos for pos, label in enumerate(labels) if label == 1]
    return zero_positions, one_positions


def downsample_balanced(dataset, max_length):
    """Sample an equal number of label-0 and label-1 rows from ``dataset``.

    The per-class sample size is the smallest of: the number of label-0
    rows, the number of label-1 rows, and ``max_length``. Rows are drawn
    without replacement using ``random.sample``, and the two per-class
    counts are printed.

    Returns a list of rows: the sampled label-0 rows followed by the sampled
    label-1 rows.
    """
    zero_idx, one_idx = find_indices(dataset)

    # Cap both classes at the size of the smaller one (and at max_length).
    per_class = min(len(zero_idx), len(one_idx), max_length)

    picked_zero = random.sample(zero_idx, per_class)
    picked_one = random.sample(one_idx, per_class)

    print('label_0: ', len(picked_zero))
    print('label_1: ', len(picked_one))

    return [dataset[pos] for pos in picked_zero + picked_one]


def make_balanced_dataset_one(dataset):
    """Build a class-balanced training ``Dataset`` from a single dataset.

    At most 2528 rows per label are kept (via ``downsample_balanced``).
    Prints the size of the resulting dataset before returning it.
    """
    rows = downsample_balanced(dataset, 2528)
    balanced = Dataset.from_list(rows)
    print('train_size: ', len(balanced))
    return balanced

def make_balanced_dataset_two(dataset_nl, dataset_en):
    """Build one class-balanced training ``Dataset`` from two datasets.

    Each input is downsampled to at most 1264 rows per label (via
    ``downsample_balanced``); the rows from ``dataset_nl`` come first in the
    result, followed by those from ``dataset_en``. The sampled ``dataset_nl``
    rows and the final size are printed.
    """
    nl_rows = downsample_balanced(dataset_nl, 1264)
    print(nl_rows)
    en_rows = downsample_balanced(dataset_en, 1264)

    combined = Dataset.from_list(nl_rows + en_rows)
    print('train_size: ', len(combined))
    return combined

# Directory for checkpoints and the final fine-tuned model. It is resolved to
# an absolute path once so that the directory created here is the same one the
# Trainer and save_pretrained write to (when run from /content, this is the
# previously hard-coded /content/output).
output_dir = os.path.abspath('output')
os.makedirs(output_dir, exist_ok=True)

# Set seeds. The balanced sampling below uses Python's `random` module, so it
# must be seeded too; seeding only torch leaves the training subset different
# on every run.
random.seed(42)
torch.manual_seed(42)

# Pick the device: GPU when available, otherwise CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Load Dutch CoLa
dataset_nl = load_dataset("GroNLP/dutch-cola")

# Load English CoLa
dataset_en = load_dataset("shivkumarganesh/CoLA")

# Remove unnecessary columns from the datasets
dataset_nl = dataset_nl.remove_columns(['Source', 'Original ID', 'Original annotation', 'Material added'])
dataset_en = dataset_en.remove_columns(['Unnamed: 0', 'id', 'etc'])
print(len(dataset_en['train']))

# Rename the Sentence and Acceptability columns to the names used by
# tokenize() and the balancing helpers
dataset_nl = dataset_nl.rename_column("Sentence", "text")
dataset_nl = dataset_nl.rename_column("Acceptability", "label")

# Make combined nl/en dataset
#dataset_train_balanced = make_balanced_dataset_two(dataset_nl['train'], dataset_en['train'])

# Make nl dataset
dataset_train_balanced = make_balanced_dataset_one(dataset_nl['train'])

# Make en dataset
#dataset_train_balanced = make_balanced_dataset_one(dataset_en['train'])

# Load tokenizer. The XLM-R tokenizer ships with its own <pad> token, so the
# pad token is NOT overridden with the EOS token here: doing so would pad
# batches with </s> ids instead of the pad id the model config expects.
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")

# Tokenize the datasets
tokenized_dataset_nl = dataset_nl.map(tokenize, batched=True)
tokenized_dataset_train = dataset_train_balanced.map(tokenize, batched=True)

# Training uses the balanced subset; validation and test use the Dutch splits
tokenized_train = tokenized_dataset_train
tokenized_test = tokenized_dataset_nl['test']
tokenized_valid = tokenized_dataset_nl['validation']

# Create data collator (pads each batch dynamically)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load Roberta
model = XLMRobertaForSequenceClassification.from_pretrained("FacebookAI/xlm-roberta-base", num_labels = 2)

# Run the model on the selected device
model.to(device)

# Define training arguments.
# No hand-built optimizer is passed to the Trainer: a custom optimizer makes
# the Trainer ignore `weight_decay` below. Letting the Trainer create its
# default AdamW applies learning_rate and weight_decay as configured here
# (its default betas (0.9, 0.999) and eps 1e-8 match the previous values).
training_args = TrainingArguments(
    output_dir=output_dir,
    logging_dir='logs',
    learning_rate=4e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    logging_strategy='epoch',
    gradient_accumulation_steps=1,
    # Mixed precision needs a GPU; enabling it unconditionally breaks the
    # CPU fallback selected above.
    fp16=use_cuda,
)

# Create trainer
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Train the model
trainer.train()

# Evaluate model performance on the held-out test split and show the result
test_metrics = trainer.evaluate(tokenized_test)
print(test_metrics)

# Save the model locally first, so a failed upload cannot lose the result
model.save_pretrained(os.path.join(output_dir, 'fine-tuned-xlmr'))

# Publish the model (requires Hugging Face Hub credentials)
model.push_to_hub("shuisman/xlm-roberta-base-dutch-cola")