In [1]:
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification

In [2]:
# Load the training data from a local CSV file.
# The builder is named explicitly ('csv') instead of pointing at the current
# directory, so loading does not depend on what else lives in the working dir.
# A bare `data_files` path is exposed under the 'train' split, hence split='train'.
data_train = load_dataset('csv', data_files='data/train.csv', split='train')
data_train

Dataset({
    features: ['text', 'labels'],
    num_rows: 28741
})

In [3]:
# Load the held-out data from a local CSV file.
# The builder is named explicitly ('csv') instead of pointing at the current
# directory, so loading does not depend on what else lives in the working dir.
# NOTE: split='train' is intentional -- a bare `data_files` path is exposed under
# the 'train' split name even though this file holds the test examples.
data_test = load_dataset('csv', data_files='data/test.csv', split='train')
data_test

Dataset({
    features: ['text', 'labels'],
    num_rows: 7186
})

In [4]:
import torch

# Choose the compute device in order of preference: Apple MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device("cpu")

device

device(type='mps')

In [5]:
# Tokenizer and classifier are both built from the same pretrained checkpoint.
checkpoint = "roberta-large"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# num_labels=10 requests a 10-way classification head; that head is newly
# initialised (see the warning in the cell output) and must be trained.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=10)
model = model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def preprocess_function(df):
    """Tokenize the ``"text"`` entry of ``df`` with the notebook-level tokenizer.

    ``df`` is indexed with the key ``"text"``; when this function is applied via
    ``Dataset.map(..., batched=True)`` it is called on batches of examples.
    Truncation is enabled; padding is not requested here.

    Returns the tokenizer's output unchanged.
    """
    texts = df["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [7]:
# Tokenize the training set in batches; the tokenizer's outputs are added as new columns.
data_train_tokenized = data_train.map(
    function=preprocess_function,
    batched=True,
)
data_train_tokenized

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 28741
})

In [8]:
# Tokenize the held-out set with the same preprocessing as the training set.
data_test_tokenized = data_test.map(
    function=preprocess_function,
    batched=True,
)
data_test_tokenized

Map:   0%|          | 0/7186 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 7186
})

In [9]:
# Pad each batch at collation time (tokenization above does not pad).
# padding=True is the collator's default, spelled out here for clarity.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

In [10]:
# Hyperparameters for one fine-tuning pass; run outputs go to ./trained_models.
training_args = TrainingArguments(
    output_dir='./trained_models',
    # schedule
    num_train_epochs=1,
    # batch sizes (per device)
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # optimizer settings
    learning_rate=2e-5,
    weight_decay=0.01,
)

In [11]:
# Bundle the model, training arguments, tokenized datasets, tokenizer and
# padding collator into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_train_tokenized,
    eval_dataset=data_test_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [12]:
# Run fine-tuning with the Trainer configured above.
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt at
# step 0 of 7186, so the saved notebook does not contain a completed training run.
trainer.train()



  0%|          | 0/7186 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 

In [None]:
# Persist the fine-tuned model together with its tokenizer in one directory so
# that both can later be reloaded from 'trilytics_1' via `from_pretrained`.
# Saving only the model would leave the directory without tokenizer files.
save_dir = 'trilytics_1'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)