In [1]:
!pip install transformers[torch] datasets



In [2]:
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification

In [3]:
# Read train.csv from the working directory. The rows are requested via the
# 'train' split, which is the split name used for a lone data file in this notebook.
data_train = load_dataset(
    ".",
    data_files="train.csv",
    split="train",
)
data_train

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['text', 'labels'],
    num_rows: 28741
})

In [4]:
# Read test.csv the same way as the training file. split="train" is intentional:
# it names the split the rows are loaded under, not the role of the data.
data_test = load_dataset(
    ".",
    data_files="test.csv",
    split="train",
)
data_test

Dataset({
    features: ['text', 'labels'],
    num_rows: 7186
})

In [5]:
import torch
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

device

device(type='cuda')

In [6]:
# Report GPU allocator state before the model is loaded.
# The summary is CUDA-only, so skip it when the notebook is running on the CPU
# fallback chosen above instead of letting the cell raise.
if torch.cuda.is_available():
    print(torch.cuda.memory_summary(abbreviated=False))
else:
    print("CUDA is not available; no GPU memory summary to report.")

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------

In [7]:
# Load the pretrained roberta-large tokenizer and a sequence-classification
# model configured for 11 output labels, then move the model to the chosen device.
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-large",
    num_labels=11,
)
model = model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def preprocess_function(df):
    """Tokenize the ``"text"`` entry of a batch with the notebook-level tokenizer.

    Args:
        df: Mapping with a ``"text"`` key holding the text(s) to tokenize.

    Returns:
        Whatever the tokenizer returns when called with truncation enabled.
        No padding is requested here.
    """
    texts = df["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [9]:
# Tokenize the training set batch-wise; the mapped dataset keeps the original
# columns and gains the tokenizer's output columns.
data_train_tokenized = data_train.map(
    preprocess_function,
    batched=True,
)
data_train_tokenized

Map:   0%|          | 0/28741 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 28741
})

In [10]:
# Tokenize the test set with the same preprocessing used for training.
data_test_tokenized = data_test.map(
    preprocess_function,
    batched=True,
)
data_test_tokenized

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 7186
})

In [11]:
# Collator handed to the Trainer below; padding is left to it because
# preprocess_function tokenizes with truncation only.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
# Hyperparameters for a single fine-tuning epoch; outputs go to ./trained_models.
training_args = TrainingArguments(
    output_dir="./trained_models",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
)

In [13]:
# Bundle the model, tokenized datasets, tokenizer, and collator for training.
# An eval dataset is registered here, but no evaluation call appears in the
# visible cells of this notebook.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_train_tokenized,
    eval_dataset=data_test_tokenized,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [14]:
# Run fine-tuning with the arguments configured above; the returned TrainOutput
# is displayed as the cell result.
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.6942
1000,0.495
1500,0.4141


Step,Training Loss
500,0.6942
1000,0.495
1500,0.4141
2000,0.3661
2500,0.3869
3000,0.3511
3500,0.2958


TrainOutput(global_step=3593, training_loss=0.4287656799452304, metrics={'train_runtime': 1895.7492, 'train_samples_per_second': 15.161, 'train_steps_per_second': 1.895, 'total_flos': 3474811750875648.0, 'train_loss': 0.4287656799452304, 'epoch': 1.0})

In [21]:
!pip install numpy



In [15]:
# Persist the fine-tuned model under ./trilytics_1.
# NOTE(review): only the model is saved here; the tokenizer is not written to this directory.
model.save_pretrained('trilytics_1')

In [19]:
import pandas as pd
# Re-read the test CSV as a pandas DataFrame for the manual evaluation below.
test_data = pd.read_csv('test.csv')

In [22]:
import numpy as np

In [20]:
# Plain Python list of the raw test texts, in file order.
test_texts = test_data['text'].to_list()

In [27]:
# Predict a class index for every test text, one example at a time.
#
# Nothing earlier in the notebook switches the model out of training mode, so
# dropout would still be active here and predictions would be non-deterministic.
# eval() disables it. no_grad() stops autograd from recording a graph for every
# forward pass, which only costs memory during inference.
model.eval()

a = []  # predicted label per test text; the name is kept because later cells read it
with torch.no_grad():
    for text_line in test_texts:
        encoded = tokenizer(text_line, truncation=True, return_tensors='pt').to(device)
        logits = model(**encoded).logits
        # Single-example batch: argmax over the label dimension yields one index.
        a.append(int(logits.argmax(dim=-1).item()))

In [28]:
# Sanity check: number of predictions produced by the inference loop.
len(a)

7186

In [29]:
# Ground-truth labels in the same order as test_texts; the displayed length
# should match the number of predictions.
truth_values = test_data["labels"].tolist()
len(truth_values)

7186

In [31]:
from sklearn.metrics import classification_report

In [32]:
# Per-class precision/recall/F1 on the test set.
# Some classes receive no predictions, which makes their precision undefined.
# zero_division=0 reports those scores as 0 explicitly (the same value the
# default falls back to) without emitting an undefined-metric warning per average.
print(classification_report(truth_values, a, zero_division=0))

              precision    recall  f1-score   support

           0       0.94      0.95      0.95      4497
           1       0.91      0.94      0.92      1455
           2       0.88      0.83      0.85       763
           3       0.00      0.00      0.00       146
           4       0.00      0.00      0.00        58
           5       0.00      0.00      0.00        50
           6       0.89      0.95      0.92       146
           7       0.00      0.00      0.00        33
           8       0.00      0.00      0.00        32
           9       0.00      0.00      0.00         4
          10       0.00      0.00      0.00         2

    accuracy                           0.89      7186
   macro avg       0.33      0.33      0.33      7186
weighted avg       0.89      0.89      0.89      7186



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
