In [28]:
import os
import typer
import pandas as pn
import numpy as np
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from datasets import load_metric, load_dataset
from sklearn.preprocessing import LabelEncoder

# Accuracy metric object; `compute_metrics` calls `metric.compute(...)` on it.
# NOTE(review): `datasets.load_metric` is deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package -- confirm against the
# installed version before upgrading.
metric = load_metric("accuracy")

In [29]:
# Checkpoint name used to load both the tokenizer and the classification model.
model_name = 'distilbert-base-uncased'

# CSV columns holding the input text and the class label, respectively.
text_column_name, label_column_name = 'text_bert', 'target'

# Train/test CSV splits (paths are relative to the working directory).
train_file, test_file = (
    '../filespace/splitted_dataset/train.csv',
    '../filespace/splitted_dataset/test.csv',
)

In [30]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: A two-item sequence ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns when the predicted class ids
        (arg-max over the last axis of the logits) are compared to ``labels``.
    """
    scores, references = eval_pred
    # The highest-scoring class along the last axis is the prediction.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)


In [31]:
# Read the training split with pandas; its label column is used to fit the
# label encoder below.
df = pn.read_csv(train_file, encoding='utf-8-sig')

# Fit the encoder on the training labels (`fit` returns the encoder itself).
le = LabelEncoder().fit(df[label_column_name].values)

# Encoded class id -> original label value, cast to int.
labels_mapping = dict(enumerate(map(int, le.classes_)))

# Load both CSV files as one dataset object with 'train' and 'test' splits.
dataset = load_dataset(
    'csv',
    data_files=dict(train=train_file, test=test_file),
    encoding='utf-8-sig',
)

# Tokenizer belonging to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the text column of ``examples`` with the module-level ``tokenizer``.

    Args:
        examples: Mapping that contains ``text_column_name``.

    Returns:
        The tokenizer output, produced with ``padding="max_length"`` and
        ``truncation=True``.
    """
    texts = examples[text_column_name]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

def transform_labels(label):
    """Map one example's raw label to its encoded class id.

    Args:
        label: A single example (mapping) that contains ``label_column_name``.

    Returns:
        ``{'labels': <id>}``, where the id is produced by the fitted
        module-level encoder ``le``.
    """
    raw_value = label[label_column_name]
    # The encoder works on sequences, so wrap the value and unwrap the result.
    encoded = le.transform([raw_value])
    return {'labels': encoded[0]}

Found cached dataset csv (/home/dmytro/.cache/huggingface/datasets/csv/default-5a8744bb6d1c4f32/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/2 [00:00<?, ?it/s]

In [32]:
# NOTE(review): the recorded run of this cell ended in `CUDA out of memory` on a
# GPU reporting ~978 MiB total capacity. If that hardware is the target,
# consider a smaller batch size, shorter sequences, or CPU training -- TODO
# confirm what fits.

# Keep only the text and raw-label columns; every other CSV column is dropped
# while the encoded `labels` column is added.
columns_to_remove = [
    column for column in df.columns
    if column not in (text_column_name, label_column_name)
]
dataset = dataset.map(transform_labels, remove_columns=columns_to_remove)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Uncomment the trailing calls to train/evaluate on a 100-example subset.
train_dataset = tokenized_datasets["train"]  # .shuffle(seed=42).select(range(100))
eval_dataset = tokenized_datasets["test"]  # .shuffle(seed=42).select(range(100))

# `label2id` is passed together with `id2label` so the two mappings in the
# model config stay inverse to each other; passing only `id2label` leaves
# `label2id` on the auto-generated "LABEL_<i>" names.
# NOTE(review): `dropout=0.5` is forwarded to the model config and is far above
# the usual default -- confirm it is intentional.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=int(df[label_column_name].nunique()),
    id2label=labels_mapping,
    label2id={label: class_id for class_id, label in labels_mapping.items()},
    dropout=0.5,
)

# Evaluate after every epoch; no checkpoints are written (`save_strategy='no'`).
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    num_train_epochs=4,
    save_strategy='no',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Loading cached processed dataset at /home/dmytro/.cache/huggingface/datasets/csv/default-5a8744bb6d1c4f32/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-d021ecc8de7241b4.arrow
Loading cached processed dataset at /home/dmytro/.cache/huggingface/datasets/csv/default-5a8744bb6d1c4f32/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-02a621e57c487a81.arrow
Loading cached processed dataset at /home/dmytro/.cache/huggingface/datasets/csv/default-5a8744bb6d1c4f32/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-60fe772f9a165153.arrow


Map:   0%|          | 0/1363 [00:00<?, ? examples/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifi

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 978.25 MiB total capacity; 218.19 MiB already allocated; 7.00 MiB free; 232.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF