# 1. Importing Prepared Data

In [1]:
import pandas as pd
import numpy as np
# Load the prepared spreadsheet, drop the stray exported index column and
# rename the target column to 'label'. Built as one non-mutating chain so the
# cell gives the same result every time it is re-run.
data = (
    pd.read_excel('data/prepared_data.xlsx')
    .drop(columns='Unnamed: 0')
    .rename(columns={'class': 'label'})
)

# factorize() numbers the classes in order of first appearance.
# Keep the uniques (label_classes[i] is the original class for code i) instead
# of discarding them, so the integer codes can be checked against any
# hand-written id -> label mapping. Not binding `_` also avoids shadowing
# IPython's last-output variable.
label_codes, label_classes = data['label'].factorize()
data['label'] = label_codes

In [2]:
import pandas as pd
import numpy as np
# Read the prepared CSV and shuffle all rows with a fixed seed, then renumber
# the index from 0 so positional slicing further down is well defined.
data = (
    pd.read_csv('prepared1.csv')
    .sample(frac=1, random_state=14)
    .reset_index(drop=True)
)

In [2]:
import pyarrow as pa
import pyarrow.dataset as ds
from datasets import Dataset

# Number of leading rows of `data` used for training; every remaining row
# goes to the test split. Defined once so the four datasets below cannot
# end up using different boundaries.
TRAIN_SIZE = 10000

train_df = data.iloc[:TRAIN_SIZE]
test_df = data.iloc[TRAIN_SIZE:]
# Fail early if the frame is too small to produce both splits.
assert len(train_df) > 0 and len(test_df) > 0, "train/test split must both be non-empty"

train_dataset = ds.dataset(pa.Table.from_pandas(train_df).to_batches())
### convert train to Huggingface dataset
hg_train_dataset = Dataset.from_pandas(train_df)


test_dataset = ds.dataset(pa.Table.from_pandas(test_df).to_batches())
### convert test to Huggingface dataset
hg_test_dataset = Dataset.from_pandas(test_df)

# 2. Transformer Model

In [3]:
from transformers import AutoTokenizer
# Tokenizer that belongs to the pretrained checkpoint being fine-tuned.
tokenizer_checkpoint = 'cointegrated/rubert-tiny-sentiment-balanced'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [4]:
def preprocess_function(examples):
    """Tokenize a batch of examples.

    Args:
        examples: Mapping whose 'text' entry holds the raw strings to
            tokenize (a whole batch when used with ``map(..., batched=True)``).

    Returns:
        The output of the module-level ``tokenizer`` for those texts, with
        over-long sequences truncated.
    """
    # Deliberately no padding here: padding inside a batched map() pads every
    # example to the longest text of its map chunk and stores those pad tokens.
    # Padding is left to the DataCollatorWithPadding handed to the Trainer,
    # which pads per training batch instead.
    return tokenizer(examples['text'], truncation=True)

In [5]:
# Tokenize both splits in batches; the on-disk map cache is bypassed so the
# tokenization is recomputed on every run.
map_options = dict(batched=True, load_from_cache_file=False)
tokenized_train = hg_train_dataset.map(preprocess_function, **map_options)
tokenized_test = hg_test_dataset.map(preprocess_function, **map_options)

# Expose only the columns listed here, returned as torch tensors.
columns_to_return = ['input_ids', 'label', 'attention_mask']
for tokenized_split in (tokenized_train, tokenized_test):
    tokenized_split.set_format(type='torch', columns=columns_to_return)

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

In [6]:
# Sanity check: label of the first training example (a torch scalar).
tokenized_train[0]['label']

tensor(4)

In [7]:
from transformers import DataCollatorWithPadding
# Collator used by the Trainer to assemble batches; padding=True asks it to
# pad to the longest sequence in each batch (per the transformers docs).
data_collator = DataCollatorWithPadding(
    padding=True,
    tokenizer=tokenizer,
)

In [8]:
import numpy as np
import evaluate

# Metric object loaded once and reused on every evaluation pass.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. The predicted class of
            each row is taken as the argmax of ``predictions`` along axis 1.

    Returns:
        The result of ``accuracy.compute`` for the predicted class ids
        against ``labels``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [9]:
# Class names listed in id order; both lookup tables are derived from this
# single list so they are always exact inverses of each other.
# NOTE(review): the ids must match the integer codes stored in the 'label'
# column - confirm against the label-encoding step.
class_names = ["Z", "M", "J", "U", "E", "K", "I"]
id2label = dict(enumerate(class_names))
label2id = {name: idx for idx, name in id2label.items()}

In [10]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the pretrained encoder with a classification head sized to our label
# set. The checkpoint ships a 3-class head, so ignore_mismatched_sizes=True is
# needed to let the head be re-initialised with the new shape.
model = AutoModelForSequenceClassification.from_pretrained(
    "cointegrated/rubert-tiny-sentiment-balanced",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny-sentiment-balanced and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 312]) in the checkpoint and torch.Size([7, 312]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # where checkpoints go; nothing is pushed to the hub
    output_dir="my_awesome_model",
    push_to_hub=False,
    # optimisation
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluate and checkpoint once per epoch, then reload the best checkpoint
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # pass dataset columns through to the collator untouched
    remove_unused_columns=False,
)

In [12]:
# Wire model, configuration, data and metric together for fine-tuning.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [13]:
# Fine-tune the classifier. The run recorded in the output below reached
# 0.750 evaluation accuracy after epoch 2; the "0.83 ~ 0.84" note presumably
# refers to an earlier run or a different configuration - TODO confirm.
trainer.train() # 0.83 ~ 0.84

***** Running training *****
  Num examples = 10000
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1250
  Number of trainable parameters = 11786359
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.464,0.980657,0.706147
2,0.9865,0.837866,0.750125


***** Running Evaluation *****
  Num examples = 4002
  Batch size = 16
Saving model checkpoint to my_awesome_model\checkpoint-625
Configuration saved in my_awesome_model\checkpoint-625\config.json
Model weights saved in my_awesome_model\checkpoint-625\pytorch_model.bin
tokenizer config file saved in my_awesome_model\checkpoint-625\tokenizer_config.json
Special tokens file saved in my_awesome_model\checkpoint-625\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4002
  Batch size = 16
Saving model checkpoint to my_awesome_model\checkpoint-1250
Configuration saved in my_awesome_model\checkpoint-1250\config.json
Model weights saved in my_awesome_model\checkpoint-1250\pytorch_model.bin
tokenizer config file saved in my_awesome_model\checkpoint-1250\tokenizer_config.json
Special tokens file saved in my_awesome_model\checkpoint-1250\special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from my_aw

TrainOutput(global_step=1250, training_loss=1.1613040893554687, metrics={'train_runtime': 280.3788, 'train_samples_per_second': 71.332, 'train_steps_per_second': 4.458, 'total_flos': 147580293120000.0, 'train_loss': 1.1613040893554687, 'epoch': 2.0})

Reference: Hugging Face Transformers sequence-classification guide — https://huggingface.co/docs/transformers/tasks/sequence_classification