# Valence Model

This Jupyter Notebook is dedicated to fine-tuning a model for detecting Valence.

In [1]:
!pip install transformers
!pip install torch
!pip install datasets
!pip install accelerate -U

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K 

In [2]:
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset, DatasetDict
from transformers import pipeline
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
from transformers import TrainingArguments
from sklearn.model_selection import KFold

In [3]:
df = pd.read_csv('../Dataset/Training_Emotions.csv')
df = df.drop('Unnamed: 0', axis=1)
df = df.drop('fake', axis=1)
df.head()

Unnamed: 0,id,tweet,valence,arousal
0,126727,"Andrew Brigden MP is clearly a virologist, sta...",negative,medium
1,98481,@RepMattGaetz I do not understand How trump lo...,negative,medium
2,45066,Only four nations allow elective abortions aft...,negative,low
3,62614,@brhodes Agreed. It's time to stop demonizing ...,negative,medium
4,15038,@shortiemagee New York State passed a law mand...,positive,low


### Prepare Data

In [4]:
def valence_vectorized(valence_series):
    valence_map = {'positive': 0, 'neutral': 1, 'negative': 2}
    valence_values = valence_series.map(valence_map)
    if valence_values.isnull().any() :
        raise ValueError("Invalid valence category found")
    return valence_values

In [5]:
def decode_vectorized(series):
    valence_map = {0: 'positive', 1: 'neutral', 2: 'negative'}
    valence = series.map(valence_map)
    return pd.DataFrame({'valence': valence})

In [6]:
print("Unique valence values:", df['valence'].unique())


Unique valence values: ['negative' 'positive' 'neutral' 'mixed']


In [7]:
df = df[df['valence'] != 'mixed']
df['label'] = valence_vectorized(df['valence'])
df = df[['id','tweet','label']]
df['label'].value_counts()
df['label'].unique()
df = df.rename(columns={'tweet': 'text'})
df['label'] = df['label'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = valence_vectorized(df['valence'])


In [8]:
train_val_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

### Define Model

In [9]:
ROBERTA = "FacebookAI/xlm-roberta-base"

def tokenizer(checkpoint):
    return AutoTokenizer.from_pretrained(checkpoint)

tokenizer_RoBERTa_uncased = tokenizer(ROBERTA)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]



In [10]:
def tokenize_function(batch, tokenizer):
    return tokenizer(batch["text"].tolist(), truncation=True)

In [11]:
data_collator_RoBERTa_uncased = DataCollatorWithPadding(tokenizer=tokenizer_RoBERTa_uncased, return_tensors="pt")

In [12]:
model_RoBERTa_uncased = AutoModelForSequenceClassification.from_pretrained(ROBERTA, num_labels=3, ignore_mismatched_sizes=True)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    f1 = f1_score(labels, preds, average="macro")
    acc = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average="macro")
    recall = recall_score(labels, preds, average="macro")

    return {"precision": precision, "recall": recall, "acc": acc, "f1": f1}

In [14]:
fold_results = []
model_name = 'Valence_Model'
batch_size = 8
num_labels = 3
for fold, (train_index, val_index) in enumerate(kf.split(train_val_df), 1):
    print(f"Training Fold {fold}")

    train_df = train_val_df.iloc[train_index]
    val_df = train_val_df.iloc[val_index]
    train_df = train_df.drop(columns=['id'])
    val_df = val_df.drop(columns=['id'])

    tokenized_datasets_RoBERTa_uncased = {
        "train": Dataset.from_dict(tokenize_function(train_df, tokenizer_RoBERTa_uncased)),
        "val": Dataset.from_dict(tokenize_function(val_df, tokenizer_RoBERTa_uncased))
    }

    tokenized_datasets_RoBERTa_uncased["train"] = tokenized_datasets_RoBERTa_uncased["train"].add_column("labels", train_df["label"].tolist())
    tokenized_datasets_RoBERTa_uncased["val"] = tokenized_datasets_RoBERTa_uncased["val"].add_column("labels", val_df["label"].tolist())


    model_RoBERTa_uncased = AutoModelForSequenceClassification.from_pretrained(ROBERTA, num_labels=num_labels)


    logging_steps = len(tokenized_datasets_RoBERTa_uncased["train"]) // batch_size


    training_args = TrainingArguments(
        output_dir=f"{model_name}_fold_{fold}",
        num_train_epochs=3,
        learning_rate=1e-5,
        save_strategy="epoch",
        save_total_limit=1,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        evaluation_strategy="epoch",
        disable_tqdm=False,
        logging_steps=logging_steps,
        log_level="info",
        label_smoothing_factor=0.1,
        weight_decay=0.005,
        lr_scheduler_type='cosine',
        warmup_steps=200
    )

    trainer = Trainer(
        model=model_RoBERTa_uncased,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=tokenized_datasets_RoBERTa_uncased["train"],
        eval_dataset=tokenized_datasets_RoBERTa_uncased["val"],
        data_collator=data_collator_RoBERTa_uncased,
        tokenizer=tokenizer_RoBERTa_uncased,
    )

    trainer.train()

    eval_results = trainer.evaluate()
    fold_results.append(eval_results)

    print(f"Fold {fold} Results:", eval_results)
    print("\n")

avg_results = {key: np.mean([res[key] for res in fold_results]) for key in fold_results[0].keys()}
print("Average Results across all Folds:", avg_results)

Training Fold 1


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
***** Running training *****
  Num examples = 35,988
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 13,497
  Number of trainable parameters = 278,045,955


Epoch,Training Loss,Validation Loss,Precision,Recall,Acc,F1
1,0.6307,0.577979,0.755854,0.668483,0.845743,0.703605
2,0.5328,0.59063,0.717511,0.724985,0.844299,0.720381
3,0.4737,0.632549,0.724978,0.715557,0.844743,0.720084



***** Running Evaluation *****
  Num examples = 8998
  Batch size = 8
Saving model checkpoint to Valence_Model_fold_1/checkpoint-4499
Configuration saved in Valence_Model_fold_1/checkpoint-4499/config.json
Model weights saved in Valence_Model_fold_1/checkpoint-4499/model.safetensors
tokenizer config file saved in Valence_Model_fold_1/checkpoint-4499/tokenizer_config.json
Special tokens file saved in Valence_Model_fold_1/checkpoint-4499/special_tokens_map.json

***** Running Evaluation *****
  Num examples = 8998
  Batch size = 8
Saving model checkpoint to Valence_Model_fold_1/checkpoint-8998
Configuration saved in Valence_Model_fold_1/checkpoint-8998/config.json
Model weights saved in Valence_Model_fold_1/checkpoint-8998/model.safetensors
tokenizer config file saved in Valence_Model_fold_1/checkpoint-8998/tokenizer_config.json
Special tokens file saved in Valence_Model_fold_1/checkpoint-8998/special_tokens_map.json
Deleting older checkpoint [Valence_Model_fold_1/checkpoint-4499] due t

Fold 1 Results: {'eval_loss': 0.6325486898422241, 'eval_precision': 0.7249779360343659, 'eval_recall': 0.7155571676063658, 'eval_acc': 0.8447432762836186, 'eval_f1': 0.720083734326234, 'eval_runtime': 16.8846, 'eval_samples_per_second': 532.911, 'eval_steps_per_second': 66.629, 'epoch': 3.0}


Training Fold 2


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--FacebookAI--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "FacebookAI/xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.2",
  "type_vocab_

Epoch,Training Loss,Validation Loss,Precision,Recall,Acc,F1
1,0.6374,0.606333,0.763864,0.626681,0.837724,0.671727
2,0.5356,0.610122,0.722128,0.723696,0.835612,0.721532
3,0.4774,0.641887,0.726588,0.726761,0.844837,0.726659



***** Running Evaluation *****
  Num examples = 8997
  Batch size = 8
Saving model checkpoint to Valence_Model_fold_2/checkpoint-4499
Configuration saved in Valence_Model_fold_2/checkpoint-4499/config.json
Model weights saved in Valence_Model_fold_2/checkpoint-4499/model.safetensors
tokenizer config file saved in Valence_Model_fold_2/checkpoint-4499/tokenizer_config.json
Special tokens file saved in Valence_Model_fold_2/checkpoint-4499/special_tokens_map.json

***** Running Evaluation *****
  Num examples = 8997
  Batch size = 8
Saving model checkpoint to Valence_Model_fold_2/checkpoint-8998
Configuration saved in Valence_Model_fold_2/checkpoint-8998/config.json
Model weights saved in Valence_Model_fold_2/checkpoint-8998/model.safetensors
tokenizer config file saved in Valence_Model_fold_2/checkpoint-8998/tokenizer_config.json
Special tokens file saved in Valence_Model_fold_2/checkpoint-8998/special_tokens_map.json
Deleting older checkpoint [Valence_Model_fold_2/checkpoint-4499] due t

Fold 2 Results: {'eval_loss': 0.6418867111206055, 'eval_precision': 0.7265880406194318, 'eval_recall': 0.7267606485793969, 'eval_acc': 0.8448371679448705, 'eval_f1': 0.7266590189111026, 'eval_runtime': 17.4206, 'eval_samples_per_second': 516.457, 'eval_steps_per_second': 64.579, 'epoch': 3.0}


Training Fold 3


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--FacebookAI--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "FacebookAI/xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.2",
  "type_vocab_

Epoch,Training Loss,Validation Loss,Precision,Recall,Acc,F1
1,0.6392,0.556386,0.741149,0.701649,0.84895,0.719338
2,0.5346,0.590669,0.739846,0.720019,0.854285,0.726616
3,0.4756,0.619696,0.75006,0.73633,0.859175,0.742739



***** Running Evaluation *****
  Num examples = 8997
  Batch size = 8
Saving model checkpoint to Valence_Model_fold_3/checkpoint-4499
Configuration saved in Valence_Model_fold_3/checkpoint-4499/config.json
Model weights saved in Valence_Model_fold_3/checkpoint-4499/model.safetensors
tokenizer config file saved in Valence_Model_fold_3/checkpoint-4499/tokenizer_config.json
Special tokens file saved in Valence_Model_fold_3/checkpoint-4499/special_tokens_map.json

***** Running Evaluation *****
  Num examples = 8997
  Batch size = 8
Saving model checkpoint to Valence_Model_fold_3/checkpoint-8998
Configuration saved in Valence_Model_fold_3/checkpoint-8998/config.json
Model weights saved in Valence_Model_fold_3/checkpoint-8998/model.safetensors
tokenizer config file saved in Valence_Model_fold_3/checkpoint-8998/tokenizer_config.json
Special tokens file saved in Valence_Model_fold_3/checkpoint-8998/special_tokens_map.json
Deleting older checkpoint [Valence_Model_fold_3/checkpoint-4499] due t

Fold 3 Results: {'eval_loss': 0.6196959614753723, 'eval_precision': 0.750059935108947, 'eval_recall': 0.7363300514914725, 'eval_acc': 0.8591752806491053, 'eval_f1': 0.7427388367687237, 'eval_runtime': 16.7805, 'eval_samples_per_second': 536.157, 'eval_steps_per_second': 67.042, 'epoch': 3.0}


Training Fold 4


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--FacebookAI--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "FacebookAI/xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.2",
  "type_vocab_

Epoch,Training Loss,Validation Loss,Precision,Recall,Acc,F1
1,0.6613,0.585263,0.747473,0.66254,0.840391,0.697079
2,0.5512,0.58759,0.734171,0.702758,0.846949,0.716953
3,0.4969,0.628014,0.72717,0.712514,0.846504,0.719083



***** Running Evaluation *****
  Num examples = 8997
  Batch size = 8
Saving model checkpoint to Valence_Model_fold_4/checkpoint-4499
Configuration saved in Valence_Model_fold_4/checkpoint-4499/config.json
Model weights saved in Valence_Model_fold_4/checkpoint-4499/model.safetensors
tokenizer config file saved in Valence_Model_fold_4/checkpoint-4499/tokenizer_config.json
Special tokens file saved in Valence_Model_fold_4/checkpoint-4499/special_tokens_map.json

***** Running Evaluation *****
  Num examples = 8997
  Batch size = 8
Saving model checkpoint to Valence_Model_fold_4/checkpoint-8998
Configuration saved in Valence_Model_fold_4/checkpoint-8998/config.json
Model weights saved in Valence_Model_fold_4/checkpoint-8998/model.safetensors
tokenizer config file saved in Valence_Model_fold_4/checkpoint-8998/tokenizer_config.json
Special tokens file saved in Valence_Model_fold_4/checkpoint-8998/special_tokens_map.json
Deleting older checkpoint [Valence_Model_fold_4/checkpoint-4499] due t

Fold 4 Results: {'eval_loss': 0.628013551235199, 'eval_precision': 0.7271701352011103, 'eval_recall': 0.712514143939841, 'eval_acc': 0.8465043903523397, 'eval_f1': 0.7190834261472148, 'eval_runtime': 17.0095, 'eval_samples_per_second': 528.94, 'eval_steps_per_second': 66.14, 'epoch': 3.0}


Training Fold 5


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--FacebookAI--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "FacebookAI/xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.2",
  "type_vocab_

Epoch,Training Loss,Validation Loss,Precision,Recall,Acc,F1
1,0.6363,0.5712,0.717631,0.719537,0.837057,0.71856
2,0.5349,0.592588,0.716036,0.735232,0.843059,0.724642
3,0.476,0.618196,0.743757,0.734366,0.851951,0.738967



***** Running Evaluation *****
  Num examples = 8997
  Batch size = 8
Saving model checkpoint to Valence_Model_fold_5/checkpoint-4499
Configuration saved in Valence_Model_fold_5/checkpoint-4499/config.json
Model weights saved in Valence_Model_fold_5/checkpoint-4499/model.safetensors
tokenizer config file saved in Valence_Model_fold_5/checkpoint-4499/tokenizer_config.json
Special tokens file saved in Valence_Model_fold_5/checkpoint-4499/special_tokens_map.json

***** Running Evaluation *****
  Num examples = 8997
  Batch size = 8
Saving model checkpoint to Valence_Model_fold_5/checkpoint-8998
Configuration saved in Valence_Model_fold_5/checkpoint-8998/config.json
Model weights saved in Valence_Model_fold_5/checkpoint-8998/model.safetensors
tokenizer config file saved in Valence_Model_fold_5/checkpoint-8998/tokenizer_config.json
Special tokens file saved in Valence_Model_fold_5/checkpoint-8998/special_tokens_map.json
Deleting older checkpoint [Valence_Model_fold_5/checkpoint-4499] due t

Fold 5 Results: {'eval_loss': 0.6181963086128235, 'eval_precision': 0.743757001571541, 'eval_recall': 0.7343657269000845, 'eval_acc': 0.8519506502167389, 'eval_f1': 0.7389670620585779, 'eval_runtime': 17.2277, 'eval_samples_per_second': 522.242, 'eval_steps_per_second': 65.302, 'epoch': 3.0}


Average Results across all Folds: {'eval_loss': 0.6280682444572449, 'eval_precision': 0.7345106097070793, 'eval_recall': 0.7251055477034322, 'eval_acc': 0.8494421530893346, 'eval_f1': 0.7295064156423706, 'eval_runtime': 17.06458, 'eval_samples_per_second': 527.3414, 'eval_steps_per_second': 65.9384, 'epoch': 3.0}


In [15]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    f1 = f1_score(labels, preds, average="macro")
    acc = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average="macro")
    recall = recall_score(labels, preds, average="macro")

    return {"precision": precision, "recall": recall, "acc": acc, "f1": f1}

In [19]:
import numpy as np
from sklearn.model_selection import train_test_split


print("Training Final Model on Entire Dataset")

full_train_df = train_val_df.drop(columns=['id'])

train_data, val_data = train_test_split(full_train_df, test_size=0.1, random_state=42)


tokenized_dataset_full = Dataset.from_dict(tokenize_function(train_data, tokenizer_RoBERTa_uncased))
tokenized_dataset_full = tokenized_dataset_full.add_column("labels", train_data["label"].tolist())

tokenized_dataset_eval = Dataset.from_dict(tokenize_function(val_data, tokenizer_RoBERTa_uncased))
tokenized_dataset_eval = tokenized_dataset_eval.add_column("labels", val_data["label"].tolist())



final_model = AutoModelForSequenceClassification.from_pretrained(ROBERTA, num_labels=num_labels)

final_training_args = TrainingArguments(
    output_dir=f"{model_name}_fold_{fold}",
        num_train_epochs=3,
        learning_rate=1e-5,
        save_strategy="epoch",
        save_total_limit=1,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        evaluation_strategy="epoch",
        disable_tqdm=False,
        logging_steps=logging_steps,
        log_level="info",
        label_smoothing_factor=0.1,
        weight_decay=0.005,
        lr_scheduler_type='cosine',
        warmup_steps=200
)

final_trainer = Trainer(
    model=final_model,
    args=final_training_args,
    compute_metrics = compute_metrics,
    train_dataset=tokenized_dataset_full,
    eval_dataset=tokenized_dataset_eval,  
    tokenizer=tokenizer_RoBERTa_uncased,
)

final_trainer.train()

final_trainer.save_model(f"{model_name}_final")

print("Final Model Training Completed")

print("Evaluating Final Model on Test Set")

test_df = test_df.drop(columns=['id'])  # Assuming you have a test_df
tokenized_test_dataset = Dataset.from_dict(tokenize_function(test_df, tokenizer_RoBERTa_uncased))
tokenized_test_dataset = tokenized_test_dataset.add_column("labels", test_df["label"].tolist())

test_results = final_trainer.evaluate(eval_dataset=tokenized_test_dataset)

print("Test Set Evaluation Results:")
for key, value in test_results.items():
    print(f"{key}: {value}")

predictions = final_trainer.predict(test_dataset=tokenized_test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

Training Final Model on Entire Dataset


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--FacebookAI--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "FacebookAI/xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.2",
  "type_vocab_

Epoch,Training Loss,Validation Loss,Precision,Recall,Acc,F1
1,0.6385,0.5921,0.713612,0.684117,0.834852,0.693678
2,0.5444,0.575558,0.723132,0.714478,0.847299,0.717505
3,0.4892,0.634695,0.712443,0.714025,0.843743,0.713156



***** Running Evaluation *****
  Num examples = 4499
  Batch size = 8
Saving model checkpoint to Valence_Model_fold_5/checkpoint-5061
Configuration saved in Valence_Model_fold_5/checkpoint-5061/config.json
Model weights saved in Valence_Model_fold_5/checkpoint-5061/model.safetensors
tokenizer config file saved in Valence_Model_fold_5/checkpoint-5061/tokenizer_config.json
Special tokens file saved in Valence_Model_fold_5/checkpoint-5061/special_tokens_map.json
Deleting older checkpoint [Valence_Model_fold_5/checkpoint-5061] due to args.save_total_limit

***** Running Evaluation *****
  Num examples = 4499
  Batch size = 8
Saving model checkpoint to Valence_Model_fold_5/checkpoint-10122
Configuration saved in Valence_Model_fold_5/checkpoint-10122/config.json
Model weights saved in Valence_Model_fold_5/checkpoint-10122/model.safetensors
tokenizer config file saved in Valence_Model_fold_5/checkpoint-10122/tokenizer_config.json
Special tokens file saved in Valence_Model_fold_5/checkpoint-1

Final Model Training Completed
Evaluating Final Model on Test Set



***** Running Evaluation *****
  Num examples = 4999
  Batch size = 8



***** Running Prediction *****
  Num examples = 4999
  Batch size = 8


Test Set Evaluation Results:
eval_loss: 0.5900675058364868
eval_precision: 0.7490561168212185
eval_recall: 0.7441107374962564
eval_acc: 0.864372874574915
eval_f1: 0.746557706668308
eval_runtime: 9.2603
eval_samples_per_second: 539.831
eval_steps_per_second: 67.492
epoch: 3.0
