## Model 1: baseline BERT value classifier trained on sentences only (no emotion features)

In [None]:
# Install necessary packages
!pip install transformers scikit-learn -q

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import f1_score, classification_report
import json
from tqdm import tqdm
import time

# Reproducibility: seed torch's global RNG before the model is built and the
# loaders are created, so the newly initialised classifier head and the shuffled
# batch order are repeatable as far as that RNG controls them.
SEED = 42
torch.manual_seed(SEED)

# Load data (TSV files read from the mounted Google Drive path)
train_df = pd.read_csv("/content/drive/MyDrive/value_train.tsv", sep="\t")
val_df = pd.read_csv("/content/drive/MyDrive/value_val.tsv", sep="\t")

# Reduce training set size for faster runs.
# The request is capped at the number of available rows because
# DataFrame.sample raises ValueError when n exceeds the frame length
# (sampling is without replacement by default).
MAX_TRAIN_ROWS = 20000
train_df = train_df.sample(
    n=min(MAX_TRAIN_ROWS, len(train_df)), random_state=SEED
).reset_index(drop=True)

# Load label columns; the loaded value is used below to select the target
# columns of both frames and to size the classification head.
with open("/content/drive/MyDrive/value_label_columns.json") as f:
    label_columns = json.load(f)

# Select input column (sentence only)
train_texts = train_df["input_sentence_only"].tolist()
val_texts = val_df["input_sentence_only"].tolist()

# One row of targets per example, one column per label.
train_labels = train_df[label_columns].values
val_labels = val_df[label_columns].values

# Tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_columns),
    problem_type="multi_label_classification"
)

# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Dataset class
class ValueDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label row.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of per-example label rows; each row is
            converted to a float tensor.
        tokenizer: Optional callable used to encode a text. When omitted, the
            notebook-level ``tokenizer`` variable is looked up at item-access
            time, which is the original behaviour.
        max_length: Length every encoded sequence is padded/truncated to
            (defaults to the previously hard-coded 128).
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``labels`` for example ``idx``."""
        # Resolve lazily so the global `tokenizer` is still used when none was injected.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer

        text = self.texts[idx]
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        encoding = encode(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # return_tensors="pt" adds a leading batch axis of size 1; drop only that
        # axis (a bare squeeze() would also remove any other size-1 dimension).
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': label
        }

# DataLoaders: training batches are reshuffled every epoch, validation order is fixed.
train_ds = ValueDataset(train_texts, train_labels)
val_ds = ValueDataset(val_texts, val_labels)
train_dl = DataLoader(train_ds, batch_size=8, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=8)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop (3 epochs)
epochs = 3
for epoch in range(epochs):
    # Re-assert training mode each epoch so the loop stays correct even if an
    # evaluation pass is later inserted between epochs.
    model.train()
    start = time.time()
    total_loss = 0.0
    for batch in tqdm(train_dl, desc=f"Epoch {epoch+1}"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing `labels` makes the model return the training loss with the logits.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # The summed loss grows with the number of batches, so also report the
    # per-batch mean, which is comparable across dataset and batch sizes.
    mean_loss = total_loss / max(len(train_dl), 1)
    elapsed_min = round((time.time() - start) / 60, 2)
    print(
        f"\nEpoch {epoch+1} loss: {total_loss:.4f} "
        f"(mean per batch: {mean_loss:.4f}) | Time: {elapsed_min} min"
    )

# Evaluation
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for batch in val_dl:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].cpu().numpy()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # The model returns raw logits, not probabilities. Apply the sigmoid
        # before the 0.5 cut-off; thresholding the logits at 0.5 would
        # correspond to a probability threshold of about 0.62 and suppress recall.
        probs = torch.sigmoid(outputs.logits).cpu().numpy()
        preds = (probs > 0.5).astype(int)
        y_true.extend(labels)
        y_pred.extend(preds)

# sklearn's multi-label metrics expect integer indicator arrays
y_true = np.array(y_true).astype(int)
y_pred = np.array(y_pred).astype(int)

# Per-label precision / recall / F1 on the validation set
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=label_columns, zero_division=0))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 2500/2500 [08:10<00:00,  5.10it/s]



Epoch 1 loss: 326.4446 | Time: 8.17 min


Epoch 2: 100%|██████████| 2500/2500 [08:09<00:00,  5.11it/s]



Epoch 2 loss: 169.1792 | Time: 8.15 min


Epoch 3: 100%|██████████| 2500/2500 [08:09<00:00,  5.11it/s]



Epoch 3 loss: 105.6821 | Time: 8.15 min

Classification Report:
                                        precision    recall  f1-score   support

                  Achievement attained       0.98      0.55      0.71      1930
               Achievement constrained       1.00      0.09      0.17      1433
          Benevolence: caring attained       0.62      0.27      0.38      1917
       Benevolence: caring constrained       0.94      0.97      0.96      3256
   Benevolence: dependability attained       0.92      0.23      0.36      1724
Benevolence: dependability constrained       0.82      0.81      0.81      2691
    Conformity: interpersonal attained       0.94      0.58      0.72      2436
 Conformity: interpersonal constrained       0.71      0.69      0.70      2051
            Conformity: rules attained       0.00      0.00      0.00      1122
         Conformity: rules constrained       0.96      0.39      0.55      1598
                         Face attained       0.95     

In [None]:
# Write the fine-tuned baseline model and its tokenizer into the same directory.
# The tokenizer call stays last so its return value is shown as the cell output.
baseline_dir = "/content/model1_baseline"
model.save_pretrained(baseline_dir)
tokenizer.save_pretrained(baseline_dir)


('/content/model1_baseline/tokenizer_config.json',
 '/content/model1_baseline/special_tokens_map.json',
 '/content/model1_baseline/vocab.txt',
 '/content/model1_baseline/added_tokens.json')

In [None]:
import shutil
from google.colab import files

# Pack the saved model directory into a zip archive next to it;
# make_archive appends the ".zip" extension to the base name.
baseline_dir = "/content/model1_baseline"
shutil.make_archive(baseline_dir, 'zip', baseline_dir)

# Hand the archive to the browser as a download
files.download("/content/model1_baseline.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Model 2: bert-emotion-enhanced-value-classifier

In [None]:
# Install necessary packages
!pip install transformers scikit-learn -q

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import f1_score, classification_report, accuracy_score
import json
from tqdm import tqdm
import time

# Reproducibility: seed torch's global RNG before the model is built and the
# loaders are created, so the newly initialised classifier head and the shuffled
# batch order are repeatable as far as that RNG controls them.
SEED = 42
torch.manual_seed(SEED)

# Load data (TSV files read from the mounted Google Drive path)
train_df = pd.read_csv("/content/drive/MyDrive/value_train.tsv", sep="\t")
val_df = pd.read_csv("/content/drive/MyDrive/value_val.tsv", sep="\t")

# Reduce training set size for faster runs.
# The request is capped at the number of available rows because
# DataFrame.sample raises ValueError when n exceeds the frame length
# (sampling is without replacement by default).
MAX_TRAIN_ROWS = 20000
train_df = train_df.sample(
    n=min(MAX_TRAIN_ROWS, len(train_df)), random_state=SEED
).reset_index(drop=True)

# Load label columns; the loaded value is used below to select the target
# columns of both frames and to size the classification head.
with open("/content/drive/MyDrive/value_label_columns.json") as f:
    label_columns = json.load(f)

# Select input column (sentence + predicted emotions)
train_texts = train_df["input_with_emotions"].tolist()
val_texts = val_df["input_with_emotions"].tolist()

# One row of targets per example, one column per label.
train_labels = train_df[label_columns].values
val_labels = val_df[label_columns].values

# Tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_columns),
    problem_type="multi_label_classification"
)

# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Dataset class
class ValueDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label row.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of per-example label rows; each row is
            converted to a float tensor.
        tokenizer: Optional callable used to encode a text. When omitted, the
            notebook-level ``tokenizer`` variable is looked up at item-access
            time, which is the original behaviour.
        max_length: Length every encoded sequence is padded/truncated to
            (defaults to the previously hard-coded 128).
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``labels`` for example ``idx``."""
        # Resolve lazily so the global `tokenizer` is still used when none was injected.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer

        text = self.texts[idx]
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        encoding = encode(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # return_tensors="pt" adds a leading batch axis of size 1; drop only that
        # axis (a bare squeeze() would also remove any other size-1 dimension).
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': label
        }

# DataLoaders: training batches are reshuffled every epoch, validation order is fixed.
train_ds = ValueDataset(train_texts, train_labels)
val_ds = ValueDataset(val_texts, val_labels)
train_dl = DataLoader(train_ds, batch_size=8, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=8)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop (3 epochs)
epochs = 3
for epoch in range(epochs):
    # Re-assert training mode each epoch so the loop stays correct even if an
    # evaluation pass is later inserted between epochs.
    model.train()
    start = time.time()
    total_loss = 0.0
    for batch in tqdm(train_dl, desc=f"Epoch {epoch+1}"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing `labels` makes the model return the training loss with the logits.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # The summed loss grows with the number of batches, so also report the
    # per-batch mean, which is comparable across dataset and batch sizes.
    mean_loss = total_loss / max(len(train_dl), 1)
    elapsed_min = round((time.time() - start) / 60, 2)
    print(
        f"\nEpoch {epoch+1} loss: {total_loss:.4f} "
        f"(mean per batch: {mean_loss:.4f}) | Time: {elapsed_min} min"
    )

# Evaluation
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for batch in tqdm(val_dl, desc="Evaluating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].cpu().numpy()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # The model returns raw logits, not probabilities. Apply the sigmoid
        # before the 0.5 cut-off; thresholding the logits at 0.5 would
        # correspond to a probability threshold of about 0.62 and suppress recall.
        probs = torch.sigmoid(outputs.logits).cpu().numpy()
        preds = (probs > 0.5).astype(int)
        y_true.extend(labels)
        y_pred.extend(preds)

# Convert to int (sklearn's multi-label metrics expect integer indicator arrays)
y_true = np.array(y_true).astype(int)
y_pred = np.array(y_pred).astype(int)

# Accuracy (subset accuracy): an example counts only if every label matches
exact_match_accuracy = accuracy_score(y_true, y_pred)
print(f"\nSubset Accuracy: {exact_match_accuracy:.4f}")

# Classification report
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=label_columns, zero_division=0))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 2500/2500 [08:12<00:00,  5.08it/s]



Epoch 1 loss: 316.5886 | Time: 8.21 min


Epoch 2: 100%|██████████| 2500/2500 [08:11<00:00,  5.09it/s]



Epoch 2 loss: 159.5377 | Time: 8.19 min


Epoch 3: 100%|██████████| 2500/2500 [08:11<00:00,  5.08it/s]



Epoch 3 loss: 100.4813 | Time: 8.2 min


Evaluating: 100%|██████████| 12049/12049 [11:50<00:00, 16.95it/s]



Subset Accuracy: 0.6965

Classification Report:
                                        precision    recall  f1-score   support

                  Achievement attained       0.94      0.52      0.67      1930
               Achievement constrained       1.00      0.23      0.37      1433
          Benevolence: caring attained       0.89      0.40      0.55      1917
       Benevolence: caring constrained       0.83      0.99      0.90      3256
   Benevolence: dependability attained       0.88      0.37      0.52      1724
Benevolence: dependability constrained       0.85      0.81      0.83      2691
    Conformity: interpersonal attained       0.96      0.66      0.78      2436
 Conformity: interpersonal constrained       0.72      0.65      0.68      2051
            Conformity: rules attained       0.00      0.00      0.00      1122
         Conformity: rules constrained       0.98      0.45      0.61      1598
                         Face attained       0.87      0.78      0.82 

In [None]:
# Write the fine-tuned emotion-enhanced model and its tokenizer into the same directory.
# The tokenizer call stays last so its return value is shown as the cell output.
emotion_model_dir = "/content/model2_emotion_enhanced"
model.save_pretrained(emotion_model_dir)
tokenizer.save_pretrained(emotion_model_dir)


('/content/model2_emotion_enhanced/tokenizer_config.json',
 '/content/model2_emotion_enhanced/special_tokens_map.json',
 '/content/model2_emotion_enhanced/vocab.txt',
 '/content/model2_emotion_enhanced/added_tokens.json')

In [None]:
from zipfile import ZipFile
import shutil
from google.colab import files

# Zip Model 2: pack the saved directory into an archive next to it;
# make_archive appends the ".zip" extension to the base name.
emotion_model_dir = "/content/model2_emotion_enhanced"
shutil.make_archive(emotion_model_dir, 'zip', emotion_model_dir)

# Download: hand the archive to the browser
files.download("/content/model2_emotion_enhanced.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>