### BERT regressor 

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Load the dataset, check and remove null values
data_01 = pd.read_csv('Emo_Bank_VAD.csv')
null_rows = data_01[data_01['text'].isnull()]
print(null_rows)
data_01 = data_01.dropna(subset=['text'])
data_01.isnull().sum()

# Split the data
x_train, x_test, y_train, y_test = train_test_split(data_01["text"], data_01[["V", "A", "D"]], test_size=0.1, shuffle=True, random_state=1)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, shuffle=True, random_state=1)
print("Data shapes:", x_train.shape, x_val.shape, x_test.shape, y_train.shape, y_val.shape, y_test.shape)


                          id split     V     A     D text
8281  easy_money_13624_13628   dev  2.78  2.89  2.78  NaN
Data shapes: (8022,) (892,) (991,) (8022, 3) (892, 3) (991, 3)


In [2]:
#tokenize and pad
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 170
x_train_pad = tokenizer(x_train.tolist(), padding='max_length', truncation=True, max_length=170, return_tensors="tf")
x_val_pad = tokenizer(x_val.tolist(), padding='max_length', truncation=True, max_length=170, return_tensors="tf")
x_test_pad = tokenizer(x_test.tolist(), padding='max_length', truncation=True, max_length=170, return_tensors="tf")


2024-12-27 11:10:18.591542: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-12-27 11:10:18.591573: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-12-27 11:10:18.591581: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-12-27 11:10:18.591673: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-12-27 11:10:18.591929: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [3]:
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Load BERT model 
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Define the model
def create_model():
    # BERT input/output layers
    input_ids = Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
    attention_mask = Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')
    bert_output = bert_model(input_ids, attention_mask=attention_mask)
    pooled_output = bert_output.pooler_output  

    # Dense layer for regression
    dense = Dense(64, activation='relu')(pooled_output)  
    output = Dense(3, activation='tanh')(dense)  # Output layer for 3 values (V, A, D), using 'tanh' to stay in range [-1, 1]

    # Define the complete model
    model = Model(inputs=[input_ids, attention_mask], outputs=output)
    
    return model

# Instantiate the model
model_BERT = create_model()


# Compile the model 
model_BERT.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5), 
              loss='mean_squared_error', 
              metrics=['mae']) 



Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [5]:
# Train the model 
history = model_BERT.fit(
    x={'input_ids': x_train_pad['input_ids'], 'attention_mask': x_train_pad['attention_mask']},
    y=y_train,
    validation_data=({'input_ids': x_val_pad['input_ids'], 'attention_mask': x_val_pad['attention_mask']}, y_val),
    epochs=6,  
    batch_size=16  
)

# Evaluate on the test set
test_loss, test_mae = model_BERT.evaluate({'input_ids': x_test_pad['input_ids'], 'attention_mask': x_test_pad['attention_mask']}, y_test)
print(f"Test MSE: {test_loss}, Test MAE: {test_mae}")

# Predict on the test set
y_pred = model_BERT.predict({'input_ids': x_test_pad['input_ids'], 'attention_mask': x_test_pad['attention_mask']})

# Compute additional evaluation metrics like MSE and MAE
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}, Mean Absolute Error: {mae}")


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Test MSE: 4.150419235229492, Test MAE: 2.0174574851989746
Mean Squared Error: 4.150418706782473, Mean Absolute Error: 2.017457382898718


In [6]:
# Ensure that y_test is converted to a numpy array
y_test_np = y_test.to_numpy()

# Predict on the tokenized test set using the trained BERT model
y_pred = model_BERT.predict({'input_ids': x_test_pad['input_ids'], 'attention_mask': x_test_pad['attention_mask']})

# Convert predictions to numpy array
y_pred_np = np.array(y_pred)

# Define RMSE function
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Evaluate performance
def evaluate_performance(y_true, y_pred):
    metrics = {}
    metrics['RMSE_V'] = rmse(y_true[:, 0], y_pred[:, 0])
    metrics['RMSE_A'] = rmse(y_true[:, 1], y_pred[:, 1])
    metrics['RMSE_D'] = rmse(y_true[:, 2], y_pred[:, 2])
    
    metrics['MAE_V'] = mean_absolute_error(y_true[:, 0], y_pred[:, 0])
    metrics['MAE_A'] = mean_absolute_error(y_true[:, 1], y_pred[:, 1])
    metrics['MAE_D'] = mean_absolute_error(y_true[:, 2], y_pred[:, 2])
    
    metrics['MSE_V'] = mean_squared_error(y_true[:, 0], y_pred[:, 0])
    metrics['MSE_A'] = mean_squared_error(y_true[:, 1], y_pred[:, 1])
    metrics['MSE_D'] = mean_squared_error(y_true[:, 2], y_pred[:, 2])
    
    return metrics

# Calculate performance metrics for the test dataset
test_metrics = evaluate_performance(y_test_np, y_pred_np)

# Print evaluation metrics
print("Test Dataset Evaluation:")
print(f'RMSE for V: {round(test_metrics["RMSE_V"], 2)}')
print(f'RMSE for A: {round(test_metrics["RMSE_A"], 2)}')
print(f'RMSE for D: {round(test_metrics["RMSE_D"], 2)}\n')
print(f'MAE for V: {round(test_metrics["MAE_V"], 2)}')
print(f'MAE for A: {round(test_metrics["MAE_A"], 2)}')
print(f'MAE for D: {round(test_metrics["MAE_D"], 2)}\n')
print(f'MSE for V: {round(test_metrics["MSE_V"], 2)}')
print(f'MSE for A: {round(test_metrics["MSE_A"], 2)}')
print(f'MSE for D: {round(test_metrics["MSE_D"], 2)}\n')

# Convert predictions to DataFrame and round to two decimal places
y_pred_df = pd.DataFrame(y_pred_np, columns=["V_pred", "A_pred", "D_pred"]).round(2)

# Concatenate real and predicted values for comparison
test_comparison = pd.concat([y_test.reset_index(drop=True), y_pred_df], axis=1)

# Print comparison table
print("Test Data Comparison:\n", test_comparison.head())

# Save the comparison table to a CSV file
test_comparison.to_csv("test_comparison_BERT_regression_03.csv", index=False)


Test Dataset Evaluation:
RMSE for V: 1.99
RMSE for A: 2.05
RMSE for D: 2.07

MAE for V: 1.96
MAE for A: 2.04
MAE for D: 2.06

MSE for V: 3.96
MSE for A: 4.21
MSE for D: 4.28

Test Data Comparison:
       V     A    D  V_pred  A_pred  D_pred
0  3.00  3.00  3.0     1.0     1.0     1.0
1  2.90  3.00  3.3     1.0     1.0     1.0
2  2.44  3.22  3.0     1.0     1.0     1.0
3  3.40  3.00  3.1     1.0     1.0     1.0
4  2.70  3.10  2.9     1.0     1.0     1.0
