In [1]:
import pandas as pd
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt


In [2]:
TRAIN_DF = pd.read_csv("dataset/train.csv", delimiter="\t")
TEST_DF = pd.read_csv("dataset/test.csv", delimiter="\t")

In [3]:
# TRAIN_DF = df.loc[~df.index.isin(TEST_DF.index)]

In [4]:
TRAIN_DF

Unnamed: 0,name,path,emotion,text
0,03-01-03-01-01-01-05,RAVDESS_data\Actor_05\03-01-03-01-01-01-05.wav,happiness,Kids are talking by the door
1,03-01-07-01-02-02-17,RAVDESS_data\Actor_17\03-01-07-01-02-02-17.wav,disgust,Dogs are sitting by the door
2,03-01-05-02-02-02-14,RAVDESS_data\Actor_14\03-01-05-02-02-02-14.wav,anger,Dogs are sitting by the door
3,03-01-03-02-02-01-20,RAVDESS_data\Actor_20\03-01-03-02-02-01-20.wav,happiness,Dogs are sitting by the door
4,03-01-07-01-02-02-05,RAVDESS_data\Actor_05\03-01-07-01-02-02-05.wav,disgust,Dogs are sitting by the door
...,...,...,...,...
763,03-01-03-02-02-02-03,RAVDESS_data\Actor_03\03-01-03-02-02-02-03.wav,happiness,Dogs are sitting by the door
764,03-01-07-01-02-01-22,RAVDESS_data\Actor_22\03-01-07-01-02-01-22.wav,disgust,Dogs are sitting by the door
765,03-01-06-02-02-02-16,RAVDESS_data\Actor_16\03-01-06-02-02-02-16.wav,fear,Dogs are sitting by the door
766,03-01-06-01-01-02-03,RAVDESS_data\Actor_03\03-01-06-01-01-02-03.wav,fear,Kids are talking by the door


In [5]:
# TRAIN_DF['text'] = TRAIN_DF['text'] + "\nContext:" + TRAIN_DF['Response']

## Declare Functions

In [6]:
# from qdrant_client import QdrantClient
# import torch
# from sentence_transformers import SentenceTransformer

# device = "cuda" if torch.cuda.is_available() else "cpu"
# print(f"Device: {device}")

# EMBEDDINGS_MODEL = SentenceTransformer(
#     "dunzhang/stella_en_1.5B_v5",
#     trust_remote_code=True,
#     device=device,
#     #cache_folder='/media/data/hugging_face_cache'
# )

# qdrant_client = QdrantClient(
#     url="https://cf521759-86ad-49b4-b7f7-07fe3bb5f2ec.europe-west3-0.gcp.cloud.qdrant.io:6333",
#     api_key="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.-Uof-NN6Q2IUWexHgY26SBVNHKIiJP32fF2gchKkWgI",
# )


### Bert Training functions

In [7]:


class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Convert to lists to ensure indexing works correctly
        self.texts = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
        self.labels = labels.tolist() if hasattr(labels, 'tolist') else list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Verify data integrity
        assert len(self.texts) == len(
            self.labels), "Texts and labels must have the same length"

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        if idx >= len(self.texts):
            raise IndexError(
                f"Index {idx} out of bounds for dataset of size {len(self.texts)}")

        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


def train_model(model, train_loader, val_loader, device, le, epochs=3):
    optimizer = AdamW(model.parameters(), lr=2e-5)
    best_accuracy = 0

    for epoch in range(epochs):
        print(f'\nEpoch {epoch + 1}/{epochs}')

        # Training
        model.train()
        train_loss = 0
        train_steps = 0

        #for batch_idx, batch in enumerate(tqdm(train_loader, desc='Training')):
        for batch_idx, batch in enumerate(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_steps += 1
            
            # Print every 50 steps
            if (batch_idx + 1) % 50 == 0:
                avg_train_loss = train_loss / train_steps
                print(f'Step {batch_idx + 1} - Average training loss: {avg_train_loss:.4f}')

        avg_train_loss = train_loss / train_steps
        print(f'Average training loss: {avg_train_loss:.4f}')

        # Validation
        model.eval()
        val_loss = 0
        val_steps = 0
        all_predictions = []
        all_true_labels = []

        with torch.no_grad():
            for batch in tqdm(val_loader, desc='Validation'):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                loss = outputs.loss
                val_loss += loss.item()

                predictions = torch.argmax(outputs.logits, dim=1)
                all_predictions.extend(predictions.cpu().numpy())
                all_true_labels.extend(labels.cpu().numpy())
                val_steps += 1

        # Calculate metrics
        avg_val_loss = val_loss / val_steps
        accuracy = (np.array(all_predictions) ==
                    np.array(all_true_labels)).mean()

        print(f'Average validation loss: {avg_val_loss:.4f}')
        print(f'Validation accuracy: {accuracy:.4f}')
        print('\nClassification Report:')
        print(classification_report(all_true_labels, all_predictions,
                                    target_names=le.classes_))

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            torch.save(model.state_dict(), 'best_model.pt')
            print(f'New best model saved with accuracy: {accuracy:.4f}')



In [8]:
TRAIN_DF.shape # = TRAIN_DF[:18840]

(768, 4)

In [9]:
# TRAIN_DF.to_csv("Data/TRAIN_DF.csv")

In [10]:
TRAIN_DF.shape

(768, 4)

In [11]:
TEST_DF.shape[0]

192

In [12]:
TEST_DF['emotion'].value_counts()

emotion
disgust      39
happiness    39
fear         38
sadness      38
anger        38
Name: count, dtype: int64

In [13]:
# Modify the main function to use TRAIN_DF for training and TEST_DF for testing
def main():
    try:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Load the training and test data
        train_df = TRAIN_DF # pd.read_csv('Data/TRAIN_DF.csv')
        test_df = TEST_DF # pd.read_csv('Data/TEST_DF.csv')

        # Data validation
        required_columns = ['text', 'emotion']
        if not all(col in train_df.columns for col in required_columns):
            raise ValueError(f"Dataset must contain columns: {required_columns}")
     
        train_df = train_df.dropna(subset=['text', 'emotion'])
        test_df = test_df.dropna(subset=['text', 'emotion'])

        # Reset index after dropping NaN values
        train_df = train_df.reset_index(drop=True)
        test_df = test_df.reset_index(drop=True)

        # Convert labels to numerical values
        le = LabelEncoder()
        train_df['emotion'] = le.fit_transform(train_df['emotion'])
        test_df['emotion'] = le.transform(test_df['emotion'])  # Use the same encoder for test data

        # Split the dataset into train and validation sets
        print("\nSplitting training dataset...")
        
        val_size = test_df.shape[0]
        test_size = val_size / len(train_df)
        
        train_texts, val_texts, train_labels, val_labels = train_test_split(
            train_df['text'],
            train_df['emotion'],
            test_size=test_size,
            random_state=42,
            stratify=train_df['emotion']
        )

        # Initialize tokenizer and model
        print("Initializing BERT model and tokenizer...")
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        num_labels = len(le.classes_)
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=num_labels
        )

        # Create datasets
        train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
        val_dataset = CustomDataset(val_texts, val_labels, tokenizer)

        # Create dataloaders
        train_loader = DataLoader(train_dataset, batch_size=40, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=40)

        # Move model to GPU if available
        model.to(device)

        # Train the model
        train_model(model, train_loader, val_loader, device, le, 4)

        # Testing on TEST_DF (used only for testing at the end)
        print("\nEvaluating model on the test dataset...")
        test_texts = test_df['text']
        test_labels = test_df['emotion']

        test_dataset = CustomDataset(test_texts, test_labels, tokenizer)
        test_loader = DataLoader(test_dataset, batch_size=40)

        # Evaluate the model on test data
        model.eval()
        all_predictions = []
        all_true_labels = []
        test_loss = 0
        test_steps = 0

        with torch.no_grad():
            for batch in tqdm(test_loader, desc='Testing'):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                test_loss += loss.item()

                predictions = torch.argmax(outputs.logits, dim=1)
                all_predictions.extend(predictions.cpu().numpy())
                all_true_labels.extend(labels.cpu().numpy())
                test_steps += 1

        avg_test_loss = test_loss / test_steps
        print(f'Average test loss: {avg_test_loss:.4f}')
        print('\nTest Classification Report:')
        print(classification_report(all_true_labels, all_predictions, target_names=le.classes_))

        # Save the final model and tokenizer
        model.save_pretrained('ravdess_bert')
        tokenizer.save_pretrained('ravdess_bert')
        np.save('label_classes.npy', le.classes_)

    except Exception as e:
        print(f"\nError occurred: {str(e)}")
        print("\nFull error details:")
        import traceback
        traceback.print_exc()


In [14]:
# without explanations
main()


Splitting training dataset...
Initializing BERT model and tokenizer...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/4
Average training loss: 1.6363


Validation: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 15.59it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average validation loss: 1.6135
Validation accuracy: 0.1875

Classification Report:
              precision    recall  f1-score   support

       anger       0.17      0.44      0.25        39
     disgust       0.20      0.50      0.29        38
        fear       0.00      0.00      0.00        38
   happiness       0.00      0.00      0.00        38
     sadness       0.00      0.00      0.00        39

    accuracy                           0.19       192
   macro avg       0.08      0.19      0.11       192
weighted avg       0.08      0.19      0.11       192

New best model saved with accuracy: 0.1875

Epoch 2/4
Average training loss: 1.6144


Validation: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 15.75it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average validation loss: 1.6177
Validation accuracy: 0.2135

Classification Report:
              precision    recall  f1-score   support

       anger       0.23      0.56      0.33        39
     disgust       0.19      0.50      0.28        38
        fear       0.00      0.00      0.00        38
   happiness       0.00      0.00      0.00        38
     sadness       0.00      0.00      0.00        39

    accuracy                           0.21       192
   macro avg       0.09      0.21      0.12       192
weighted avg       0.09      0.21      0.12       192

New best model saved with accuracy: 0.2135

Epoch 3/4
Average training loss: 1.6211


Validation: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 15.68it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average validation loss: 1.6085
Validation accuracy: 0.2240

Classification Report:
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        39
     disgust       0.00      0.00      0.00        38
        fear       0.00      0.00      0.00        38
   happiness       0.20      0.50      0.29        38
     sadness       0.24      0.62      0.35        39

    accuracy                           0.22       192
   macro avg       0.09      0.22      0.13       192
weighted avg       0.09      0.22      0.13       192

New best model saved with accuracy: 0.2240

Epoch 4/4
Average training loss: 1.6061


Validation: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 15.64it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average validation loss: 1.6087
Validation accuracy: 0.1979

Classification Report:
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        39
     disgust       0.20      0.50      0.29        38
        fear       0.19      0.50      0.28        38
   happiness       0.00      0.00      0.00        38
     sadness       0.00      0.00      0.00        39

    accuracy                           0.20       192
   macro avg       0.08      0.20      0.11       192
weighted avg       0.08      0.20      0.11       192


Evaluating model on the test dataset...


Testing: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 15.56it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average test loss: 1.6150

Test Classification Report:
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        38
     disgust       0.17      0.49      0.26        39
        fear       0.17      0.37      0.23        38
   happiness       0.00      0.00      0.00        39
     sadness       0.00      0.00      0.00        38

    accuracy                           0.17       192
   macro avg       0.07      0.17      0.10       192
weighted avg       0.07      0.17      0.10       192



In [13]:
# with explanations
TRAIN_DF['text'] = TRAIN_DF['text'] + "\nContext:" + TRAIN_DF['Response']
main()


Splitting training dataset...
Initializing BERT model and tokenizer...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/4
Step 50 - Average training loss: 0.9888
Step 100 - Average training loss: 0.8318
Step 150 - Average training loss: 0.7487
Step 200 - Average training loss: 0.6898
Step 250 - Average training loss: 0.6364
Step 300 - Average training loss: 0.5976
Step 350 - Average training loss: 0.5676
Step 400 - Average training loss: 0.5415
Average training loss: 0.5406


Validation: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 51/51 [00:16<00:00,  3.19it/s]


Average validation loss: 0.3146
Validation accuracy: 0.8909

Classification Report:
              precision    recall  f1-score   support

  hatespeech       0.87      0.92      0.89       618
      normal       0.87      0.93      0.90       832
   offensive       0.95      0.80      0.87       566

    accuracy                           0.89      2016
   macro avg       0.90      0.88      0.89      2016
weighted avg       0.89      0.89      0.89      2016

New best model saved with accuracy: 0.8909

Epoch 2/4
Step 50 - Average training loss: 0.2984
Step 100 - Average training loss: 0.3026
Step 150 - Average training loss: 0.3100
Step 200 - Average training loss: 0.3111
Step 250 - Average training loss: 0.3087
Step 300 - Average training loss: 0.3030
Step 350 - Average training loss: 0.3016
Step 400 - Average training loss: 0.2989
Average training loss: 0.2996


Validation: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 51/51 [00:16<00:00,  3.19it/s]


Average validation loss: 0.2934
Validation accuracy: 0.8914

Classification Report:
              precision    recall  f1-score   support

  hatespeech       0.87      0.92      0.89       618
      normal       0.88      0.93      0.90       832
   offensive       0.94      0.81      0.87       566

    accuracy                           0.89      2016
   macro avg       0.90      0.89      0.89      2016
weighted avg       0.89      0.89      0.89      2016

New best model saved with accuracy: 0.8914

Epoch 3/4
Step 50 - Average training loss: 0.2280
Step 100 - Average training loss: 0.2391
Step 150 - Average training loss: 0.2349
Step 200 - Average training loss: 0.2303
Step 250 - Average training loss: 0.2325
Step 300 - Average training loss: 0.2338
Step 350 - Average training loss: 0.2332
Step 400 - Average training loss: 0.2333
Average training loss: 0.2330


Validation: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 51/51 [00:15<00:00,  3.19it/s]


Average validation loss: 0.3072
Validation accuracy: 0.8938

Classification Report:
              precision    recall  f1-score   support

  hatespeech       0.84      0.96      0.89       618
      normal       0.93      0.88      0.91       832
   offensive       0.91      0.84      0.87       566

    accuracy                           0.89      2016
   macro avg       0.89      0.89      0.89      2016
weighted avg       0.90      0.89      0.89      2016

New best model saved with accuracy: 0.8938

Epoch 4/4
Step 50 - Average training loss: 0.1627
Step 100 - Average training loss: 0.1639
Step 150 - Average training loss: 0.1639
Step 200 - Average training loss: 0.1682
Step 250 - Average training loss: 0.1666
Step 300 - Average training loss: 0.1717
Step 350 - Average training loss: 0.1715
Step 400 - Average training loss: 0.1707
Average training loss: 0.1707


Validation: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 51/51 [00:16<00:00,  3.18it/s]


Average validation loss: 0.3194
Validation accuracy: 0.9053

Classification Report:
              precision    recall  f1-score   support

  hatespeech       0.88      0.94      0.91       618
      normal       0.92      0.92      0.92       832
   offensive       0.92      0.84      0.88       566

    accuracy                           0.91      2016
   macro avg       0.91      0.90      0.90      2016
weighted avg       0.91      0.91      0.90      2016

New best model saved with accuracy: 0.9053

Evaluating model on the test dataset...


Testing: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 51/51 [00:11<00:00,  4.45it/s]


Average test loss: 1.1769

Test Classification Report:
              precision    recall  f1-score   support

  hatespeech       0.62      0.82      0.71       672
      normal       0.55      0.83      0.66       672
   offensive       0.81      0.14      0.24       672

    accuracy                           0.60      2016
   macro avg       0.66      0.60      0.53      2016
weighted avg       0.66      0.60      0.53      2016

