In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'meld-text:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4886147%2F8237522%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240426%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240426T150632Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D660c3b38fad08f4f33daecbedde14358745dfa50ef062334d2d03c72d99e137b03d74d2e2d68ee0a46d88aecd9153156c17fcc999cc43e2ce53d7e0461203cd51148b1415a3039633a49158c084e9d2297be860ce46097b5bb34e9b169ad54b15d9814efc054d87dc5217f20c05707ef8e857473543b7a41e7f9fb89b4305200a7fcefac8148c29494960896cfcc0c7d1a7982e843f45afe4ae78447651ed731abdd4b2eb25849edca06a0ed4767ee2c9099f3ed3d95d2248c85939233a045a0b5c58be2f2bb31b102fec77babbaa96a66cbd951c76a67c17b35126d46ff2a950405eff86e02a2af68ee071ff6eb37daf2104e2b5d78142c149c005624a3b9f9'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
!nvidia-smi

Fri Apr 26 14:08:24 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0              27W / 250W |      0MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score

In [None]:
# Set seed for reproducibility
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
# Load data
train_data = pd.read_csv("/kaggle/input/meld-text/train_sent_emo.csv")
dev_data = pd.read_csv("/kaggle/input/meld-text/dev_sent_emo.csv")
test_data = pd.read_csv("/kaggle/input/meld-text/test_sent_emo.csv")

In [None]:
# Map emotion labels to integers
label_dict = {'neutral': 0, 'surprise': 1, 'fear': 2, 'sadness': 3, 'joy': 4, 'disgust': 5, 'anger': 6}
train_data['Emotion'] = train_data['Emotion'].map(label_dict)
dev_data['Emotion'] = dev_data['Emotion'].map(label_dict)
test_data['Emotion'] = test_data['Emotion'].map(label_dict)

In [None]:
# Load ALBERT tokenizer
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

# Tokenize data
def tokenize_data(data, tokenizer):
    encoded_data = tokenizer.batch_encode_plus(
        data,
        add_special_tokens=True,
        return_attention_mask=True,
        padding=True,
        max_length=50,
        truncation=True,
        return_tensors='pt'
    )
    return encoded_data['input_ids'], encoded_data['attention_mask']

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

In [None]:
input_ids_train, attention_mask_train = tokenize_data(train_data['Utterance'].values, tokenizer)
input_ids_dev, attention_mask_dev = tokenize_data(dev_data['Utterance'].values, tokenizer)
input_ids_test, attention_mask_test = tokenize_data(test_data['Utterance'].values, tokenizer)

In [None]:
labels_train = torch.tensor(train_data['Emotion'].values)
labels_dev = torch.tensor(dev_data['Emotion'].values)
labels_test = torch.tensor(test_data['Emotion'].values)

In [None]:
# Create datasets
dataset_train = TensorDataset(input_ids_train, attention_mask_train, labels_train)
dataset_dev = TensorDataset(input_ids_dev, attention_mask_dev, labels_dev)
dataset_test = TensorDataset(input_ids_test, attention_mask_test, labels_test)


In [None]:
# Define model
model = AlbertForSequenceClassification.from_pretrained("albert-base-v2", num_labels=len(label_dict))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768,

In [None]:
# Define training parameters
batch_size = 16
epochs = 5
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=(0.1 * (len(dataset_train) // batch_size)), num_training_steps=len(dataset_train) * epochs)




In [None]:
# Create data loaders
kwargs = {'num_workers': 1, 'pin_memory': True} if torch.cuda.is_available() else {}
dataloader_train = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size, **kwargs)
dataloader_dev = DataLoader(dataset_dev, sampler=SequentialSampler(dataset_dev), batch_size=batch_size, **kwargs)
dataloader_test = DataLoader(dataset_test, sampler=SequentialSampler(dataset_test), batch_size=len(test_data), **kwargs)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import torch

# Training loop
best_val_f1 = 0.0
for epoch in range(1, epochs + 1):
    model.train()
    total_loss = 0

    for batch in dataloader_train:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

        model.zero_grad()
        outputs = model(**inputs)
        loss = outputs[0]

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    # Evaluate on validation set
    model.eval()
    predictions, true_labels = [], []

    for batch in dataloader_dev:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

        with torch.no_grad():
            outputs = model(**inputs)

        logits = outputs[1]
        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()

        predictions.append(logits)
        true_labels.append(label_ids)

    predictions = np.concatenate(predictions, axis=0)
    true_labels = np.concatenate(true_labels, axis=0)
    val_f1 = f1_score(np.argmax(predictions, axis=1), true_labels, average='weighted')
    print(f'Epoch {epoch} - Validation F1 Score: {val_f1}')

    # Save the model if validation F1 score improves
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), 'ALBERT_model.pth')

Epoch 1 - Validation F1 Score: 0.6154549207518676
Epoch 2 - Validation F1 Score: 0.616704868887906
Epoch 3 - Validation F1 Score: 0.616704868887906
Epoch 4 - Validation F1 Score: 0.5953134895503484
Epoch 5 - Validation F1 Score: 0.5953134895503484


In [None]:
# Load the best model for evaluation on test set
model.load_state_dict(torch.load('ALBERT_model.pth'))
model.eval()

# Evaluate on test set
predictions, true_labels = [], []

for batch in dataloader_test:
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs[1]
    logits = logits.detach().cpu().numpy()
    label_ids = inputs['labels'].cpu().numpy()

    predictions.append(logits)
    true_labels.append(label_ids)

predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)

In [None]:
# Calculate additional evaluation metrics
test_f1 = f1_score(np.argmax(predictions, axis=1), true_labels, average='weighted')
test_accuracy = accuracy_score(np.argmax(predictions, axis=1), true_labels)
conf_matrix = confusion_matrix(true_labels, np.argmax(predictions, axis=1))
class_report = classification_report(true_labels, np.argmax(predictions, axis=1))

print(f'Test F1 Score: {test_f1}')
print(f'Test Accuracy: {test_accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')


Test F1 Score: 0.6601915808718849
Test Accuracy: 0.5467432950191571
Confusion Matrix:
[[1175    0    0    0   81    0    0]
 [ 122    0    0    0  159    0    0]
 [  29    0    0    0   21    0    0]
 [ 161    0    0    0   47    0    0]
 [ 150    0    0    0  252    0    0]
 [  43    0    0    0   25    0    0]
 [ 132    0    0    0  213    0    0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.94      0.77      1256
           1       0.00      0.00      0.00       281
           2       0.00      0.00      0.00        50
           3       0.00      0.00      0.00       208
           4       0.32      0.63      0.42       402
           5       0.00      0.00      0.00        68
           6       0.00      0.00      0.00       345

    accuracy                           0.55      2610
   macro avg       0.14      0.22      0.17      2610
weighted avg       0.36      0.55      0.43      2610



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import random

# Define emotion labels
emotion_labels = ['neutral', 'surprise', 'fear', 'sadness', 'joy', 'disgust', 'anger']

# Display model predictions for randomly selected samples
num_samples = 5  # Number of samples to display
sample_indices = random.sample(range(len(predictions)), num_samples)

print("Randomly Selected Samples - Model Predictions vs. True Emotions:")
print("------------------------------------------------------------------")
for idx in sample_indices:
    input_text = test_data['Utterance'].iloc[idx]
    predicted_emotion = emotion_labels[np.argmax(predictions[idx])]
    true_emotion = emotion_labels[true_labels[idx]]
    confidence_score = predictions[idx][np.argmax(predictions[idx])]

    print(f"Sentence: {input_text}")
    print(f"Predicted Emotion: {predicted_emotion} (Confidence: {confidence_score:.2f})")
    print(f"True Emotion: {true_emotion}")
    print("------------------------------------------")


Randomly Selected Samples - Model Predictions vs. True Emotions:
------------------------------------------------------------------
Sentence: Whoa!
Predicted Emotion: joy (Confidence: 1.24)
True Emotion: surprise
------------------------------------------
Sentence: We’re just celebrating that Joey got his health insurance back.
Predicted Emotion: neutral (Confidence: 2.10)
True Emotion: joy
------------------------------------------
Sentence: Yeah!
Predicted Emotion: joy (Confidence: 1.24)
True Emotion: joy
------------------------------------------
Sentence: Okay, so you were trying to play bad this whole time.
Predicted Emotion: neutral (Confidence: 2.10)
True Emotion: surprise
------------------------------------------
Sentence: I broke it.
Predicted Emotion: neutral (Confidence: 2.12)
True Emotion: sadness
------------------------------------------


In [None]:
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler, SequentialSampler
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report

In [None]:
# Define model
model = AlbertForSequenceClassification.from_pretrained("albert-base-v2", num_labels=len(label_dict))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define training parameters
batch_size = 16
epochs = 5
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Compute class weights for weighted loss
class_weights = torch.tensor([1.0, 10.0, 20.0, 30.0, 1.0, 50.0, 10.0]).to(device)  # Adjust weights based on class importance
