In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'meld-text-data:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4884825%2F8235800%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240426%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240426T114759Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D56a208b6c5bd667513cdaef2cf95f8de77642084037f56b7c351cf6f3b7b4883935d4fc21bb49695a56f4d95c123fe1802fe7da1997268d4ec8559526182a13b358a8552acff27cc2014beb8f53a61e255414f3628bbb68817dc9b2dcd3f532fdb439c6001d8bc7524522dd246bea37b313d15ce4e74ec3c77dfb2e2200cb556484c7f5e71f7bcf47f6da9d54d4268a9521cb53bfd811fc8167cb7c5f4143dcd96b456548ec416e3cca6df3e27d3680b9b23766bb4ed1b89e9af594ecc35d59ed6a39c941ab52da5609c3bfcde3ae7398ac1a7d2ddb59bde327b76c67631041ee042fc06e20ecdb882592788e13434c4376eaf4edc86831474d33c369463e549'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
from sklearn.metrics import f1_score
import numpy as np
import torch


label_dict= {'neutral': 0, 'surprise': 1, 'fear': 2, 'sadness': 3, 'joy': 4, 'disgust': 5, 'anger': 6}
def f1_score_func(preds,labels):
    preds_flat=np.argmax(preds,axis=1).flatten()
    labels_flat=labels.flatten()
    return f1_score(labels_flat,preds_flat,average="weighted")

def accuracy_per_class(preds,labels):
    label_dict_inverse={v:k for k, v in label_dict.items()}
    preds_flat=np.argmax(preds,axis=1).flatten()
    labels_flat=labels.flatten()
    totalacc=0
    tot=0
    for label in np.unique(labels_flat):
        y_preds=preds_flat[labels_flat==label]
        y_true=labels_flat[labels_flat==label]
        totalacc+=len(y_preds[y_preds == label])
        tot+=len(y_true)
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds == label])}/{len(y_true)}\n')
    print("Acc=",totalacc/tot)

In [None]:
import random
import numpy as np
import torch
import pandas as pd
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler


In [None]:
# Set random seed and device
seed_val = 994
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
device = torch.device("cuda")

In [None]:
# Define hyperparameters
batch_size = 16
epochs = 5
learning_rate = 5e-5
warmup_proportion = 0.1  # Percentage of training steps for warmup


In [None]:
# Load data
train_data = pd.read_csv("/kaggle/input/meld-text-data/train_sent_emo.csv")
dev_data = pd.read_csv("/kaggle/input/meld-text-data/dev_sent_emo (1).csv")
test_data = pd.read_csv("/kaggle/input/meld-text-data/test_sent_emo.csv")

In [None]:
# Define label dictionary
label_dict = {label: i for i, label in enumerate(train_data['Emotion'].unique())}

In [None]:
label_dict

{'neutral': 0,
 'surprise': 1,
 'fear': 2,
 'sadness': 3,
 'joy': 4,
 'disgust': 5,
 'anger': 6}

In [None]:
# Replace labels with numeric IDs
train_data['Emotion'] = train_data['Emotion'].replace(label_dict)
dev_data['Emotion'] = dev_data['Emotion'].replace(label_dict)
test_data['Emotion'] = test_data['Emotion'].replace(label_dict)


  train_data['Emotion'] = train_data['Emotion'].replace(label_dict)
  dev_data['Emotion'] = dev_data['Emotion'].replace(label_dict)
  test_data['Emotion'] = test_data['Emotion'].replace(label_dict)


In [None]:
# Get utterances
utterances_train = train_data["Utterance"].values
utterances_dev = dev_data["Utterance"].values
utterances_test = test_data["Utterance"].values


In [None]:
# Initialize DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
tokenizer

DistilBertTokenizer(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
# Tokenize and prepare datasets
encoded_inputs_train = tokenizer(list(utterances_train), padding=True, truncation=True, max_length=50, return_tensors='pt')
encoded_inputs_dev = tokenizer(list(utterances_dev), padding=True, truncation=True, max_length=50, return_tensors='pt')
encoded_inputs_test = tokenizer(list(utterances_test), padding=True, truncation=True, max_length=50, return_tensors='pt')


In [None]:
encoded_inputs_test

{'input_ids': tensor([[ 101, 2339, 2079,  ...,    0,    0,    0],
        [ 101, 2821, 1012,  ...,    0,    0,    0],
        [ 101, 1061, 1005,  ...,    0,    0,    0],
        ...,
        [ 101, 1997, 2607,  ...,    0,    0,    0],
        [ 101, 9018, 2017,  ...,    0,    0,    0],
        [ 101, 1045, 2228,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
input_ids_train = encoded_inputs_train["input_ids"]
attention_mask_train = encoded_inputs_train["attention_mask"]
labels_train = torch.tensor(train_data['Emotion'].values)

In [None]:
input_ids_dev = encoded_inputs_dev["input_ids"]
attention_mask_dev = encoded_inputs_dev["attention_mask"]
labels_dev = torch.tensor(dev_data['Emotion'].values)

In [None]:
input_ids_test = encoded_inputs_test["input_ids"]
attention_mask_test = encoded_inputs_test["attention_mask"]
labels_test = torch.tensor(test_data['Emotion'].values)

In [None]:
# Create datasets
dataset_train = TensorDataset(input_ids_train, attention_mask_train, labels_train)
dataset_dev = TensorDataset(input_ids_dev, attention_mask_dev, labels_dev)
dataset_test = TensorDataset(input_ids_test, attention_mask_test, labels_test)


In [None]:
device

device(type='cuda')

In [None]:
# Initialize DistilBERT model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_dict))
model.to(device)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
# Create data loaders
kwargs = {'num_workers': 1, 'pin_memory': True}
dataloader_train = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size, **kwargs)
dataloader_dev = DataLoader(dataset_dev, sampler=SequentialSampler(dataset_dev), batch_size=batch_size, **kwargs)
dataloader_test = DataLoader(dataset_test, sampler=SequentialSampler(dataset_test), batch_size=len(dataset_test), **kwargs)


In [None]:
# Set optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)
total_steps = len(dataloader_train) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(total_steps * warmup_proportion), num_training_steps=total_steps)




In [None]:
# Function to evaluate model
def evaluate(dataloader):
    model.eval()
    predictions, true_vals = [], []

    for batch in dataloader:
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

        with torch.no_grad():
            outputs = model(**inputs)

        logits = outputs[1].detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
    return predictions, true_vals

In [None]:
# Training loop
for epoch in range(1, epochs + 1):
    model.train()
    total_loss = 0

    progress_bar = tqdm(dataloader_train, desc=f"Epoch {epoch}", leave=False, disable=False)
    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

        outputs = model(**inputs)
        loss = outputs[0]
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({'training loss': f"{loss.item() / len(batch):.2f}"})

    # Save model after each epoch (optional)
    torch.save(model.state_dict(), f"./distilbert_finetuned_epoch{epoch}.model")

    # Evaluate on validation set
    predictions_dev, true_vals_dev = evaluate(dataloader_dev)
    val_loss = total_loss / len(dataloader_dev)
    val_f1 = f1_score(np.argmax(predictions_dev, axis=1), true_vals_dev, average='weighted')

    tqdm.write(f"Epoch {epoch}")
    tqdm.write(f"Validation loss: {val_loss:.4f}")
    tqdm.write(f"F1 Score (Weighted): {val_f1:.4f}")


                                                                              

Epoch 1
Validation loss: 2.4194
F1 Score (Weighted): 0.6114


                                                                              

Epoch 2
Validation loss: 2.4249
F1 Score (Weighted): 0.6114


                                                                              

Epoch 3
Validation loss: 2.3855
F1 Score (Weighted): 0.6114


                                                                              

Epoch 4
Validation loss: 2.3985
F1 Score (Weighted): 0.6114


                                                                              

Epoch 5
Validation loss: 2.4193
F1 Score (Weighted): 0.6114


In [None]:
# Evaluate on test set
model.eval()
predictions_test, true_vals_test = evaluate(dataloader_test)
accuracy_per_class(predictions_test, true_vals_test)

Class: neutral
Accuracy: 994/1256

Class: surprise
Accuracy: 160/281

Class: fear
Accuracy: 4/50

Class: sadness
Accuracy: 49/208

Class: joy
Accuracy: 225/402

Class: disgust
Accuracy: 17/68

Class: anger
Accuracy: 133/345

Acc= 0.6061302681992338


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Save the final trained model
torch.save(model.state_dict(), "/kaggle/working/distilbert_finetuned_epoch5.model")

# Evaluate on test set
predictions_test, true_vals_test = evaluate(dataloader_test)

# Compute accuracy
accuracy = accuracy_score(true_vals_test, np.argmax(predictions_test, axis=1))
print(f"Test Accuracy: {accuracy:.4f}")

# Compute classification report
target_names = [label for label in label_dict.keys()]
print(classification_report(true_vals_test, np.argmax(predictions_test, axis=1), target_names=target_names))

# Compute confusion matrix
conf_matrix = confusion_matrix(true_vals_test, np.argmax(predictions_test, axis=1))
print("Confusion Matrix:")
print(conf_matrix)

# Compute macro and micro F1 scores
macro_f1 = f1_score(true_vals_test, np.argmax(predictions_test, axis=1), average='macro')
micro_f1 = f1_score(true_vals_test, np.argmax(predictions_test, axis=1), average='micro')
print(f"Macro F1 Score: {macro_f1:.4f}")
print(f"Micro F1 Score: {micro_f1:.4f}")


Test Accuracy: 0.6061
              precision    recall  f1-score   support

     neutral       0.75      0.79      0.77      1256
    surprise       0.47      0.57      0.52       281
        fear       0.13      0.08      0.10        50
     sadness       0.36      0.24      0.29       208
         joy       0.53      0.56      0.54       402
     disgust       0.35      0.25      0.29        68
       anger       0.44      0.39      0.41       345

    accuracy                           0.61      2610
   macro avg       0.43      0.41      0.42      2610
weighted avg       0.59      0.61      0.60      2610

Confusion Matrix:
[[994  62  14  39  86  13  48]
 [ 44 160   1   4  30   2  40]
 [ 16   7   4   9   5   1   8]
 [ 85  17   6  49  23   3  25]
 [ 93  30   2  12 225   3  37]
 [ 22   8   0   4   3  17  14]
 [ 76  53   4  18  52   9 133]]
Macro F1 Score: 0.4169
Micro F1 Score: 0.6061


In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import numpy as np

# Load tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model_path = "/kaggle/working/distilbert_finetuned_epoch5.model"
model = DistilBertForSequenceClassification.from_pretrained(model_path)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Function to predict emotion from user sentence
def predict_emotion(user_sentence):
    # Tokenize the input sentence
    encoded_user_input = tokenizer.encode_plus(
        user_sentence,
        add_special_tokens=True,
        max_length=50,
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    )

    # Move input tensors to the appropriate device (e.g., CUDA if available)
    input_ids = encoded_user_input['input_ids'].to(device)
    attention_mask = encoded_user_input['attention_mask'].to(device)

    # Pass the input tensors through the model to obtain predictions
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Extract predicted class probabilities
    probabilities = torch.softmax(outputs.logits, dim=1).squeeze().cpu().numpy()

    # Determine the predicted class
    predicted_class_idx = np.argmax(probabilities)
    predicted_emotion = list(label_dict.keys())[list(label_dict.values()).index(predicted_class_idx)]

    # Prepare output with emotion predictions and confidence scores
    output = {}
    output['predicted_emotion'] = predicted_emotion
    output['confidence_scores'] = {label: prob for label, prob in zip(label_dict.keys(), probabilities)}

    return output

# Example user sentence
user_sentence = "I wish I was intelligent enough to do this project on my own."

# Predict emotion from the user sentence
result = predict_emotion(user_sentence)

# Print the predicted emotion and confidence scores
print("Predicted Emotion:", result['predicted_emotion'])
print("Confidence Scores:")
for label, score in result['confidence_scores'].items():
    print(f"{label}: {score:.4f}")


OSError: Incorrect path_or_model_id: '/kaggle/working/distilbert_finetuned_epoch5.model'. Please provide either the path to a local folder or the repo_id of a model on the Hub.