In [1]:
import torch
from torch.utils.data import Dataset,DataLoader,TensorDataset,RandomSampler,SequentialSampler
#from transformers import DistilBertTokenizer, DistlBertForTokenClassification,get_linear_schedule_with_warmup
from transformers import DistilBertTokenizer, DistilBertModel, BertPreTrainedModel, DistilBertPreTrainedModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm, trange
from torch.optim import AdamW

In [2]:
import numpy as np
import pandas as pd
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [3]:
train_df = pd.read_csv('/content/train.txt',sep=';')
test_df = pd.read_csv('/content/test.txt',sep=';')
val_df = pd.read_csv('/content/val.txt',sep=';')

In [4]:
train_df.shape,test_df.shape,val_df.shape

((15999, 2), (1999, 2), (1999, 2))

In [5]:
train_df.columns = ['sentence', 'emotion']
test_df.columns = ['sentence', 'emotion']
val_df.columns = ['sentence', 'emotion']

In [6]:
#
train_df.emotion.value_counts()

Unnamed: 0_level_0,count
emotion,Unnamed: 1_level_1
joy,5362
sadness,4665
anger,2159
fear,1937
love,1304
surprise,572


In [7]:
train_df.head()

Unnamed: 0,sentence,emotion
0,i can go from feeling so hopeless to so damned...,sadness
1,im grabbing a minute to post i feel greedy wrong,anger
2,i am ever feeling nostalgic about the fireplac...,love
3,i am feeling grouchy,anger
4,ive been feeling a little burdened lately wasn...,sadness


In [8]:
MODEL_OUT_DIR = '/kaggle/working/models/bert_emotion'
TRAIN_FILE_PATH = '../input/emotions-dataset-for-nlp/train.txt'
VALID_FILE_PATH = '../input/emotions-dataset-for-nlp/val.txt'
TEST_FILE_PATH = '../input/emotions-dataset-for-nlp/test.txt'
## Model Configurations
MAX_LEN_TRAIN = 68
MAX_LEN_VALID = 68
MAX_LEN_TEST = 68
BATCH_SIZE = 160
LR = 1e-5
NUM_EPOCHS = 10
NUM_THREADS = 1  ## Number of threads for collecting dataset
MODEL_NAME = 'distilbert-base-uncased'
LABEL_DICT = {'joy':0, 'sadness':1, 'anger':2, 'fear':3, 'love':4, 'surprise':5}

if not os.path.isdir(MODEL_OUT_DIR):
    os.makedirs(MODEL_OUT_DIR)

In [9]:
class Emotions_Dataset(Dataset):

    def __init__(self, filename, maxlen, tokenizer, label_dict):
        #Store the contents of the file in a pandas dataframe
        self.df = pd.read_csv(filename, delimiter = ';')
        # name columns
        self.df.columns = ['sentence', 'emotion']
        #Initialize the tokenizer for the desired transformer model
        self.df['emotion'] = self.df['emotion'].map(label_dict)
        self.tokenizer = tokenizer
        #Maximum length of the tokens list to keep all the sequences of fixed size
        self.maxlen = maxlen

    def __len__(self):
        return len(self.df)
    def __getitem__(self, index):
        #Select the sentence and label at the specified index in the data frame
        sentence = self.df.loc[index, 'sentence']
        label = self.df.loc[index, 'emotion']
        #Preprocess the text to be suitable for the transformer
        tokens = self.tokenizer.tokenize(sentence)
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        if len(tokens) < self.maxlen:
            tokens = tokens + ['[PAD]' for _ in range(self.maxlen - len(tokens))]
        else:
            tokens = tokens[:self.maxlen-1] + ['[SEP]']
        #Obtain the indices of the tokens in the BERT Vocabulary
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_ids = torch.tensor(input_ids)
        #Obtain the attention mask i.e a tensor containing 1s for no padded tokens and 0s for padded ones
        attention_mask = (input_ids != 0).long()

        label = torch.tensor(label, dtype=torch.long)

        return input_ids, attention_mask, label

In [10]:
import torch.nn as nn

class DistilBertEmotionClassifier(DistilBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.distilbert = DistilBertModel(config)
        #The classification layer that takes the [CLS] representation and outputs the logit
        self.cls_layer = nn.Linear(config.hidden_size, 6)

    def forward(self, input_ids, attention_mask):
        #Feed the input to Bert model to obtain contextualized representations
        reps = self.distilbert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        #Obtain the representations of [CLS] heads
        cls_reps = reps[:, 0]
        logits = self.cls_layer(cls_reps)
        return logits

In [11]:
def train(model, criterion, optimizer, train_loader, val_loader, epochs, device):
    best_acc = 0
    for epoch in trange(epochs, desc="Epoch"):
        model.train()
        train_acc = 0
        for i, (input_ids, attention_mask, labels) in enumerate(iterable=train_loader):
            optimizer.zero_grad()

            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)

            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            train_acc += get_accuracy_from_logits(logits, labels)

        print(f"Training accuracy is {train_acc/len(train_loader)}")
        val_acc, val_loss = evaluate(model=model, criterion=criterion, dataloader=val_loader, device=device)
        print("Epoch {} complete! Validation Accuracy : {}, Validation Loss : {}".format(epoch, val_acc, val_loss))


In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
def evaluate(model, criterion, dataloader, device):
    model.eval()
    mean_acc, mean_loss, count = 0, 0, 0
#     predicted_labels = []
#     actual_labels = []
    with torch.no_grad():
        for input_ids, attention_mask, labels in (dataloader):

            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            logits = model(input_ids, attention_mask)

            mean_loss += criterion(logits.squeeze(-1), labels).item()
            mean_acc += get_accuracy_from_logits(logits, labels)
            count += 1

#             predicted_labels += output
#             actual_labels += labels

    return mean_acc/count, mean_loss/count

In [14]:
def get_accuracy_from_logits(logits, labels):
    probs = F.softmax(logits, dim=1)
    output = torch.argmax(probs, dim=1)
    acc = (output == labels).float().mean()
    return acc

In [15]:
def predict(model, dataloader, device):
    predicted_label = []
    actual_label = []
    with torch.no_grad():
        for input_ids, attention_mask, labels in (dataloader):

            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            logits = model(input_ids, attention_mask)

            probs = F.softmax(logits, dim=1)
            output = torch.argmax(probs, dim=1)

            predicted_label += output
            actual_label += labels

    return predicted_label, actual_label

In [16]:
from transformers import AutoConfig, AutoTokenizer, get_linear_schedule_with_warmup
import torch.optim as optim
import torch.nn.functional as F

## Configuration loaded from AutoConfig
config = AutoConfig.from_pretrained(MODEL_NAME)
## Tokenizer loaded from AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
## Creating the model from the desired transformer model
model = DistilBertEmotionClassifier.from_pretrained(MODEL_NAME, config=config)
## GPU or CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
## Putting model to device
model = model.to(device)
## Takes as the input the logits of the positive class and computes the binary cross-entropy
# criterion = nn.BCEWithLogitsLoss()
criterion = nn.CrossEntropyLoss()
## Optimizer
optimizer = AdamW(model.parameters(), lr=LR)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertEmotionClassifier were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['cls_layer.bias', 'cls_layer.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
## Training Dataset
train_set = Emotions_Dataset(filename='/content/train.txt', maxlen=MAX_LEN_TRAIN, tokenizer=tokenizer, label_dict=LABEL_DICT)
valid_set = Emotions_Dataset(filename='/content/val.txt', maxlen=MAX_LEN_VALID, tokenizer=tokenizer, label_dict=LABEL_DICT)
test_set = Emotions_Dataset(filename='/content/test.txt', maxlen=MAX_LEN_TEST, tokenizer=tokenizer, label_dict=LABEL_DICT)


## Data Loaders
train_loader = DataLoader(dataset=train_set, batch_size=BATCH_SIZE, num_workers=NUM_THREADS)
valid_loader = DataLoader(dataset=valid_set, batch_size=BATCH_SIZE, num_workers=NUM_THREADS)
test_loader = DataLoader(dataset=test_set, batch_size=BATCH_SIZE, num_workers=NUM_THREADS)

# print(len(train_loader))

In [18]:
train(model=model,
      criterion=criterion,
      optimizer=optimizer,
      train_loader=train_loader,
      val_loader=valid_loader,
      epochs = 5,
     device = device)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Training accuracy is 0.5331736207008362
Epoch 0 complete! Validation Accuracy : 0.7666078805923462, Validation Loss : 0.7059477705221909
Training accuracy is 0.8485571146011353
Epoch 1 complete! Validation Accuracy : 0.9085565209388733, Validation Loss : 0.2794304283765646
Training accuracy is 0.9219328165054321
Epoch 2 complete! Validation Accuracy : 0.9196021556854248, Validation Loss : 0.2126129109125871
Training accuracy is 0.9370584487915039
Epoch 3 complete! Validation Accuracy : 0.9253591299057007, Validation Loss : 0.18792220033132112
Training accuracy is 0.9464951753616333
Epoch 4 complete! Validation Accuracy : 0.9306718707084656, Validation Loss : 0.1688883235821357


In [19]:
def predict_emotion(sentence, model, tokenizer, label_dict, maxlen, device):
    model.eval()
    with torch.no_grad():
        # Preprocess the sentence
        tokens = tokenizer.tokenize(sentence)
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        if len(tokens) < maxlen:
            tokens = tokens + ['[PAD]' for _ in range(maxlen - len(tokens))]
        else:
            tokens = tokens[:maxlen-1] + ['[SEP]']

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)

        attention_mask = (input_ids != 0).long().to(device)

        # Get logits from the model
        logits = model(input_ids=input_ids, attention_mask=attention_mask)

        # Get predicted label
        probs = F.softmax(logits, dim=1)
        output = torch.argmax(probs, dim=1).item()

        # Map label index back to emotion string
        id_to_label = {v: k for k, v in label_dict.items()}
        predicted_emotion = id_to_label[output]

        return predicted_emotion

In [40]:
emoji_map = {
    "joy": "😊",
    "sadness": "😢",
    "anger": "😡",
    "fear": "😨",
    "surprise": "😲",
    "love": "❤️"
}



In [35]:
sample_sentence ="I am very lonely today"
predicted_emotion = predict_emotion(sample_sentence, model, tokenizer, LABEL_DICT, MAX_LEN_TEST, device)
print(f"The predicted emotion for the sentence '{sample_sentence}' is: {predicted_emotion}")

The predicted emotion for the sentence 'I am very lonely today' is: sadness


In [38]:
def predict_text_emoji(text):
    predicted_emotion = predict_emotion(text, model, tokenizer, LABEL_DICT, MAX_LEN_TEST, device)
    return emoji_map.get(predicted_emotion, "🙂")

In [45]:
sample_sentence = "I hate you but in a good way"
predicted_emotion = predict_emotion(sample_sentence, model, tokenizer, LABEL_DICT, MAX_LEN_TEST, device)
predicted_emoji = predict_text_emoji(sample_sentence)  # This should return the emoji
print(f"The predicted emotion for the sentence '{sample_sentence}' is: {predicted_emotion}")
print(f"The corresponding emoji is: {predicted_emoji}")

The predicted emotion for the sentence 'I hate you but in a good way' is: anger
The corresponding emoji is: 😡
