In [1]:
from google.colab import files
uploaded=files.upload()

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from sklearn.metrics import classification_report

# Load data
Emo_Data_10c = pd.read_csv('/content/Emo_Data_10c.csv')

print(Emo_Data_10c.columns)
print('')
print(Emo_Data_10c.shape)
print(Emo_Data_10c.head())
print('')
Emo_Data_10c.shape

Index(['Text', 'Emotion'], dtype='object')

(89668, 2)
                                                Text  Emotion
0                            i didnt feel humiliated  sadness
1  i can go from feeling so hopeless to so damned...  sadness
2   im grabbing a minute to post i feel greedy wrong    anger
3  i am ever feeling nostalgic about the fireplac...     love
4                               i am feeling grouchy    anger



(89668, 2)

In [3]:
# Replacing intent text values to numerical
Emo_Data_10c=Emo_Data_10c.replace({"sadness":0,"anger":1,"love":2, "surprise":3, "fear":4,
                         "happiness":5, "neutral":6, "worry":7, "admiration":8, "annoyance":9,"approval":10})
Emo_Data_10c['Emotion'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [4]:
# Split dataset into train, validation, and test sets
x_train, x_test, y_train, y_test = train_test_split(Emo_Data_10c["Text"], Emo_Data_10c["Emotion"],
                                                    test_size=0.1, shuffle=True, random_state=1)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, shuffle=True, random_state=1)
x_train.shape, x_val.shape, x_test.shape, y_train.shape, y_val.shape, y_test.shape


((72630,), (8071,), (8967,), (72630,), (8071,), (8967,))

In [5]:

# Tokenize and pad
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length=170
x_train_pad = tokenizer.batch_encode_plus(x_train.tolist(), return_tensors='np', padding='max_length')
x_val_pad = tokenizer.batch_encode_plus(x_val.tolist(), return_tensors='np', padding='max_length')
x_test_pad = tokenizer.batch_encode_plus(x_test.tolist(), return_tensors='np', padding='max_length')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
# Convert BatchEncoding to NumPy arrays
x_train_array = {key: x_train_pad[key] for key in x_train_pad}
x_val_array = {key: x_val_pad[key] for key in x_val_pad}

In [7]:
# One-hot-encode classes
n_classes = len(np.unique(y_train))
y_train_np = y_train.values
y_val_np = y_val.values
y_test_np = y_test.values

y_train_enc = tf.keras.utils.to_categorical(y_train_np, n_classes)
y_val_enc = tf.keras.utils.to_categorical(y_val_np, n_classes)
y_test_enc = tf.keras.utils.to_categorical(y_test_np, n_classes)

In [8]:
#datasets check
print(x_val_pad['input_ids'].shape)
print(y_val_enc.shape)
empty_sequences = np.sum(x_val_pad['input_ids']==0, axis=1)==512
print(np.any(empty_sequences))
print(np.any(np.isnan(x_val_pad['input_ids'])))
print(np.any(np.isnan(y_val_enc)))

print(x_train_pad['input_ids'].shape)
print(y_train_enc.shape)
empty_sequences = np.sum(x_train_pad['input_ids']==0, axis=1)==512
print(np.any(empty_sequences))
print(np.any(np.isnan(x_train_pad['input_ids'])))
print(np.any(np.isnan(y_train_enc)))

(8071, 512)
(8071, 11)
False
False
False
(72630, 512)
(72630, 11)
False
False
False


In [9]:
# Create BERT model
from transformers import TFBertForSequenceClassification
model_BERT = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=11)
model_BERT.summary()

#Freeze all layers except the classification layer
for layer in model_BERT.layers[:-1]:
    layer.trainable = False

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  8459      
                                                                 
Total params: 109490699 (417.67 MB)
Trainable params: 109490699 (417.67 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
from tensorflow.keras.callbacks import EarlyStopping

#compile model
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.CategoricalCrossentropy()
metric = tf.keras.metrics.CategoricalAccuracy()
#keras_callbacks = [EarlyStopping(monitor='val_loss', patience=5, mode='min', min_delta=0.0001)]
model_BERT.compile(optimizer=optimizer, loss=loss, metrics=metric)

In [None]:
# train model with callbacks.

# history = model_BERT.fit(x_train_array, y_train_enc, batch_size=16, epochs=5,
#                          validation_data=(x_val_array, y_val_enc),
#                          callbacks=[keras_callbacks])


history = model_BERT.fit(x_train_array, y_train_enc, batch_size=64, epochs=5,
                         validation_data=(x_val_array, y_val_enc))



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
 222/1135 [====>.........................] - ETA: 12:55 - loss: 8.0377 - categorical_accuracy: 0.1096