Importing the necessary libraries

In [None]:
import tensorflow as tf
import os
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
import numpy as np


Defining the functions for loading and preprocessing the data

In [None]:
def load_data(file, encoding="utf-8"):
    """Read labelled text records from ``file``.

    Every non-blank line is expected to have the form
    ``[<label tokens>] <text>``: the first character of the line is dropped,
    everything up to the first ``]`` is the label, and the remainder is the
    text.

    Args:
        file: Path of the text file to read.
        encoding: Text encoding used to decode the file. Stated explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        A list of ``[label, text]`` pairs in file order. ``label`` has its
        whitespace-separated tokens re-joined with single spaces; ``text`` is
        stripped of surrounding whitespace.

    Raises:
        ValueError: If a non-blank line contains no ``]`` delimiter.
    """
    data = []
    with open(file, 'r', encoding=encoding) as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # A blank line would otherwise yield an empty ["", ""] record.
                continue
            close = line.find("]")
            if close == -1:
                # Without this check find() returns -1 and the slices below
                # would silently produce a garbage label/text pair.
                raise ValueError(
                    "Line {} of {!r} has no ']' label delimiter".format(line_no, file)
                )
            label = ' '.join(line[1:close].split())
            text = line[close + 1:].strip()
            data.append([label, text])
    return data
# Corpus path, resolved relative to the notebook's working directory.
file = 'text.txt'
data = load_data(file)
instance_total = len(data)
print("Number of instances: {}".format(instance_total))
def tokenizer(token, n):
    """Return every contiguous ``n``-gram of ``token`` as a space-joined string.

    Args:
        token: Sequence of string tokens.
        n: Number of consecutive tokens per n-gram.

    Returns:
        A list of n-gram strings in left-to-right order; empty when ``token``
        holds fewer than ``n`` items.
    """
    window_starts = range(len(token) - n + 1)
    return [' '.join(token[start:start + n]) for start in window_starts]
def create_feature(text, nrange=(1, 1)):
    """Build a bag-of-n-grams feature counter for ``text``.

    The text is lower-cased and turned into two token streams:

    * word tokens: runs of ``a-z``, ``0-9`` and ``#``; every other character
      acts as a separator. Word n-grams are produced for each ``n`` in
      ``nrange`` (both ends inclusive).
    * punctuation tokens: what is left after blanking out ``a-z`` and ``0-9``,
      split on whitespace and counted as unigrams.

    Args:
        text: Raw input string.
        nrange: ``(min_n, max_n)`` range of word n-gram sizes.

    Returns:
        A ``collections.Counter`` mapping each feature string to its count.
    """
    def _ngrams(tokens, n):
        # Contiguous n-grams of `tokens`, each joined with single spaces.
        return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

    # N-grams are built by the local helper so this function does not depend
    # on the module-level name `tokenizer`, which other cells may rebind.
    text = text.lower()
    # The digit range must be spelled '0-9'. A class written as ')-9' is a
    # range that also matches ) * + , - . / and leaves punctuation glued to
    # the word tokens.
    words = re.sub('[^a-z0-9#]', ' ', text).split()
    text_features = []
    for n in range(nrange[0], nrange[1] + 1):
        text_features += _ngrams(words, n)
    punctuation = re.sub('[a-z0-9]', ' ', text).split()
    text_features += _ngrams(punctuation, 1)
    return Counter(text_features)
def convert_label(item, name):
    """Translate a whitespace-separated numeric label vector into names.

    Args:
        item: String of numbers separated by whitespace, e.g. ``"0. 1. 0."``.
        name: Sequence of names; ``name[i]`` belongs to the ``i``-th number.

    Returns:
        The names whose corresponding number equals 1, joined by single
        spaces, or an empty string when no number equals 1.
    """
    flags = [float(value) for value in item.split()]
    selected = [name[pos] for pos, flag in enumerate(flags) if flag == 1]
    return " ".join(selected).strip()
# Emotion names; position i of a record's label vector maps to emotions[i].
emotions = [
    "joy",
    "fear",
    "anger",
    "sadness",
    "disgust",
    "shame",
    "guilt",
]

Loading the data and creating the feature vector

In [None]:
# One emotion-name label and one n-gram Counter (word 1- to 4-grams plus
# punctuation unigrams) per record, both in the same order as `data`.
y_all = [convert_label(raw_label, emotions) for raw_label, _ in data]
X_all = [create_feature(sentence, nrange=(1, 4)) for _, sentence in data]

# Class distribution. Every emotion starts at zero so absent classes still
# appear in the printout; a label outside `emotions` raises KeyError.
count = dict.fromkeys(emotions, 0)
for emotion_label in y_all:
    count[emotion_label] += 1
print(count)

Splitting the data into training, validation, and test sets

In [None]:
# First cut: 70% training, 30% held out.
train_texts, holdout_texts, train_labels, holdout_labels = train_test_split(
    X_all, y_all, test_size=0.3, random_state=90
)
# Second cut: the held-out part is divided 70/30 into test and validation.
test_texts, val_texts, test_labels, val_labels = train_test_split(
    holdout_texts, holdout_labels, test_size=0.3, random_state=90
)

len_Tx, len_Ty = len(train_texts), len(train_labels)
len_tx, len_ty = len(test_texts), len(test_labels)
len_vx, len_vy = len(val_texts), len(val_labels)

split_sizes = {
    "X_train": len_Tx,
    "y_train": len_Ty,
    "X_test": len_tx,
    "y_test": len_ty,
    "X_val": len_vx,
    "y_val": len_vy,
}
print(split_sizes)


In [None]:
# Hyperparameters.
batch_size = 32
learning_rate = 1e-5
num_epochs = 3
max_seq_len = 128  # sentences are padded/truncated to this many word pieces

# The classification head needs one output per emotion; the library default
# is a 2-class head.
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=len(emotions)
)
# Bound to `bert_tokenizer` rather than `tokenizer` so the n-gram helper
# function `tokenizer` is not overwritten, which would break any re-run of
# create_feature.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# BERT consumes raw sentences, not the n-gram Counter features stored in
# X_all, and the sparse cross-entropy loss needs integer class ids instead of
# emotion-name strings. emotions.index raises ValueError for any label that
# is not exactly one known emotion.
raw_texts = [text for _, text in data]
label_ids = [emotions.index(label) for label in y_all]

# Same proportions and seed as the split of the Counter features: 70% train,
# then the held-out 30% divided 70/30 into test and validation.
bert_train_texts, bert_rest_texts, bert_train_ids, bert_rest_ids = train_test_split(
    raw_texts, label_ids, test_size=0.3, random_state=90
)
bert_test_texts, bert_val_texts, bert_test_ids, bert_val_ids = train_test_split(
    bert_rest_texts, bert_rest_ids, test_size=0.3, random_state=90
)


def make_bert_dataset(texts, ids):
    """Tokenize ``texts`` and pair the encodings with integer labels ``ids``.

    Returns an unbatched ``tf.data.Dataset`` of ``(features, label)`` where
    ``features`` is the dict produced by the BERT tokenizer, so the attention
    mask is passed to the model along with the input ids.
    """
    encodings = bert_tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_seq_len,
        return_tensors='tf',
    )
    return tf.data.Dataset.from_tensor_slices((dict(encodings), ids))


train_dataset = make_bert_dataset(bert_train_texts, bert_train_ids)
train_dataset = train_dataset.shuffle(buffer_size=len(bert_train_texts)).batch(batch_size)

val_dataset = make_bert_dataset(bert_val_texts, bert_val_ids).batch(batch_size)

test_dataset = make_bert_dataset(bert_test_texts, bert_test_ids).batch(batch_size)

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
# The model outputs raw logits, hence from_logits=True.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

model.fit(train_dataset, epochs=num_epochs, validation_data=val_dataset)

# evaluate() returns [loss, accuracy] in the order given to compile().
results = model.evaluate(test_dataset)
print("Test accuracy:", results[1])


This is for demonstration.