In [1]:
# Load EDA Pkgs
import pandas as pd
import numpy as np

# Load Other Important Pkgs
import ktrain
from ktrain import text
from sklearn.model_selection import train_test_split

# Import Data

In [2]:
# Load Dataset
df = pd.read_csv("data/emotion_dataset_raw.csv")

# Define Features & Labels
X = df['Text']
y = df['Emotion']

# Split Data
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3, random_state=42)

class_names = df['Emotion'].unique()

print('size of training set: %s' % (len(X_train)))
print('size of validation set: %s' % (len(X_test)))
print(df.Emotion.value_counts())
print('Different emotions available: %s' % class_names)

df.head()

size of training set: 24354
size of validation set: 10438
joy         11045
sadness      6722
fear         5410
anger        4297
surprise     4062
neutral      2254
disgust       856
shame         146
Name: Emotion, dtype: int64
Different emotions available: ['neutral' 'joy' 'sadness' 'fear' 'surprise' 'anger' 'shame' 'disgust']


Unnamed: 0,Emotion,Text
0,neutral,Why ?
1,joy,Sage Act upgrade on my to do list for tommorow.
2,sadness,ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...
3,joy,Such an eye ! The true hazel eye-and so brill...
4,joy,@Iluvmiasantos ugh babe.. hugggzzz for u .! b...


In [3]:
encoding = {
    'joy': 0,
    'sadness': 1,
    'fear': 2,
    'anger': 3,
    'surprise': 4,
    'neutral': 5,
    'disgust': 6,
    'shame': 7
}

# Integer values for each class
y_train = [encoding[x] for x in y_train]
y_test = [encoding[x] for x in y_test]

# Data Preprocessing

In [5]:
'''
The text must be preprocessed in a specific way for use with BERT.
By setting preprocess_mode to 'bert,' the necessary BERT model and 
vocabs will be automatically downloaded. 
'''

(x_train,  y_train), (x_test, y_test), preproc = text.texts_from_array(x_train=X_train, y_train=y_train,
                                                                       x_test=X_test, y_test=y_test,
                                                                       class_names=class_names,
                                                                       preprocess_mode='bert',
                                                                       maxlen=350, 
                                                                       max_features=35000)

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


task: text classification


# Model Building 

In [6]:
# load the pretrained BERT for text classification
model = text.text_classifier('bert', train_data=(x_train, y_train), preproc=preproc)

Is Multi-Label? False
maxlen is 350




done.


In [7]:
# Wrapping the model in a learner 
learner = ktrain.get_learner(model, train_data=(x_train, y_train), 
                             val_data=(x_test, y_test),
                             batch_size=6)

In [9]:
# Train the model
learner.fit_onecycle(2e-5, 1)



begin training using onecycle policy with max lr of 2e-05...


<keras.callbacks.History at 0x7fd80e6c3040>

In [17]:
# Testing the model by plugging in potnetial user input
predictor = ktrain.get_predictor(learner.model, preproc)
predictor.get_classes()

['neutral', 'joy', 'sadness', 'fear', 'surprise', 'anger', 'shame', 'disgust']

In [38]:
message = 'It is always a great pleasure to have a conversation with you.'
prediction = predictor.predict(message)

print('predicted emotion: {}'.format(prediction))


predicted emotion: neutral
