### Import Libraries

In [24]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as pyplot
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

### Load data

In [25]:
# Read the dataset
df = pd.read_csv('emotions.csv')
df.head()

Unnamed: 0,text,label
0,i feel so pissed off over an old friend and so...,anger
1,ive found it has made a huge difference especi...,anger
2,i also feel it is unfortunate that nearly all ...,sadness
3,i feel petty a href http clairee,anger
4,i used to believe that a feeling like fear was...,sadness


In [17]:
df.info

<bound method DataFrame.info of                                                     text    label
0      i feel so pissed off over an old friend and so...    anger
1      ive found it has made a huge difference especi...    anger
2      i also feel it is unfortunate that nearly all ...  sadness
3                       i feel petty a href http clairee    anger
4      i used to believe that a feeling like fear was...  sadness
...                                                  ...      ...
19995  i was i might be buying stuff from there but i...      joy
19996  i like sonam deepika and genelia who i feel ar...      joy
19997  i feel pathetic that i can hardly go a whole d...  sadness
19998  id have spent more time with her on reading i ...  sadness
19999  i do however feel like one of those pathetic g...  sadness

[20000 rows x 2 columns]>

In [18]:
labels_name= df['label'].unique()
labels_name

array(['anger', 'sadness', 'joy', 'love', 'fear', 'surprise'],
      dtype=object)

### Preprocessing

In [20]:
# Tokenization
tokenizer = Tokenizer()

# fit the tokenizer on text data to build vocabulary
tokenizer.fit_on_texts(df['text'])

# Convert text to sequnces of integers
X = tokenizer.texts_to_sequences(df['text'])
print(X)

[[1, 2, 14, 513, 161, 115, 76, 274, 277, 3, 68, 178], [73, 322, 12, 99, 131, 6, 1014, 1126, 275, 29, 5, 1770, 24, 10, 2621, 3, 5, 10, 714, 119, 14, 84, 8429, 3, 152, 495], [1, 116, 2, 12, 21, 762, 8, 1228, 35, 5, 964, 9, 87, 4, 782, 5, 367, 57, 27, 2839, 2622, 2442, 50], [1, 2, 665, 6, 203, 187, 8430], [1, 323, 4, 300, 8, 6, 7, 13, 763, 18, 4, 27, 614, 34, 3504, 111, 162, 37, 29, 22, 11, 6, 242], [1, 492, 11, 5, 213, 3505, 1, 125, 1228, 220, 363, 463, 22, 51, 890, 6, 3506, 3, 7, 28, 48, 10, 2048, 237, 27, 6, 54, 545, 59, 5, 292], [1, 64, 4, 31, 4770, 50, 11, 5, 1475, 3, 4771, 9, 5, 8431, 1, 69, 2, 45, 678, 26, 147, 447, 1699, 3, 5, 183, 44, 111, 58], [1, 91, 876, 1060, 22, 5972, 7, 28, 522, 20, 12, 18, 76, 8432, 795, 9, 3507], [1, 4772, 1, 91, 2, 50, 749, 22, 367, 71, 18, 14, 2049, 3, 56, 6, 371, 1229, 37, 3, 37], [1, 5, 3508, 230, 4, 8433, 5, 850, 4, 2, 585, 4, 8434, 16, 3509, 3, 4, 5973, 8435], [15, 25, 7, 750, 32, 35, 773, 1, 38, 600, 41, 8436, 11, 5, 225], [1, 1639, 5, 891, 936, 1,

In [23]:
# Pad sequnce 
X = pad_sequences(X, padding='post', maxlen=100)
print(X)

[[  1   2  14 ...   0   0   0]
 [ 73 322  12 ...   0   0   0]
 [  1 116   2 ...   0   0   0]
 ...
 [  1   2 579 ...   0   0   0]
 [247  19 404 ...   0   0   0]
 [  1  38 598 ...   0   0   0]]


In [29]:
# Label encoding
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['label'])
print(y)

#One-Hot Encode the labels
y = to_categorical(y)
print(y)

[0 0 4 ... 4 4 4]
[[1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 ...
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]]


In [33]:
#train,test-split
x_train,x_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

print(f'Shape of the training data: {x_train.shape},{y_train.shape}')
print(f'Shape of the test data: {x_test.shape},{y_test.shape}')
print(f'Shape of the test data: {x_val.shape},{y_val.shape}')


Shape of the training data: (12800, 100),(12800, 6)
Shape of the test data: (4000, 100),(4000, 6)
Shape of the test data: (3200, 100),(3200, 6)
