In [1]:
import re
import numpy as np
import pandas as pd
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

def lineReader: Merupakan fungsi untuk membaca text pergaris pada dataset

In [2]:
def lineReader(x):
  with open(x) as file: 
    line = []
    for lines in file.readlines():
      line.append(lines)
    return line

def csv: merupakan fungsi untuk memecah 1 row pada dataset menjadi feature dan label

In [3]:
def csv(line):
  list1,list2 = [],[]
  for lines in line:
    x,y = lines.split(';')
    y = y.replace('\n','')
    list1.append(x)
    list2.append(y)
  df = pd.DataFrame(list(list1),columns=['sentence'])
  df['emotion'] = list2
  return df

In [4]:
# Load Dataset
line = lineReader('./train.txt')
df = csv(line)

In [5]:
df

Unnamed: 0,sentence,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
15995,i just had a very brief time in the beanbag an...,sadness
15996,i am now turning and i feel pathetic that i am...,sadness
15997,i feel strong and good overall,joy
15998,i feel like this was such a rude comment and i...,anger


In [6]:
df.emotion.value_counts()

joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: emotion, dtype: int64

WordNetLemmatizer berfungsi untuk melakukan lemmatizing pada dataset. Lemmatizing bertujuan untuk merubah kata-kata menjadi bentuk awal.

In [7]:
# nltk.download('wordnet')
# nltk.download('stopwords')
wn = WordNetLemmatizer()

def lem: merupakan fungsi untuk melakukan lematisasi terhadap kata-kata yang ada pada dataset. Menggunakan corpus stopword.

In [8]:
def lem(x):
  corpus = []
  i=1
  for words in x:
    words = words.split()
    y = [wn.lemmatize(word) for word in words if not word in stopwords.words('english')]
    y =  ' '.join(y)
    corpus.append(y)
  return corpus
x = lem(df['sentence'])

In [9]:
test_line = lineReader('./test.txt') 
test_df = csv(test_line)

In [10]:
x_test = lem(test_df['sentence'])
all = x + x_test

In [11]:
# Splitting Label
y_train = df.iloc[:,1].values
y_test = test_df.iloc[:,1].values

LSTM

In [12]:
from tensorflow.keras.layers import Embedding,LSTM,Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential

In [13]:
# Mengubah train dan test dataset menjadi bentuk DataFrame
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

Pada cell ini, kita melakukan tokenizer pada seluruh text. Tokenizer sendiri adalah proses dimana kata-kata dibagi menjadi bagian-bagian yang lebih kecil dan dibagi menjadi 3 bagian besar yaitu word, character dan subword.

In [14]:
tokenizer = Tokenizer(num_words=10000, split=' ')
tokenizer.fit_on_texts(all)

In [15]:
X1 = tokenizer.texts_to_sequences(all)

Kita menyamakan length atau panjang dari tiap kalimat menggunakan padding function.

In [16]:
X1 = pad_sequences(X1,maxlen=20,padding='post',truncating='post')
Y1 = pd.get_dummies(y_train).values

In [17]:
X1.shape

(18000, 20)

In [18]:
# Splitting Feature
X_train = X1[:16000]
X_test = X1[16000:]

Pada Y_Test digunakkan get_dummies untuk mengubah kategorikal variable menjadi dummies indicator value.

In [19]:
Y_train = Y1
Y_test = pd.get_dummies(y_test).values

Disini kita membuat model kita. Dalam Model ini, kita menggunakan word embedding untuk melakukan koversi kata yang berupa karakter menjadi berbentuk vektor. Tujuannya agar dengan vektor tersebut, mesin dapat mengetahui persamaan atau kemiripan dari semantic meaning pada tiap-tiap text. Lalu model kita juga menggunakan LSTM (Long Short Term Memmory) yang berguna untuk mengklasifikasikan dataset yang kita punya dengan menggunakan neuron sebanyak 256. Lalu output layer terakhir menggunakan neuron sebanyak 6 neuron yang mewakili dari banyaknya label yaitu 6 label dengan activation function softmax.

In [20]:
model = Sequential()
model.add(Embedding(input_dim=10000,output_dim = 64,input_length=20))
model.add(LSTM(256))
model.add(Dense(6,activation='softmax'))

In [21]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 64)            640000    
_________________________________________________________________
lstm (LSTM)                  (None, 256)               328704    
_________________________________________________________________
dense (Dense)                (None, 6)                 1542      
Total params: 970,246
Trainable params: 970,246
Non-trainable params: 0
_________________________________________________________________


Model dicompile dan di fit menggunakan adam optimizer. Untuk loss function kami menggunakan mean squared error (MSE). Untuk metrics menggunakan accuracy.

In [22]:
model.compile(optimizer='adam',loss='mse',metrics=['accuracy'])
model.fit(X_train,Y_train,batch_size=32,epochs=10,verbose=2,validation_split=0.2)

Epoch 1/10
400/400 - 20s - loss: 0.1101 - accuracy: 0.4457 - val_loss: 0.0728 - val_accuracy: 0.6675
Epoch 2/10
400/400 - 17s - loss: 0.0467 - accuracy: 0.8098 - val_loss: 0.0361 - val_accuracy: 0.8559
Epoch 3/10
400/400 - 17s - loss: 0.0254 - accuracy: 0.8976 - val_loss: 0.0325 - val_accuracy: 0.8772
Epoch 4/10
400/400 - 18s - loss: 0.0180 - accuracy: 0.9275 - val_loss: 0.0271 - val_accuracy: 0.8919
Epoch 5/10
400/400 - 17s - loss: 0.0157 - accuracy: 0.9383 - val_loss: 0.0255 - val_accuracy: 0.8975
Epoch 6/10
400/400 - 17s - loss: 0.0125 - accuracy: 0.9491 - val_loss: 0.0236 - val_accuracy: 0.9041
Epoch 7/10
400/400 - 17s - loss: 0.0100 - accuracy: 0.9622 - val_loss: 0.0245 - val_accuracy: 0.9091
Epoch 8/10
400/400 - 17s - loss: 0.0092 - accuracy: 0.9647 - val_loss: 0.0276 - val_accuracy: 0.8956
Epoch 9/10
400/400 - 18s - loss: 0.0085 - accuracy: 0.9662 - val_loss: 0.0237 - val_accuracy: 0.9069
Epoch 10/10
400/400 - 17s - loss: 0.0059 - accuracy: 0.9777 - val_loss: 0.0248 - val_accura

<tensorflow.python.keras.callbacks.History at 0x1aeae44ad88>

Disini kita sudah dapat melihat loss hanya sebesar 0.026 dan accuracy sudah hampir menyentuh 90%

In [23]:
loss,acc = model.evaluate(X_test,Y_test)



In [24]:
modelPredict = model.predict_classes(X_test)



In [25]:
Y_pred = np.argmax(Y_test, axis=1)

In [26]:
from sklearn.metrics import classification_report

print(classification_report(Y_pred, modelPredict))



              precision    recall  f1-score   support

           0       0.88      0.88      0.88       275
           1       0.88      0.85      0.86       224
           2       0.93      0.92      0.92       695
           3       0.74      0.83      0.78       159
           4       0.94      0.94      0.94       581
           5       0.65      0.62      0.64        66

    accuracy                           0.90      2000
   macro avg       0.84      0.84      0.84      2000
weighted avg       0.90      0.90      0.90      2000



def predict merupakan function yang berguna untuk melakukan lemmetizing, tokenizing, padding pada evaluation dataset.

In [27]:
def predict(sentence, model):
    sentence = lem(sentence)
    sentence = tokenizer.texts_to_sequences(sentence)
    sentence = pad_sequences(sentence,maxlen=20,padding='post',truncating='post')
    res = model.predict_classes(sentence)
    return res

In [28]:
text = lineReader('./val.txt') 
text = pd.DataFrame(text)

Disini kita dapat melihat label yang ada pada evaluation dataset sudah hampir sepenuhnya dapat diklasifikasi oleh model yang kami buat.

In [29]:
for idx, i in enumerate(predict(text[0], model)):
    if   i == 5:
        print(f'{text[0][idx]}: Surprise')
    elif i == 4:
        print(f'{text[0][idx]}: Sadness')
    elif i == 3:
        print(f'{text[0][idx]}: Love')
    elif i == 2:
        print(f'{text[0][idx]}: Joy')
    elif i == 1:
        print(f'{text[0][idx]}: Fear')
    elif i == 0:
        print(f'{text[0][idx]}: Anger')



im feeling quite sad and sorry for myself but ill snap out of it soon;sadness
: Sadness
i feel like i am still looking at a blank canvas blank pieces of paper;sadness
: Sadness
i feel like a faithful servant;love
: Joy
i am just feeling cranky and blue;anger
: Anger
i can have for a treat or if i am feeling festive;joy
: Joy
i start to feel more appreciative of what god has done for me;joy
: Joy
i am feeling more confident that we will be able to take care of this baby;joy
: Joy
i feel incredibly lucky just to be able to talk to her;joy
: Joy
i feel less keen about the army every day;joy
: Joy
i feel dirty and ashamed for saying that;sadness
: Sadness
i feel bitchy but not defeated yet;anger
: Anger
i was dribbling on mums coffee table looking out of the window and feeling very happy;joy
: Joy
i woke up often got up around am feeling pukey radiation and groggy;sadness
: Sadness
i was feeling sentimental;sadness
: Sadness
i walked out of there an hour and fifteen minutes later feeling l