In [1]:
# Untuk manipulasi data
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Untuk nlp
import nltk
import re
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud, STOPWORDS
nltk.download('stopwords')

  from .autonotebook import tqdm as notebook_tqdm





[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asnaw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
pwd = os.getcwd()
df = pd.read_csv(pwd + '\data\Dataset_Capstone1.csv')
df

Unnamed: 0,Text,Mood
0,i was angry when my boyfriend did not turn up ...,anger
1,ive been feeling kind of bitchy lately,anger
2,i am relieved but now burden with the thing ca...,anger
3,i love not feeling rushed,anger
4,i need to run longer distances without feeling...,anger
...,...,...
39995,next weeks dlc is fail cant wait for maiden i...,worry
39996,just finished curling her hair and now has to ...,worry
39997,organic chemistry ah so confusing,worry
39998,and all woman who transfer their first impress...,worry


In [3]:
df.shape

(40000, 2)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    40000 non-null  object
 1   Mood    40000 non-null  object
dtypes: object(2)
memory usage: 625.1+ KB


In [5]:
pd.DataFrame(df.isnull().sum(), columns = ["count"])

Unnamed: 0,count
Text,0
Mood,0


In [6]:
df.duplicated().sum()

0

In [7]:
df[df.duplicated()]

Unnamed: 0,Text,Mood


In [8]:
# Dictionary untuk memetakan label mood ke angka
mood_to_number = {
    'anger': 0,
    'fear': 1,
    'happiness': 2,
    'joy': 3,
    'love': 4,
    'neutral': 5,
    'sadness': 6,
    'worry': 7
}

# Mengganti label mood menjadi angka
df['Mood'] = df['Mood'].replace(mood_to_number)

# Menampilkan DataFrame yang telah dimodifikasi
df

  df['Mood'] = df['Mood'].replace(mood_to_number)


Unnamed: 0,Text,Mood
0,i was angry when my boyfriend did not turn up ...,0
1,ive been feeling kind of bitchy lately,0
2,i am relieved but now burden with the thing ca...,0
3,i love not feeling rushed,0
4,i need to run longer distances without feeling...,0
...,...,...
39995,next weeks dlc is fail cant wait for maiden i...,7
39996,just finished curling her hair and now has to ...,7
39997,organic chemistry ah so confusing,7
39998,and all woman who transfer their first impress...,7


In [9]:
# Melakukan stemming, penghapusan tanda baca, dan penghapusan stopwords
ps = PorterStemmer()
corpus = []
for i in range(len(df["Text"])):
  # Penghapusan tanda baca
  review = re.sub("[^a-zA-Z]", " ", df["Text"][i])

  # Penghapusan emoji dan emoticon
  emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F" # emoticons
                           u"\U0001F300-\U0001F5FF" # symbols & pictographs
                           u"\U0001F680-\U0001F6FF" # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF" # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags = re.UNICODE)
  review = emoji_pattern.sub(r'', review)

  # Membuat kalimat menjadi huruf kecil dan memisahnya menjadi kata
  review = review.lower()
  review = review.split()

  # Penghapusan stop words dan stemming
  review = [ps.stem(word) for word in review if word not in stopwords.words("english")]
  review = " ".join(review)
  corpus.append(review)

df["Text"] = corpus

In [10]:
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold

In [33]:
# Membagi data menjadi train dan validation
training_sentences, testing_sentences, training_labels, testing_labels = train_test_split(df["Text"], df["Mood"], test_size = 0.2, stratify = df["Mood"])

# Melakukan fitting pada Tokenizer untuk mendapatkan word_index
tokenizer = Tokenizer(num_words = 25000, oov_token = "<OOV>")
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Melakukan sequencing untuk menerjemahkan kedalam word_index
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# Melakukan padding untuk menyeragamkan ukuran input ke neural network
training_padded = pad_sequences(training_sequences, maxlen = 120, padding = "post")
testing_padded = pad_sequences(testing_sequences, maxlen = 120, padding = "post")

In [34]:
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, LSTM, Bidirectional, Dropout
from tensorflow.keras import regularizers

In [48]:
# Membuat model neural network
model = tf.keras.Sequential([
    Embedding(input_dim = 25000, output_dim = 400, input_length = 120),
    Bidirectional(tf.keras.layers.LSTM(40, kernel_regularizer=regularizers.l2(0.001), return_sequences=True)),
    Bidirectional(tf.keras.layers.LSTM(10, kernel_regularizer=regularizers.l2(0.001))),
    tf.keras.layers.Dropout(0.5),
    Dense(8, activation = "softmax")])


In [51]:
from tensorflow.keras.optimizers.schedules import ExponentialDecay

initial_learning_rate = 0.001
lr_schedule = ExponentialDecay(
    initial_learning_rate,
    decay_steps=100000,
    decay_rate=0.96,
    staircase=True
)

model.compile(loss="sparse_categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule), metrics=["accuracy"])


In [52]:
early_stopping = tf.keras.callbacks.EarlyStopping(patience = 10)

# Melakukan trainning
num_epochs = 100
history = model.fit(training_padded, training_labels, epochs = num_epochs, batch_size = 512, validation_data = (testing_padded, testing_labels), callbacks = [early_stopping], verbose = 2)

Epoch 1/100
63/63 - 73s - 1s/step - accuracy: 0.2348 - loss: 2.1809 - val_accuracy: 0.2253 - val_loss: 1.7878
Epoch 2/100
63/63 - 55s - 868ms/step - accuracy: 0.3091 - loss: 1.7595 - val_accuracy: 0.4767 - val_loss: 1.4772
Epoch 3/100
63/63 - 55s - 867ms/step - accuracy: 0.4225 - loss: 1.4825 - val_accuracy: 0.5242 - val_loss: 1.3267
Epoch 4/100
63/63 - 59s - 940ms/step - accuracy: 0.5184 - loss: 1.3242 - val_accuracy: 0.5934 - val_loss: 1.2084
Epoch 5/100
63/63 - 59s - 929ms/step - accuracy: 0.5966 - loss: 1.1553 - val_accuracy: 0.6044 - val_loss: 1.0900
Epoch 6/100
63/63 - 59s - 944ms/step - accuracy: 0.6868 - loss: 0.9495 - val_accuracy: 0.6913 - val_loss: 1.0075
Epoch 7/100
63/63 - 56s - 882ms/step - accuracy: 0.7868 - loss: 0.7847 - val_accuracy: 0.6964 - val_loss: 0.9693
Epoch 8/100
63/63 - 56s - 886ms/step - accuracy: 0.8422 - loss: 0.6643 - val_accuracy: 0.7122 - val_loss: 0.9605
Epoch 9/100
63/63 - 56s - 883ms/step - accuracy: 0.8753 - loss: 0.5670 - val_accuracy: 0.7090 - val