In [2]:
# Untuk manipulasi data
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Untuk nlp
import nltk
import re
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud, STOPWORDS
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asnaw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
pwd = os.getcwd()
df = pd.read_csv(pwd + '\data\Dataset_Capstone2.csv')
df

Unnamed: 0,Text,Mood
0,i just feel so frustrated not knowing what to ...,anger
1,i am thankful for the safety of my loved ones ...,anger
2,i was feeling dangerous and i also put on heel...,anger
3,i feel the more frustrated im becoming,anger
4,i feel they respect me more when they aren t r...,anger
...,...,...
8995,i was working that day and this left me feelin...,sadness
8996,i am happy to report that after four days of f...,sadness
8997,i feel useless helpless i just want to underst...,sadness
8998,i feel rejected and im not qualified or talented,sadness


In [4]:
df.shape

(9000, 2)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    9000 non-null   object
 1   Mood    9000 non-null   object
dtypes: object(2)
memory usage: 140.8+ KB


In [6]:
pd.DataFrame(df.isnull().sum(), columns = ["count"])

Unnamed: 0,count
Text,0
Mood,0


In [7]:
df.duplicated().sum()

0

In [8]:
df[df.duplicated()]

Unnamed: 0,Text,Mood


In [9]:
# Dictionary untuk memetakan label mood ke angka
mood_to_number = {
    'anger': 0,
    'fear': 1,
    'happy': 2,
    'joy': 3,
    'love': 4,
    'sadness': 5,
}

# Mengganti label mood menjadi angka
df['Mood'] = df['Mood'].replace(mood_to_number)

# Menampilkan DataFrame yang telah dimodifikasi
df

  df['Mood'] = df['Mood'].replace(mood_to_number)


Unnamed: 0,Text,Mood
0,i just feel so frustrated not knowing what to ...,0
1,i am thankful for the safety of my loved ones ...,0
2,i was feeling dangerous and i also put on heel...,0
3,i feel the more frustrated im becoming,0
4,i feel they respect me more when they aren t r...,0
...,...,...
8995,i was working that day and this left me feelin...,5
8996,i am happy to report that after four days of f...,5
8997,i feel useless helpless i just want to underst...,5
8998,i feel rejected and im not qualified or talented,5


In [10]:
from nltk.stem import SnowballStemmer, WordNetLemmatizer

In [11]:
stop_words = set(stopwords.words("english"))

def lemmatization(text):
    lemmatizer= WordNetLemmatizer()

    text = text.split()

    text=[lemmatizer.lemmatize(y) for y in text]
    
    return " " .join(text)

def remove_stop_words(text):

    Text=[i for i in str(text).split() if i not in stop_words]
    return " ".join(Text)

def Removing_numbers(text):
    text=''.join([i for i in text if not i.isdigit()])
    return text

def lower_case(text):
    
    text = text.split()

    text=[y.lower() for y in text]
    
    return " " .join(text)

def Removing_punctuations(text):
    ## Remove punctuations
    text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~"""), ' ', text)
    text = text.replace('؛',"", )
    
    ## remove extra whitespace
    text = re.sub('\s+', ' ', text)
    text =  " ".join(text.split())
    return text.strip()

def Removing_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def remove_small_sentences(df):
    for i in range(len(df)):
        if len(df.text.iloc[i].split()) < 3:
            df.text.iloc[i] = np.nan

def remove_emojis(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F" # emoticons
                           u"\U0001F300-\U0001F5FF" # symbols & pictographs
                           u"\U0001F680-\U0001F6FF" # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF" # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           u"\U0001F912"             # tambahkan emoji 🤒
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
            
def normalize_text(df):
    df.Text=df.Text.apply(lambda text : lower_case(text))
    df.Text=df.Text.apply(lambda text : remove_stop_words(text))
    df.Text=df.Text.apply(lambda text : Removing_numbers(text))
    df.Text=df.Text.apply(lambda text : Removing_punctuations(text))
    df.Text=df.Text.apply(lambda text : Removing_urls(text))
    df.Text=df.Text.apply(lambda text : lemmatization(text))
    df.Text=df.Text.apply(lambda text : remove_emojis(text))
    return df

def normalized_sentence(sentence):
    sentence= lower_case(sentence)
    sentence= remove_stop_words(sentence)
    sentence= Removing_numbers(sentence)
    sentence= Removing_punctuations(sentence)
    sentence= Removing_urls(sentence)
    sentence= lemmatization(sentence)
    sentence= remove_emojis(sentence)
    return sentence



In [12]:
df= normalize_text(df)


In [22]:
# Melakukan stemming, penghapusan tanda baca, dan penghapusan stopwords
ps = PorterStemmer()
corpus = []
for i in range(len(df["Text"])):
  # Penghapusan tanda baca
  review = re.sub("[^a-zA-Z]", " ", df["Text"][i])

  # Penghapusan emoji dan emoticon
  emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F" # emoticons
                           u"\U0001F300-\U0001F5FF" # symbols & pictographs
                           u"\U0001F680-\U0001F6FF" # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF" # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags = re.UNICODE)
  review = emoji_pattern.sub(r'', review)

  # Membuat kalimat menjadi huruf kecil dan memisahnya menjadi kata
  review = review.lower()
  review = review.split()

  # Penghapusan stop words dan stemming
  review = [ps.stem(word) for word in review if word not in stopwords.words("english")]
  review = " ".join(review)
  corpus.append(review)

df["Text"] = corpus

In [13]:
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold

In [14]:
# Membagi data menjadi train dan validation
from sklearn.model_selection import train_test_split
training_sentences, testing_sentences, training_labels, testing_labels = train_test_split(df["Text"], df["Mood"], test_size = 0.2, stratify = df["Mood"])

# Melakukan fitting pada Tokenizer untuk mendapatkan word_index
tokenizer = Tokenizer(num_words = 25000, oov_token = "<OOV>")
tokenizer.fit_on_texts(pd.concat([training_sentences, testing_sentences], axis=0))
word_index = tokenizer.word_index

# Melakukan sequencing untuk menerjemahkan kedalam word_index
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# Melakukan padding untuk menyeragamkan ukuran input ke neural network
training_padded = pad_sequences(training_sequences, maxlen = 100, padding = "post")
testing_padded = pad_sequences(testing_sequences, maxlen = 100, padding = "post")

In [15]:
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, LSTM, Bidirectional, Dropout
from tensorflow.keras import regularizers

In [16]:
# Membuat model neural network
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim = 25000, output_dim = 300, input_length = 100),
    Bidirectional(tf.keras.layers.LSTM(50, dropout=0.65,recurrent_dropout=0.65,kernel_regularizer=regularizers.l2(0.002), return_sequences=True)),
    Bidirectional(tf.keras.layers.LSTM(20, dropout=0.65,recurrent_dropout=0.65, kernel_regularizer=regularizers.l2(0.002))),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(6, activation = "softmax")])





In [17]:
from tensorflow.keras.optimizers.schedules import ExponentialDecay

initial_learning_rate = 0.001
lr_schedule = ExponentialDecay(
    initial_learning_rate,
    decay_steps=100000,
    decay_rate=0.96,
    staircase=True
)

model.compile(loss="sparse_categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule), metrics=["accuracy"])


In [43]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Melakukan trainning
num_epochs = 100
history = model.fit(training_padded, training_labels, epochs = num_epochs, batch_size = 50, validation_data = (testing_padded, testing_labels), callbacks = [early_stopping], verbose = 2)

Epoch 1/100
144/144 - 30s - 210ms/step - accuracy: 0.1664 - loss: 2.3817 - val_accuracy: 0.2194 - val_loss: 2.0423
Epoch 2/100
144/144 - 19s - 132ms/step - accuracy: 0.1865 - loss: 1.9699 - val_accuracy: 0.3728 - val_loss: 1.9107
Epoch 3/100
144/144 - 19s - 133ms/step - accuracy: 0.2397 - loss: 1.8747 - val_accuracy: 0.3661 - val_loss: 1.8315
Epoch 4/100
144/144 - 20s - 137ms/step - accuracy: 0.3936 - loss: 1.6677 - val_accuracy: 0.5217 - val_loss: 1.4503
Epoch 5/100
144/144 - 20s - 136ms/step - accuracy: 0.5753 - loss: 1.2480 - val_accuracy: 0.6522 - val_loss: 1.0634
Epoch 6/100
144/144 - 19s - 132ms/step - accuracy: 0.6842 - loss: 0.9762 - val_accuracy: 0.7167 - val_loss: 0.9051
Epoch 7/100
144/144 - 19s - 132ms/step - accuracy: 0.7572 - loss: 0.8021 - val_accuracy: 0.7378 - val_loss: 0.8176
Epoch 8/100
144/144 - 19s - 131ms/step - accuracy: 0.8019 - loss: 0.6787 - val_accuracy: 0.7639 - val_loss: 0.7547
Epoch 9/100
144/144 - 20s - 136ms/step - accuracy: 0.8215 - loss: 0.6025 - val_a

In [35]:
sentence = ["i know i should feel flattered but it really makes me uncomfortable because i dont know how to react"]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen = 100, padding = "post")

In [36]:
padded

array([[   1,    8,    1,    1,    2, 3662,    1,    1,    6,    1,    1,
         213,    1,    1,   28,    8,    1,    1, 1888,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0]])

In [37]:
model.predict(padded)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step


array([[1.3663160e-02, 9.8160523e-01, 2.3319713e-04, 1.0486227e-04,
        2.1774194e-03, 2.2161312e-03]], dtype=float32)