In [None]:
# Data manipulation and plotting
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# NLP and modeling
import nltk
import re
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud, STOPWORDS
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Read the capstone dataset from the notebook's current working directory.
pwd = os.getcwd()
df = pd.read_csv(os.path.join(pwd, 'Dataset_Capstone_final.csv'))
df

Unnamed: 0,Text,Mood
0,i woke up today feeling pissed off,anger
1,her mother would be cross and he would feel stu,anger
2,i was expecting to say this is a very bittersw...,anger
3,i was in i could feel him and i hated the draw...,anger
4,a study visit to a chicken factory the butchery,anger
...,...,...
13995,attended book club meeting discussed latest re...,neutral
13996,spent time organizing closet morning,neutral
13997,today s atmosphere exudes quiet serenity neith...,neutral
13998,moved day found existing state quiet contentme...,neutral


In [None]:
# Dataset dimensions as (rows, columns).
df.shape

(14000, 2)

In [None]:
# Column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14000 entries, 0 to 13999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    14000 non-null  object
 1   Mood    14000 non-null  object
dtypes: object(2)
memory usage: 218.9+ KB


In [None]:
# Number of missing values per column, shown as a one-column table named "count".
pd.DataFrame(df.isnull().sum(), columns = ["count"])

Unnamed: 0,count
Text,0
Mood,0


In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [None]:
# Show the duplicated rows themselves (empty when there are none).
df[df.duplicated()]

Unnamed: 0,Text,Mood


In [None]:
# Ordered mood labels; a label's position in the tuple is its integer class id.
mood_labels = ('anger', 'fear', 'happy', 'joy', 'love', 'sadness', 'neutral')
mood_to_number = {label: class_id for class_id, label in enumerate(mood_labels)}

# Swap the string labels in the Mood column for their integer ids.
df['Mood'] = df['Mood'].replace(mood_to_number)

# Display the re-encoded DataFrame.
df

Unnamed: 0,Text,Mood
0,i woke up today feeling pissed off,0
1,her mother would be cross and he would feel stu,0
2,i was expecting to say this is a very bittersw...,0
3,i was in i could feel him and i hated the draw...,0
4,a study visit to a chicken factory the butchery,0
...,...,...
13995,attended book club meeting discussed latest re...,6
13996,spent time organizing closet morning,6
13997,today s atmosphere exudes quiet serenity neith...,6
13998,moved day found existing state quiet contentme...,6


In [None]:
# Fetch the WordNet corpus through NLTK's downloader
# (presumably for the WordNetLemmatizer used in the lemmatization step).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
from nltk.stem import SnowballStemmer, WordNetLemmatizer

In [None]:
# NLTK's English stop-word list, stored as a set; consulted by remove_stop_words.
stop_words = set(stopwords.words("english"))

def lemmatization(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Returns the lemmatized tokens joined by single spaces.
    """
    wordnet = WordNetLemmatizer()
    return " ".join(wordnet.lemmatize(token) for token in text.split())

def remove_stop_words(text):
    """Drop every token of ``text`` that is present in ``stop_words``.

    ``text`` is converted with ``str()`` first, split on whitespace, and the
    surviving tokens are rejoined with single spaces.
    """
    kept = filter(lambda token: token not in stop_words, str(text).split())
    return " ".join(kept)

def Removing_numbers(text):
    """Return ``text`` without any character for which ``str.isdigit`` is true."""
    return ''.join(ch for ch in text if not ch.isdigit())

def lower_case(text):
    """Lower-case every whitespace-separated token of ``text``.

    The tokens are rejoined with single spaces, so runs of whitespace and
    leading/trailing whitespace are collapsed as a side effect.
    """
    return " ".join(map(str.lower, text.split()))

def Removing_punctuations(text):
    """Replace punctuation with spaces and collapse whitespace.

    Every character of the punctuation set below (ASCII punctuation plus the
    Arabic comma and question mark) becomes a space, the Arabic semicolon is
    deleted outright, and the result is returned with runs of whitespace
    reduced to single spaces and no leading/trailing whitespace.
    """
    # Raw string: the backslash before ']' is itself one of the punctuation
    # characters, not an escape. Without the r-prefix it is an invalid escape
    # sequence, which newer Python versions warn about.
    punctuation = r"""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~"""
    text = re.sub('[%s]' % re.escape(punctuation), ' ', text)
    text = text.replace('؛', "")

    # split()/join collapses every whitespace run and trims both ends in one pass.
    return " ".join(text.split())

def Removing_urls(text):
    """Strip ``http://``/``https://`` links and ``www.`` addresses from ``text``.

    A match extends to the next whitespace character; surrounding spaces are
    left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_small_sentences(df, column=None, min_words=3):
    """Blank out (set to NaN) the sentences that have fewer than ``min_words`` words.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the sentences.
    column : str, optional
        Name of the text column. When omitted, ``"text"`` is used if the frame
        has such a column, otherwise ``"Text"``.
    min_words : int, default 3
        Sentences with fewer whitespace-separated words than this are replaced
        by NaN.
    """
    if column is None:
        column = "text" if "text" in df.columns else "Text"

    # Non-string cells (e.g. values that are already NaN) are counted as long
    # enough, so they are left untouched instead of raising on ``.split()``.
    word_counts = df[column].map(
        lambda value: len(value.split()) if isinstance(value, str) else min_words
    )

    # A single .loc assignment writes into the frame itself; chained
    # ``df.col.iloc[i] = ...`` may only modify a temporary copy.
    df.loc[word_counts < min_words, column] = np.nan

def remove_emojis(text):
    """Delete every run of characters that falls in the emoji character class below."""
    # NOTE(review): the U+24C2..U+1F251 range is very wide and also spans many
    # non-emoji code points (e.g. CJK ideographs) - confirm this is intended.
    emoji_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
        "\U0001F912",             # face-with-thermometer emoji, added explicitly
    )
    emoji_class = re.compile("[" + "".join(emoji_ranges) + "]+", flags=re.UNICODE)
    return emoji_class.sub('', text)

def normalize_text(df):
    """Clean the ``Text`` column of ``df`` and return the same frame.

    Every entry is passed through ``normalized_sentence``, so the column is
    cleaned by exactly the same steps as a single sentence at prediction time.
    The column is overwritten on the frame that was passed in.
    """
    df.Text = df.Text.apply(normalized_sentence)
    return df

def normalized_sentence(sentence):
    """Run one sentence through the full cleaning pipeline and return it.

    Steps, in order: lower-casing, URL removal, stop-word removal, digit
    removal, punctuation removal, lemmatization, emoji removal.
    """
    sentence = lower_case(sentence)
    # URLs must go before punctuation is stripped: once '://' and '.' have been
    # replaced by spaces, the URL pattern can no longer match anything.
    sentence = Removing_urls(sentence)
    sentence = remove_stop_words(sentence)
    sentence = Removing_numbers(sentence)
    sentence = Removing_punctuations(sentence)
    sentence = lemmatization(sentence)
    sentence = remove_emojis(sentence)
    return sentence



In [None]:
# Clean the Text column with normalize_text (the column is overwritten on df).
df= normalize_text(df)


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows, stratified by mood; the fixed random_state makes
# the split identical on every run.
training_sentences, testing_sentences, training_labels, testing_labels = train_test_split(
    df["Text"], df["Mood"], test_size = 0.2, stratify = df["Mood"], random_state = 42)

# Fit the Tokenizer on the training sentences only, so the vocabulary
# (word_index) is not informed by the held-out sentences; words that were not
# seen during fitting are mapped to the "<OOV>" token.
tokenizer = Tokenizer(num_words = 25000, oov_token = "<OOV>")
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Translate each sentence into a sequence of word_index integers.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# Pad (at the end) to a fixed length of 100 so every input to the network has the same size.
training_padded = pad_sequences(training_sequences, maxlen = 100, padding = "post")
testing_padded = pad_sequences(testing_sequences, maxlen = 100, padding = "post")

In [None]:
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, LSTM, Bidirectional, Dropout
from tensorflow.keras import regularizers

In [None]:
class myCallback(tf.keras.callbacks.Callback):
    """Stop training once training and validation accuracy both exceed a threshold.

    Parameters
    ----------
    threshold : float, default 0.83
        Accuracy (as a fraction) that both ``accuracy`` and ``val_accuracy``
        must exceed before training is stopped.
    """

    def __init__(self, threshold=0.83):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's metrics and request a stop when both pass the threshold."""
        # Avoid a shared mutable default argument.
        logs = logs or {}
        accuracy = logs.get('accuracy')
        val_accuracy = logs.get('val_accuracy')

        # Either metric can be absent (e.g. no validation data was given);
        # comparing None with a float would raise TypeError.
        if accuracy is None or val_accuracy is None:
            return

        if accuracy > self.threshold and val_accuracy > self.threshold:
            print(f"\n validation_accuracy already more than {self.threshold:.0%}")
            self.model.stop_training = True

In [None]:
# Fix TensorFlow's global seed, then assemble the Conv1D text classifier:
# embedding -> 1-wide convolution -> global max pooling -> dropout -> 7-way softmax.
tf.random.set_seed(42)
classifier_layers = [
    tf.keras.layers.Embedding(
        input_dim=25000,
        output_dim=128,
        embeddings_initializer="uniform",
        input_length=100,
        name="embedding_5",
    ),
    tf.keras.layers.Conv1D(filters=256, kernel_size=1, activation="relu"),
    tf.keras.layers.GlobalMaxPool1D(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(7, activation="softmax"),
]
model = tf.keras.Sequential(classifier_layers)

# Integer class labels -> sparse categorical cross-entropy; Adam with default settings.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)



In [None]:
# Train for up to 5 epochs, evaluating on the held-out split after each one;
# the callback may end training earlier.
callback = myCallback()
model_history = model.fit(
    x=training_padded,
    y=training_labels,
    validation_data=(testing_padded, testing_labels),
    epochs=5,
    callbacks=[callback],
)

Epoch 1/5
Epoch 2/5
 validation_accuracy already more than 83%


In [None]:
# Clean the sentence with the same pipeline that was applied to the training
# text before tokenizing it. Tokenizing the raw sentence leaves words that were
# removed or rewritten during training (e.g. stop words) to be mapped to <OOV>.
sentence = [normalized_sentence("i know i should feel flattered but it really makes me uncomfortable because i dont know how to react")]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen = 100, padding = "post")

In [None]:
# Inspect the padded integer sequence that is passed to the model.
padded

array([[   1,   10,    1,    1,    2, 7456,    1,    1,   11,    1,    1,
         410,    1,    1,   45,   10,    1,    1, 6200,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0]], dtype=int32)

In [None]:
# Softmax output for the example sentence: one value per class of the 7-unit output layer
# (presumably ordered by the integer ids in mood_to_number - verify).
model.predict(padded)



array([[8.6112423e-03, 9.1371292e-01, 1.1479429e-02, 2.5566272e-02,
        1.8873122e-02, 2.0877764e-02, 8.7937090e-04]], dtype=float32)

In [None]:
# Save the trained model to "model_final.h5".
model.save("model_final.h5")

  saving_api.save_model(


In [None]:
# Export the model in SavedModel format. Use a forward slash: '\content' holds
# a backslash (an invalid escape sequence), which on a POSIX system names a
# directory literally called "\content" inside the working directory.
export_dir = '/content'
tf.saved_model.save(model, export_dir)

In [None]:
# Conversion goal; any value other than 'Storage' or 'Speed' selects the default optimization.
mode = "Speed"

optimization = {
    'Storage': tf.lite.Optimize.OPTIMIZE_FOR_SIZE,
    'Speed': tf.lite.Optimize.OPTIMIZE_FOR_LATENCY,
}.get(mode, tf.lite.Optimize.DEFAULT)


In [None]:
# Reload the model from "model_final.h5"; this copy is the one handed to the TFLite converter.
model_final = tf.keras.models.load_model('model_final.h5')

In [None]:
# Convert the reloaded Keras model object to a TensorFlow Lite model.
converter = tf.lite.TFLiteConverter.from_keras_model(model_final)
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS,  # TensorFlow Lite built-in ops
    tf.lite.OpsSet.SELECT_TF_OPS,    # selected TensorFlow ops
]
converter.optimizations = [optimization]
tflite_model = converter.convert()


