# Import Libraries

In [1]:
# !pip install Sastrawi
# !pip install scikeras

In [2]:
# Import some libraries

import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
seed = 0
np.random.seed(seed)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style = 'whitegrid')

import nest_asyncio
nest_asyncio.apply()

import datetime as dt
import re
import string

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
# from wordcloud import WordCloud

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Load Data

In [3]:
reviews_data = pd.read_csv('indonlu_smsa.csv')
reviews = reviews_data[['text', 'sentiment']]
reviews

Unnamed: 0,text,sentiment
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative
...,...,...
12255,"film tncfu , tidak cocok untuk penonton yang t...",negative
12256,"indihome ini mahal loh bayar nya . hanya , pen...",negative
12257,"be de gea , cowok cupu yang takut dengan pacar...",negative
12258,valen yang sangat tidak berkualitas . konentat...,negative


In [4]:
sentiment_distribution = reviews['sentiment'].value_counts()
print("\Sentiment Distribution Count:")
print(sentiment_distribution)

\Sentiment Distribution Count:
sentiment
positive    7151
negative    3830
neutral     1279
Name: count, dtype: int64


# Data Preprocessing

In [6]:
def cleaningText(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text) # remove mentions
    text = re.sub(r'#[A-Za-z0-9]+', '', text) # remove hashtag
    text = re.sub(r'RT[\s]', '', text) # remove RT
    text = re.sub(r"http\S+", '', text) # remove link
    text = re.sub(r'[0-9]+', '', text) # remove numbers

    text = text.replace('\n', ' ') # replace new line into space
    text = text.translate(str.maketrans('', '', string.punctuation)) # remove all punctuations
    text = text.strip(' ') # remove characters space from both left and right text
    return text

def casefoldingText(text): # Converting all the characters in a text into lower case
    text = text.lower()
    return text

def tokenizingText(text): # Tokenizing or splitting a string, text into a list of tokens
    text = word_tokenize(text)
    return text

def filteringText(text): # Remove stopwords in a text
    listStopwords = set(stopwords.words('indonesian'))
    filtered = []
    for txt in text:
        if txt not in listStopwords:
            filtered.append(txt)
    text = filtered
    return text

def stemmingText(text): # Reducing a word to its word stem that affixes to suffixes and prefixes or to the roots of words
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = [stemmer.stem(word) for word in text]
    return text

def toSentence(list_words): # Convert list of words into sentence
    sentence = ' '.join(word for word in list_words)
    return sentence

In [None]:
reviews['clean_text'] = reviews['text'].apply(cleaningText)
reviews['clean_text'] = reviews['clean_text'].apply(casefoldingText)
reviews.drop(['text'], axis = 1, inplace = True)

reviews['text_preprocessed'] = reviews['clean_text'].apply(tokenizingText)
reviews['text_preprocessed'] = reviews['text_preprocessed'].apply(filteringText)
reviews['text_preprocessed'] = reviews['text_preprocessed'].apply(stemmingText)

# drop duplicates/spams reviews
reviews.drop_duplicates(subset = 'clean_text', inplace = True)

# Export to csv file
reviews.to_csv(r'indonlu_data_clean.csv', index = False, header = True,index_label=None)

reviews

Unnamed: 0,sentiment,clean_text,text_preprocessed
0,positive,warung ini dimiliki oleh pengusaha pabrik tahu...,"[warung, milik, usaha, pabrik, puluh, kenal, p..."
1,neutral,mohon ulama lurus dan k mmbri hujjah partai ap...,"[mohon, ulama, lurus, k, mmbri, hujjah, partai..."
2,positive,lokasi strategis di jalan sumatera bandung te...,"[lokasi, strategis, jalan, sumatera, bandung, ..."
3,positive,betapa bahagia nya diri ini saat unboxing pake...,"[betapa, bahagia, nya, unboxing, paket, barang..."
4,negative,duh jadi mahasiswa jangan sombong dong kasih...,"[duh, mahasiswa, sombong, kasih, kartu, kuning..."
...,...,...,...
12255,negative,film tncfu tidak cocok untuk penonton yang ti...,"[film, tncfu, cocok, tonton, suka, sadis]"
12256,negative,indihome ini mahal loh bayar nya hanya penan...,"[indihome, mahal, loh, bayar, nya, tangan, nya..."
12257,negative,be de gea cowok cupu yang takut dengan pacar ...,"[be, de, gea, cowok, cupu, takut, pacar, nya, ..."
12258,negative,valen yang sangat tidak berkualitas konentato...,"[valen, kualitas, konentator, nya, didik, jebr..."


In [7]:
reviews = pd.read_csv('indonlu_data_clean.csv')

for i, text in enumerate(reviews['text_preprocessed']):
    reviews['text_preprocessed'][i] = reviews['text_preprocessed'][i].replace("'", "")\
                                            .replace(',','').replace(']','').replace('[','')
    list_words=[]
    for word in reviews['text_preprocessed'][i].split():
        list_words.append(word)

    reviews['text_preprocessed'][i] = list_words

reviews

Unnamed: 0,sentiment,clean_text,text_preprocessed
0,positive,warung ini dimiliki oleh pengusaha pabrik tahu...,"[warung, milik, usaha, pabrik, puluh, kenal, p..."
1,neutral,mohon ulama lurus dan k mmbri hujjah partai ap...,"[mohon, ulama, lurus, k, mmbri, hujjah, partai..."
2,positive,lokasi strategis di jalan sumatera bandung te...,"[lokasi, strategis, jalan, sumatera, bandung, ..."
3,positive,betapa bahagia nya diri ini saat unboxing pake...,"[betapa, bahagia, nya, unboxing, paket, barang..."
4,negative,duh jadi mahasiswa jangan sombong dong kasih...,"[duh, mahasiswa, sombong, kasih, kartu, kuning..."
...,...,...,...
12169,negative,film tncfu tidak cocok untuk penonton yang ti...,"[film, tncfu, cocok, tonton, suka, sadis]"
12170,negative,indihome ini mahal loh bayar nya hanya penan...,"[indihome, mahal, loh, bayar, nya, tangan, nya..."
12171,negative,be de gea cowok cupu yang takut dengan pacar ...,"[be, de, gea, cowok, cupu, takut, pacar, nya, ..."
12172,negative,valen yang sangat tidak berkualitas konentato...,"[valen, kualitas, konentator, nya, didik, jebr..."


In [None]:
# reviews['text_as_string'] = reviews['text_preprocessed'].apply(lambda x: ' '.join(x))
# reviews.to_csv(r'indonlu_data_clean.csv', index = False, header = True,index_label=None)

# Class Balancing by RandomOverSampler

In [20]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler()
train_x, train_y = ros.fit_resample(np.array(reviews['text_as_string']).reshape(-1, 1), np.array(reviews['sentiment']).reshape(-1, 1));
reviews = pd.DataFrame(list(zip([x[0] for x in train_x], train_y)), columns = ['text_as_string', 'sentiment']);

In [27]:
sentiment_distribution = reviews['sentiment'].value_counts()
print("\Sentiment Distribution Count:")
print(sentiment_distribution)

\Sentiment Distribution Count:
sentiment
positive    7109
neutral     7109
negative    7109
Name: count, dtype: int64


# Preprocessing Text Data

In [8]:
def toSentence(list_words): # Convert list of words into sentence
    sentence = ' '.join(word for word in list_words)
    return sentence

In [30]:
# Make text preprocessed (tokenized) to untokenized with toSentence Function
X = reviews['text_as_string']
max_features = 5000

# Tokenize text with specific maximum number of words to keep, based on word frequency
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(X.values)
X = tokenizer.texts_to_sequences(X.values)
X = pad_sequences(X)
X.shape

(21327, 58)

In [31]:
polarity_encode = {'negative' : 0, 'neutral' : 1, 'positive' : 2}
y = reviews['sentiment'].map(polarity_encode).values

# Split the data (with composition data train 80%, data test 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(17061, 58) (17061,)
(4266, 58) (4266,)


# LSTM Model

In [32]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam, RMSprop

# Split your data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

def create_model():
    model = Sequential([
        Embedding(input_dim=max_features, output_dim=16, input_length=X_train.shape[1]),
        LSTM(units=32, dropout=0.2),
        Dense(units=3, activation='softmax')
    ])

    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(learning_rate=0.001),
        metrics=['accuracy']
    )

    return model

# Create the model
model = create_model()

# Define the callback for early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

# Train the model
history = model.fit(X_train, y_train, epochs=500, batch_size=128, validation_data=(X_val, y_val), callbacks=[early_stopping])

# Evaluate the model
val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=1)
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Validation Loss: 0.25727906823158264
Validation Accuracy: 0.9182537198066711


In [37]:
import pickle

# Save the tokenizer for later use
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Predict the sentiment of a new text
new_text = "kurang bagus kalo menurut saya"
tokenizer = None
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

new_sequence = tokenizer.texts_to_sequences([new_text])
new_padded_sequence = pad_sequences(new_sequence, maxlen=58)
prediction = model.predict(new_padded_sequence)
predicted_class = np.argmax(prediction, axis=1)

# Assuming the sentiment classes are 0: negative, 1: neutral, 2: positive
sentiment_labels = {0: 'negative', 1: 'neutral', 2: 'positive'}
predicted_sentiment = sentiment_labels[predicted_class[0]]

print(f"The sentiment of the text '{new_text}' is: {predicted_sentiment}")

The sentiment of the text 'kurang bagus kalo menurut saya' is: negative


# Save Model

In [38]:
import tensorflow as tf
import pickle

# Save the model as an H5 file
model.save('model.h5')

# Save the model as a PKL file
# Save the model architecture as a JSON string
model_json = model.to_json()
with open('model_architecture.json', 'w') as json_file:
    json_file.write(model_json)

# Save the model weights
model.save_weights('model_weights.h5')

# Save the model as a pickle file
with open('model.pkl', 'wb') as pkl_file:
    pickle.dump({'model_architecture': model_json, 'model_weights': model.get_weights()}, pkl_file)


  saving_api.save_model(


# Dense Layer

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D, Dropout
from tensorflow.keras.optimizers import Adam

max_features = 10000  # Example max_features, replace with the actual number of features

def create_model():
    model = Sequential([
        Embedding(input_dim=max_features, output_dim=16, input_length=X_train.shape[1]),
        GlobalAveragePooling1D(),
        Dense(units=64, activation='relu'),
        Dropout(0.5),
        Dense(units=32, activation='relu'),
        Dropout(0.5),
        Dense(units=3, activation='softmax')
    ])

    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(learning_rate=0.001),
        metrics=['accuracy']
    )

    return model

# Create the model
model = create_model()

# Define the callback for early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

# Train the model
history = model.fit(X_train, y_train, epochs=500, batch_size=128, validation_data=(X_val, y_val), callbacks=[early_stopping])

# Evaluate the model
val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=1)
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

# Save the model as an H5 file
model.save('sentiment_analysis_model.h5')

# Save the model architecture and weights as a PKL file
import pickle

# Save the model architecture as a JSON string
model_json = model.to_json()
with open('model_architecture.json', 'w') as json_file:
    json_file.write(model_json)

# Save the model weights
model.save_weights('model_weights.h5')

# Save the model as a pickle file
with open('model.pkl', 'wb') as pkl_file:
    pickle.dump({'model_architecture': model_json, 'model_weights': model.get_weights()}, pkl_file)


Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Validation Loss: 0.4920869767665863
Validation Accuracy: 0.8449692130088806


In [None]:
import pickle

# Save the tokenizer for later use
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Predict the sentiment of a new text
new_text = "tampilannya masih jelek"
tokenizer = None
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

new_sequence = tokenizer.texts_to_sequences([new_text])
new_padded_sequence = pad_sequences(new_sequence, maxlen=61)
prediction = model.predict(new_padded_sequence)
predicted_class = np.argmax(prediction, axis=1)

# Assuming the sentiment classes are 0: negative, 1: neutral, 2: positive
sentiment_labels = {0: 'negative', 1: 'neutral', 2: 'positive'}
predicted_sentiment = sentiment_labels[predicted_class[0]]

print(f"The sentiment of the text '{new_text}' is: {predicted_sentiment}")

The sentiment of the text 'tampilannya masih jelek' is: negative
