In [90]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import tensorflow as tf
# Create one Sastrawi stemmer up front; lemma() further down calls
# stemmer.stem on every token, so the same instance is reused for all rows.
factory = StemmerFactory()
stemmer = factory.create_stemmer()
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np


In [91]:
# Load the review data set from the CSV file.
df = pd.read_csv('pungli.csv')
# Calling Series.dropna(inplace=True) on a selected column does not remove any
# rows from the DataFrame, so incomplete rows must be dropped on the frame
# itself. Rows missing either the review text or the label are discarded here.
df = df.dropna(subset=['review', 'sentiment'])
df.head()

Unnamed: 0,review,sentiment
0,View nya bagus cuma saya ada beberapa lapak ga...,1.0
1,"Cukup 1 kali aja ke sini, kapok ngga lagi ke s...",2.0
2,"sebenarnya tempatnya bagus, apalgi buat refres...",2.0
3,"gasuka banget, banyak pungli nya.\nmasuk ke gn...",2.0
4,"Udaranya sejuk, pemandanganya nyejukin mata bg...",1.0


In [92]:
# Remove every character that is not an ASCII letter, digit or whitespace,
# then collapse newlines and repeated whitespace into single spaces.
def secondClean(text):
    """Strip punctuation/symbols from ``text`` and normalise its whitespace.

    Args:
        text: The raw value to clean. Non-string values are converted with
            ``str``; ``None`` and float NaN are treated as missing.

    Returns:
        A string containing only ``a-z``, ``A-Z``, ``0-9`` and single spaces
        between words. Missing input yields an empty string instead of the
        literal words "None" / "nan". Leading or trailing whitespace is
        reduced to at most one space but not stripped.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', str(text))
    # A single str.replace('  ', ' ') pass leaves runs of three or more spaces
    # only partially collapsed; a regex handles runs of any length and any
    # whitespace character (newline, tab, ...).
    return re.sub(r'\s+', ' ', cleaned)
def lowercase(text):
    """Return ``text`` converted to lower case using ``str.lower``."""
    return text.lower()
# Stop-word sets already read from disk, keyed by file path. Re-running this
# cell resets the cache, which forces the file to be read again.
_STOP_WORDS_CACHE = {}


def _load_stop_words(path='combined2.txt'):
    """Read the stop-word file once and return its lines as a frozenset."""
    if path not in _STOP_WORDS_CACHE:
        with open(path, 'r') as f:
            _STOP_WORDS_CACHE[path] = frozenset(f.read().splitlines())
    return _STOP_WORDS_CACHE[path]


def stopwording(text):
    """Tokenize ``text`` and drop every token listed in ``combined2.txt``.

    The stop-word file (one word per line) is read on the first call only and
    kept as a set, so applying this function row by row no longer reopens the
    file for each row and membership tests no longer scan a list.

    Args:
        text: The string to tokenize with ``word_tokenize``.

    Returns:
        A list of the tokens that are not stop words, in their original order.
    """
    stop_words = _load_stop_words()
    return [w for w in word_tokenize(text) if w not in stop_words]
def lemma(text):
    """Stem every token in ``text`` with the shared ``stemmer``.

    Args:
        text: An iterable of word tokens.

    Returns:
        A list with ``stemmer.stem(token)`` for each token, in input order.
    """
    stemmed = []
    for token in text:
        stemmed.append(stemmer.stem(token))
    return stemmed
# Additional pre-processing function for handling slang words (currently disabled)
#def freqDist(text):
    #freq_dist = nltk.FreqDist(text)
    #return freq_dist

In [93]:
# Run each review through the cleaning steps in order: strip symbols,
# lower-case, tokenize and remove stop words, then stem every token.
# The resulting column holds one list of tokens per row.
df['cleaned'] = (
    df['review']
    .apply(secondClean)
    .apply(lowercase)
    .apply(stopwording)
    .apply(lemma)
)
df['cleaned'].head()

0                                        [bagus, urus]
1                               [kapok, pungli, 10000]
2                   [bagus, refresing, pungli, pungli]
3    [gasuka, pungli, 20rb, bayar, 10rb, motor, 5rb...
4                             [sejuk, nyejukin, bagus]
Name: cleaned, dtype: object

In [94]:
# Keep only rows with a usable label: drop missing sentiments, turn any
# infinite value in the frame into NaN, then drop missing sentiments again so
# rows whose label was infinite are removed as well.
df = df.dropna(subset=['sentiment'])
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna(subset=['sentiment'])


In [95]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer

# Features fed to the model are the cleaned token lists; the values to predict
# (labels) are the sentiment codes.
X = df.cleaned
y = df.sentiment

# Passed to the tokenizer as num_words and to pad_sequences as maxlen.
max_words = 100
max_len = 20

# Split the data: 70% for training and 30% for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=70)

# Fit the vocabulary on the training split only, then encode both splits with it.
tokenizer = Tokenizer(
    num_words=max_words,
    filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
    lower=True,
)
tokenizer.fit_on_texts(X_train)
X_train_sequence = tokenizer.texts_to_sequences(X_train)
X_test_sequence = tokenizer.texts_to_sequences(X_test)

# Bring every encoded review to the same length (max_len).
X_train_pad = pad_sequences(X_train_sequence, maxlen=max_len)
X_test_pad = pad_sequences(X_test_sequence, maxlen=max_len)



In [96]:
# Assemble the network one layer at a time.
model = tf.keras.models.Sequential()
# Token ids -> 128-dimensional vectors; sizes come from max_words / max_len.
model.add(tf.keras.layers.Embedding(max_words, 128, input_length=max_len))
# Two Dense layers created without an activation argument.
model.add(tf.keras.layers.Dense(128))
model.add(tf.keras.layers.Dense(256))
model.add(tf.keras.layers.SpatialDropout1D(0.2))
model.add(tf.keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(tf.keras.layers.Dropout(0.5))
# Softmax over three output units.
model.add(tf.keras.layers.Dense(3, activation='softmax'))

# Labels are passed as class indices rather than one-hot vectors, hence the
# sparse variant of categorical cross-entropy.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)


In [100]:
# Train for 50 epochs in batches of 128.
# NOTE(review): the validation data here is the same test split that is passed
# to model.evaluate below.
model.fit(
    X_train_pad,
    y_train,
    epochs=50,
    batch_size=128,
    validation_data=(X_test_pad, y_test),
)

# Model evaluation on the held-out split.
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f'Loss: {loss}, Accuracy: {accuracy}')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Loss: 0.44388797879219055, Accuracy: 0.8605442047119141


In [105]:
# Push one hand-written review through the same cleaning steps that were
# applied to the training data: clean -> lower-case -> stop words -> stem.
sample_data = 'tempatnya agak kurang, banyak pungli dimana mana'
sample_data = lemma(stopwording(lowercase(secondClean(sample_data))))

# Wrap the token list in a one-element batch, encode it with the fitted
# tokenizer and pad it to max_len.
sample_data_sequence = tokenizer.texts_to_sequences([sample_data])
sample_data_pad = pad_sequences(sample_data_sequence, maxlen=max_len)

# The predicted class is the index of the largest model output.
prediction = model.predict(sample_data_pad)
predicted_class = np.argmax(prediction)

print(sample_data_sequence)
print(f'Predicted Sentiment: {predicted_class}')

[[5, 1]]
Predicted Sentiment: 2


In [82]:
# Save the trained model to disk when this code runs as the main module.
if __name__ == '__main__':
    # DO NOT CHANGE THIS CODE
    # Self-assignment: `model` is rebound to itself, so the object saved below
    # is whatever `model` already refers to in this session.
    model = model
    # Write the model to the file "modaljalan4.h5".
    model.save("modaljalan4.h5")

  saving_api.save_model(


In [80]:
# Convert the trained Keras model to a TensorFlow Lite flatbuffer.
from tensorflow import lite

converter = lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [lite.Optimize.DEFAULT]
converter.experimental_new_converter = True
# Allow both the built-in TFLite ops and the selected TensorFlow ops.
converter.target_spec.supported_ops = [
    lite.OpsSet.TFLITE_BUILTINS,
    lite.OpsSet.SELECT_TF_OPS,
]
tfmodel = converter.convert()

# Save the converted model as raw bytes.
with open('model3.tflite', 'wb') as f:
    f.write(tfmodel)

In [106]:
import json

# Descriptive metadata written next to the exported model files.
# The Embedding entry must mirror the values used when the model was built:
# input_dim is the vocabulary size (max_words = 100) and input_length is the
# padded sequence length (max_len = 20). The previous value of 100 for
# input_length was the vocabulary size, not the sequence length.
metadata = {
    "name": "my_model",
    "version": "1.0",
    "description": "Model for sentiment analysis",
    "architecture": [
        {"name": "Embedding", "input_dim": 100, "units": 128, "input_length": 20},
        {"name": "Dense", "units": 128},
        {"name": "Dense", "units": 256},
        {"name": "SpatialDropout1D", "rate": 0.2},
        {"name": "LSTM", "units": 128, "dropout": 0.2, "recurrent_dropout": 0.2},
        {"name": "Dropout", "rate": 0.5},
        {"name": "Dense", "units": 3, "activation": "softmax"}
    ],
    "weights": "...",
    "configuration": {
        "learning_rate": 0.01,
        "optimizer": "Adam"
    },
    "data": "...",
    "creation_date": "2023-12-22",
    "creator": "Bard"
}

# Serialise the metadata dictionary to model_metadata.json.
with open('model_metadata.json', 'w') as f:
    json.dump(metadata, f)
