In [20]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout


In [21]:
df = pd.read_csv("dataset_scraping.csv")
df.head()
from textblob import TextBlob

def get_sentiment(text):
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'positive'
    elif polarity < 0:
        return 'negative'
    else:
        return 'neutral'

df['sentiment'] = df['review'].apply(get_sentiment)

In [22]:
df.rename(columns={'content': 'review'}, inplace=True)
df.dropna(inplace=True)
df['review'] = df['review'].astype(str)


In [23]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    return text

df['review'] = df['review'].apply(clean_text)


In [24]:
le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment'])

# Cek mapping
print(dict(zip(le.classes_, le.transform(le.classes_))))


{'negative': np.int64(0), 'neutral': np.int64(1), 'positive': np.int64(2)}


In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    df['review'], df['sentiment'],
    test_size=0.2,
    random_state=42
)


In [26]:
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=100, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=100, padding='post')


In [27]:
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=100),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(3, activation='softmax')
])

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()




In [28]:
history = model.fit(
    X_train_pad, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2
)


Epoch 1/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 72ms/step - accuracy: 0.8556 - loss: 0.5955 - val_accuracy: 0.8750 - val_loss: 0.4195
Epoch 2/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 88ms/step - accuracy: 0.8901 - loss: 0.4156 - val_accuracy: 0.8750 - val_loss: 0.4225
Epoch 3/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 66ms/step - accuracy: 0.8746 - loss: 0.4542 - val_accuracy: 0.8750 - val_loss: 0.4185
Epoch 4/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 70ms/step - accuracy: 0.8806 - loss: 0.4335 - val_accuracy: 0.8750 - val_loss: 0.4223
Epoch 5/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 81ms/step - accuracy: 0.8942 - loss: 0.3946 - val_accuracy: 0.8750 - val_loss: 0.4185
Epoch 6/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 68ms/step - accuracy: 0.8853 - loss: 0.4078 - val_accuracy: 0.8750 - val_loss: 0.4129
Epoch 7/10
[1m60/60[0m [32m━━━━

In [29]:
loss, accuracy = model.evaluate(X_test_pad, y_test)
print("Akurasi testing:", accuracy)


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.8970 - loss: 0.3293
Akurasi testing: 0.903333306312561


In [30]:
model.save("model_sentiment.h5")
from google.colab import files
files.download("model_sentiment.h5")




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>