# Percobaan 3 (Word2vec NN(LSTM))

---



In [1]:
!pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [3]:
!pip install tensorflow
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [13]:
import pandas as pd
import numpy as np
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
df = pd.read_csv('/content/purbaya_purbaya_labeled.csv')

In [6]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()
factory_stop = StopWordRemoverFactory()
stop_words = factory_stop.get_stop_words()

# 2. Fungsi Preprocessing
def preprocess_teks(teks):
    if not isinstance(teks, str): # Cek agar tidak error jika data kosong
        return ""

    teks = teks.lower()
    teks = re.sub(r'[^a-zA-Z0-9\s]', '', teks)
    tokens = teks.split()

    # Filtering & Stemming
    final_tokens = []
    for token in tokens:
        # Sekarang 'stop_words' adalah list, jadi 'not in' akan bekerja
        if token not in stop_words:
            stemmed_token = stemmer.stem(token)
            final_tokens.append(stemmed_token)

    return ' '.join(final_tokens)

df['clean_teks'] = df['normalized_text'].apply(preprocess_teks)

In [9]:
le = LabelEncoder()
y = le.fit_transform(df['label_manual'])
print("Mapping Label:", dict(zip(le.classes_, le.transform(le.classes_))))

Mapping Label: {'negatif': np.int64(0), 'netral': np.int64(1), 'positif': np.int64(2)}


In [10]:
sentences = [text.split() for text in df['clean_teks']]
embedding_dim = 100
w2v_model = Word2Vec(sentences, vector_size=embedding_dim, window=5, min_count=1, workers=4)
print("Word2Vec Training Selesai.")

Word2Vec Training Selesai.


In [11]:
max_features = 5000 # Jumlah kosakata maksimal
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(df['clean_teks'].values)

X = tokenizer.texts_to_sequences(df['clean_teks'].values)
X = pad_sequences(X) # Samakan panjang kalimat (padding)

vocab_size = len(tokenizer.word_index) + 1
print(f"Vocab Size: {vocab_size}, Panjang Sequence: {X.shape[1]}")

Vocab Size: 1471, Panjang Sequence: 34


In [14]:
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]
    else:
        # Jika kata tidak ada di Word2Vec, berikan vektor random/nol
        embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), embedding_dim)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [17]:
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=X.shape[1], trainable=False))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

None


In [21]:
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test), verbose=1)

Epoch 1/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 85ms/step - accuracy: 0.7563 - loss: 0.6302 - val_accuracy: 0.7867 - val_loss: 0.6547
Epoch 2/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.7739 - loss: 0.6189 - val_accuracy: 0.7867 - val_loss: 0.6531
Epoch 3/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.7356 - loss: 0.6815 - val_accuracy: 0.7867 - val_loss: 0.6445
Epoch 4/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 112ms/step - accuracy: 0.7254 - loss: 0.7043 - val_accuracy: 0.7867 - val_loss: 0.6492
Epoch 5/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 133ms/step - accuracy: 0.7737 - loss: 0.6104 - val_accuracy: 0.7867 - val_loss: 0.6472
Epoch 6/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 149ms/step - accuracy: 0.7476 - loss: 0.6823 - val_accuracy: 0.7867 - val_loss: 0.6705
Epoch 7/100
[1m6/6[0m [32m━━━━━━━━

In [22]:
y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)

print("\n--- Laporan Klasifikasi LSTM ---")
print(classification_report(y_test, y_pred, target_names=le.classes_))

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 548ms/step

--- Laporan Klasifikasi LSTM ---
              precision    recall  f1-score   support

     negatif       0.00      0.00      0.00         6
      netral       0.83      0.98      0.90        59
     positif       0.80      0.40      0.53        10

    accuracy                           0.83        75
   macro avg       0.54      0.46      0.48        75
weighted avg       0.76      0.83      0.78        75



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [23]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f"Akurasi Model LSTM: {accuracy * 100:.2f}%")

Akurasi Model LSTM: 82.67%
