In [24]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Embedding
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# --- Configuration: every tunable value used by this cell --------------------
SEED = 42                    # one seed for the train/test split, weight init and dropout
TOKENIZER_NUM_WORDS = 10000  # number of most-frequent words the tokenizer keeps
vocab_size = 3000            # token ids >= vocab_size are zeroed below; also the Embedding input_dim
MAX_LEN = 300                # every review is padded/truncated to this many tokens
embed_dim = 128              # size of each learned word vector
lstm_out = 196               # number of LSTM units

# Seed Python, NumPy and TensorFlow so a "Restart & Run All" gives comparable results.
tf.keras.utils.set_random_seed(SEED)

# --- Load data ----------------------------------------------------------------
df = pd.read_csv("../data/processed/preprocessed_reviews.csv")
print("Data shape:", df.shape)

# Binary sentiment label: 1 (positive) when score >= 4, otherwise 0 (negative).
# NOTE(review): a 3-star review, if the CSV still contains any, is labelled 0 here.
# Confirm that neutral reviews were dropped upstream, or filter them out before labelling.
df['label'] = (df['score'] >= 4).astype(int)

# --- Text -> padded integer sequences ----------------------------------------
tokenizer = Tokenizer(num_words=TOKENIZER_NUM_WORDS)
tokenizer.fit_on_texts(df['content'])
sequences = tokenizer.texts_to_sequences(df['content'])

# Map every token id outside the model vocabulary to 0 so that all ids fit inside
# the Embedding table (valid ids are 0 .. vocab_size - 1). Note that 0 is also the
# value pad_sequences uses for padding, so rare words and padding share one id.
sequences = [[token if token < vocab_size else 0 for token in seq] for seq in sequences]

# Pad/truncate all sequences to the same length.
X = pad_sequences(sequences, maxlen=MAX_LEN)
y = df['label'].values

# Split into 80:20 train:test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=SEED)

# --- Model ----------------------------------------------------------------------
# Embedding -> single LSTM -> sigmoid unit giving the probability of the positive class.
model = Sequential()
# input_dim must be larger than the biggest token id fed to the layer; the filter
# above guarantees ids stay below vocab_size.
model.add(Embedding(vocab_size, embed_dim))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

# Hold out 10% of the training data for per-epoch validation.
history = model.fit(X_train, y_train,
                    epochs=5, verbose=1,
                    validation_split=0.1)


Data shape: (10381, 37)


Epoch 1/5
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 456ms/step - accuracy: 0.6582 - loss: 0.6093 - val_accuracy: 0.7870 - val_loss: 0.4485
Epoch 2/5
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 477ms/step - accuracy: 0.8587 - loss: 0.3516 - val_accuracy: 0.8676 - val_loss: 0.3321
Epoch 3/5
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 455ms/step - accuracy: 0.8998 - loss: 0.2730 - val_accuracy: 0.8724 - val_loss: 0.3237
Epoch 4/5
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 478ms/step - accuracy: 0.9124 - loss: 0.2447 - val_accuracy: 0.8676 - val_loss: 0.3368
Epoch 5/5
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 480ms/step - accuracy: 0.9230 - loss: 0.2173 - val_accuracy: 0.8640 - val_loss: 0.3601


In [None]:
import random
from sklearn.metrics import accuracy_score

# Model output for every held-out review (the network ends in a single sigmoid unit).
y_pred = model.predict(X_test)

# Hard labels: strictly above 0.5 -> 1 (positive), everything else -> 0 (negative).
binary_predictions = np.where(y_pred > 0.5, 1, 0)

# Fraction of test reviews whose thresholded prediction matches the true label.
test_accuracy = accuracy_score(y_test, binary_predictions)
print("Accuracy:", test_accuracy)


[1m36/65[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m2s[0m 71ms/step