In [5]:
from preprocess import *
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load negative reviews and positive reviews from files
with open('data/negative-reviews.txt', 'r', encoding='latin-1') as file:
    negative_reviews = file.readlines()

with open('data/positive-reviews.txt', 'r', encoding='latin-1') as file:
    positive_reviews = file.readlines()

# Combine negative and positive reviews
all_reviews = negative_reviews + positive_reviews

# Create a DataFrame with text and sentiment columns
training_df = pd.DataFrame({'text': all_reviews, 'sentiment': [0] * len(negative_reviews) + [1] * len(positive_reviews)})

# Tokenize the text and convert it into sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(training_df['text'])

X_train_sequences = tokenizer.texts_to_sequences(training_df['text'])
X_train_padded = pad_sequences(X_train_sequences)

y_train = training_df['sentiment']

# Split the data into training and test sets
X_train_padded, X_test_padded, y_train, y_test = train_test_split(X_train_padded, y_train, test_size=0.2, random_state=42)

# Build the RNN model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=X_train_padded.shape[1]))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_padded, y_train, epochs=5, batch_size=32, validation_data=(X_test_padded, y_test))

# Evaluate the model
y_pred_proba = model.predict(X_test_padded)
y_pred = (y_pred_proba > 0.5).astype(int)

print(classification_report(y_test, y_pred))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      4014
           1       0.94      0.94      0.94      3986

    accuracy                           0.94      8000
   macro avg       0.94      0.94      0.94      8000
weighted avg       0.94      0.94      0.94      8000



In [10]:
# predict unseen data
new_data = [
    'Have no fun',
    'good',
    'bad',
    'interesting',
    'blah blah',
    "The product was amazing! I loved it.",
    "Terrible experience, would not recommend.",
    "Great value for the price.",
    "Disappointed with the quality.",
    "This exceeded my expectations.",
    "Worst purchase ever!",
    "Highly recommended, will buy again.",
    "Not worth the money.",
    "The service was excellent.",
    "Very poor customer service."
]

# Tokenize and pad sequences for new data
new_data_sequences = tokenizer.texts_to_sequences(new_data)
new_data_padded = pad_sequences(new_data_sequences, maxlen=X_train_padded.shape[1])

# Make predictions using the RNN model
predictions = model.predict(new_data_padded)

# Display sentiment for each input
for i in range(len(new_data)):
    sentiment = "Positive" if predictions[i][0] >= 0.5 else "Negative"
    print(f'{new_data[i]}: {sentiment} Sentiment')


Have no fun: Negative Sentiment
good: Positive Sentiment
bad: Negative Sentiment
interesting: Positive Sentiment
blah blah: Negative Sentiment
The product was amazing! I loved it.: Positive Sentiment
Terrible experience, would not recommend.: Negative Sentiment
Great value for the price.: Positive Sentiment
Disappointed with the quality.: Negative Sentiment
This exceeded my expectations.: Negative Sentiment
Worst purchase ever!: Negative Sentiment
Highly recommended, will buy again.: Positive Sentiment
Not worth the money.: Negative Sentiment
The service was excellent.: Positive Sentiment
Very poor customer service.: Negative Sentiment
