In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout, LSTM
from sklearn.preprocessing import LabelEncoder


In [2]:
# Load dataset
df = pd.read_csv(r"data\data_trustpilot.csv") 
df.head()

Unnamed: 0,rating,location,username,number_reviews,verification,repeat_reviewer,repeat_reviewer_encoded,company,text,text_processed,...,date_posted,local_date_posted,month_local,local_hour,time_of_day,day_of_week_posted,day_type,days_between_experience_and_post,review_time,review_time_encoded
0,5,CA,Rob Crane,2,Redirected,repeat,1,Flashbay,The company rep I worked with made my transact...,company rep worked made transaction smooth qui...,...,2024-10-23 04:17:44,2024-10-22,10,21,Evening,1,Business Day,129,late_review,0
1,5,US,Pat Anderson,1,Verified,one-time,0,Flashbay,I highly recommend using Flashbay. Immediately...,highly recommend using flashbay immediately or...,...,2024-10-16 19:34:05,2024-10-16,10,12,Business Hours,2,Business Day,0,quick_review,1
2,5,CZ,Margarita Orlova,1,Verified,one-time,0,Flashbay,I had the pleasure of working with Shelby Gibs...,pleasure working shelby gibson large order nee...,...,2024-10-17 10:27:44,2024-10-17,10,10,Business Hours,3,Business Day,7,late_review,0
3,5,US,Paola Rivas,1,Verified,one-time,0,Flashbay,I had a fantastic experience with Brian Truong...,fantastic experience brian truong attentive tr...,...,2024-10-21 22:38:50,2024-10-21,10,15,Business Hours,0,Business Day,0,quick_review,1
4,5,CA,Fiona Mckelvey Keenan,3,Not Verified,repeat,1,Flashbay,My number-one go-to for computer accessories. ...,numberone goto computer accessories rachel sup...,...,2024-10-23 04:09:05,2024-10-22,10,21,Evening,1,Business Day,103,late_review,0


In [None]:
# Split into training and test sets
X = df['text_processed'].values
y = df['rating'].values

# Convert labels to categorical 
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize the text 
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to have uniform length
max_length = max([len(x) for x in X_train_seq]) 
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')



In [18]:
# Preprocessing the text for Word2Vec
sentences = [text.split() for text in df['text_processed']]

# Train the Word2Vec model
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
word2vec_model.save("word2vec.model")

# Extract word vectors for each word in the vocabulary
embedding_index = {word: word2vec_model.wv[word] for word in word2vec_model.wv.index_to_key}

In [19]:
# Create embedding matrix
embedding_dim = 100  # Dimension of Word2Vec embeddings
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 for the padding token

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in embedding_index:
        embedding_matrix[i] = embedding_index[word]

In [20]:
# Build the RNN model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, 
                    weights=[embedding_matrix], trainable=False))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))  # LSTM layer
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(units=5, activation='softmax'))  # 5 classes for 5-star ratings

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Summary of the model
model.summary()

In [21]:
# Train the model
history = model.fit(X_train_pad, y_train, epochs=10, batch_size=32, validation_data=(X_test_pad, y_test))

# Save the model
model.save("rnn_Word2Vec_rating.keras")

Epoch 1/10
[1m1611/1611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m863s[0m 534ms/step - accuracy: 0.4847 - loss: 1.3636 - val_accuracy: 0.4823 - val_loss: 1.3391
Epoch 2/10
[1m1611/1611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m848s[0m 526ms/step - accuracy: 0.4865 - loss: 1.3391 - val_accuracy: 0.4824 - val_loss: 1.3395
Epoch 3/10
[1m1611/1611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m852s[0m 529ms/step - accuracy: 0.4873 - loss: 1.3331 - val_accuracy: 0.4824 - val_loss: 1.3386
Epoch 4/10
[1m1611/1611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m858s[0m 533ms/step - accuracy: 0.4879 - loss: 1.3345 - val_accuracy: 0.4824 - val_loss: 1.3381
Epoch 5/10
[1m1611/1611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m908s[0m 564ms/step - accuracy: 0.4878 - loss: 1.3369 - val_accuracy: 0.4824 - val_loss: 1.3384
Epoch 6/10
[1m1611/1611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m934s[0m 580ms/step - accuracy: 0.4908 - loss: 1.3308 - val_accuracy: 0.4824 - val_loss:

In [24]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_pad, y_test)
print("Test Accuracy: ", accuracy*100, "%")

[1m403/403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 186ms/step - accuracy: 0.4783 - loss: 1.3387
Test Accuracy:  48.23785126209259 %
