In [1]:
# --- Notebook-wide configuration -------------------------------------------
# Every review is padded/truncated to this many tokens before entering the model.
max_review_length = 300
# Vocabulary cap: passed to the Tokenizer and used as the embedding-matrix row count.
max_words = 10000
# Width of each word vector; must match the GloVe file that is loaded (100d).
embedding_dim = 100
# Number of rows in tripadvisor_hotel_reviews.csv (the loaded frame reports 20491 rows).
no_reviews = 20491
# Train / validation / test split sizes; they partition the dataset exactly.
training_samples = 10000
validation_samples = 5000
test_samples = no_reviews - training_samples - validation_samples  # 5491

In [2]:
import pandas as pd

# Read the hotel-review dataset; the CSV lives inside a directory of the same name.
csv_path = "tripadvisor_hotel_reviews.csv/tripadvisor_hotel_reviews.csv"
data = pd.read_csv(csv_path)
print(data)

# Split the frame into the free-text reviews and their star ratings.
reviews, ratings = data["Review"], data["Rating"]

                                                  Review  Rating
0      nice hotel expensive parking got good deal sta...       4
1      ok nothing special charge diamond member hilto...       2
2      nice rooms not 4* experience hotel monaco seat...       3
3      unique, great stay, wonderful time hotel monac...       5
4      great stay great stay, went seahawk game aweso...       5
...                                                  ...     ...
20486  best kept secret 3rd time staying charm, not 5...       5
20487  great location price view hotel great quick pl...       4
20488  ok just looks nice modern outside, desk staff ...       2
20489  hotel theft ruined vacation hotel opened sept ...       1
20490  people talking, ca n't believe excellent ratin...       2

[20491 rows x 2 columns]


In [3]:
#print (ratings)
#print (reviews)


In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit the vocabulary on the whole corpus, then turn every review into a list
# of integer word ids. `num_words` caps the ids emitted by texts_to_sequences.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(reviews)
sequences = tokenizer.texts_to_sequences(reviews)

# word -> integer id mapping learned by the tokenizer.
word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens')

# Show one review next to its encoded form as a sanity check.
sample_idx = 1
print("Line 1: ", reviews[sample_idx])
print("\nCoded: ", sequences[sample_idx])

Found 52211 unique tokens
Line 1:  ok nothing special charge diamond member hilton decided chain shot 20th anniversary seattle, start booked suite paid extra website description not, suite bedroom bathroom standard hotel room, took printed reservation desk showed said things like tv couch ect desk clerk told oh mixed suites description kimpton website sorry free breakfast, got kidding, embassy suits sitting room bathroom bedroom unlike kimpton calls suite, 5 day stay offer correct false advertising, send kimpton preferred guest website email asking failure provide suite advertised website reservation description furnished hard copy reservation printout website desk manager duty did not reply solution, send email trip guest survey did not follow email mail, guess tell concerned guest.the staff ranged indifferent not helpful, asked desk good breakfast spots neighborhood hood told no hotels, gee best breakfast spots seattle 1/2 block away convenient hotel does not know exist, arrived late

In [5]:
import numpy as np

# Bring every encoded review to exactly `max_review_length` token ids.
data = pad_sequences(sequences, maxlen=max_review_length)

# One-hot encode the star ratings (1..5 -> column 0..4).
# The encoding is driven by the rating *values* themselves: looking a value up
# in the Series (`ratings[value]`) would fetch the label of an unrelated row.
rating_values = np.asarray(ratings, dtype=int)
if rating_values.size and (rating_values.min() < 1 or rating_values.max() > 5):
    # A rating outside 1..5 has no class column; fail loudly instead of
    # silently dropping it and misaligning labels with reviews.
    raise ValueError('ratings must be integers in the range 1..5')
ratings_array = np.eye(5, dtype=int)[rating_values - 1]

# Each review must have exactly one label row.
assert ratings_array.shape == (data.shape[0], 5), 'reviews and labels are misaligned'

print('Shape of data = ', data.shape)
print('Shape of ratings = ', ratings.shape)

Shape of data =  (20491, 300)
Shape of ratings =  (20491,)


In [6]:
# Draw one reproducible permutation and apply it to both the inputs and the
# labels so that every review stays paired with its own rating.
np.random.seed(9)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)

data = data[indices]
ratings_array = ratings_array[indices]

# Boundaries of the consecutive train / validation / test slices.
train_end = training_samples
val_end = train_end + validation_samples
test_end = val_end + test_samples

x_train, y_train = data[:train_end], ratings_array[:train_end]
x_val, y_val = data[train_end:val_end], ratings_array[train_end:val_end]
x_test, y_test = data[val_end:test_end], ratings_array[val_end:test_end]

#print(y_val.shape)

In [7]:
import os

# Directory that holds the pre-trained GloVe vectors.
glove_dir = 'glove'

# word -> float32 embedding vector, filled from the GloVe text file.
embeddings_index = {}

# Each line is "<word> <v1> <v2> ...". The context manager closes the file
# even if a line fails to parse part-way through.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('no of words in glove embeddings =', len(embeddings_index))

no of words in glove embeddings = 400000


In [8]:
# Row i holds the GloVe vector of the word whose tokenizer id is i.
# Rows for words without a GloVe entry stay all-zero.
embedding_matrix = np.zeros(shape=(max_words, embedding_dim))

for word, i in word_index.items():
    if i >= max_words:
        # Id falls outside the capped vocabulary; there is no row for it.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

# Preview the first six components for ids 1..10.
for word, i in word_index.items():
    if i > 10:
        break
    print(f'{i}:{word}\t--> { embedding_matrix[i, 0:6]}')

1:hotel	--> [ 0.43044001 -0.71715999  0.13989     0.59311002 -0.16727     0.56128001]
2:room	--> [-0.024843    0.47766     0.32437    -0.054239   -0.47622001  1.10430002]
3:not	--> [-0.19103999  0.17601     0.36919999 -0.50322998 -0.47560999  0.15798   ]
4:great	--> [-0.013786    0.38216001  0.53236002  0.15261    -0.29694    -0.20558   ]
5:n't	--> [ 0.15730999  0.3953      0.63586003 -1.09749997 -0.95767999 -0.013841  ]
6:good	--> [-0.030769    0.11993     0.53908998 -0.43696001 -0.73936999 -0.15345   ]
7:staff	--> [-0.61250001 -0.29506999 -0.28917    -0.36431    -0.39695001  0.097624  ]
8:stay	--> [-0.41615999 -0.26538     0.21720999 -0.26014999 -0.18043999  0.38745001]
9:did	--> [ 0.30449    -0.19628     0.20225    -0.61686999 -0.68484002 -0.11887   ]
10:just	--> [ 0.075026    0.39324999  0.90314001 -0.30451    -0.32767999  0.59630001]


In [9]:
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense

# Classifier: GloVe-initialised embedding (created frozen) -> SimpleRNN with
# 128 units -> 5-way softmax, one output per star rating.
network_g = Sequential([
    Embedding(
        max_words,
        embedding_dim,
        input_length=max_review_length,
        weights=[embedding_matrix],
        trainable=False,
    ),
    SimpleRNN(128),
    Dense(5, activation='softmax'),
])
network_g.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 100)          1000000   
                                                                 
 simple_rnn (SimpleRNN)      (None, 128)               29312     
                                                                 
 dense (Dense)               (None, 5)                 645       
                                                                 
Total params: 1,029,957
Trainable params: 29,957
Non-trainable params: 1,000,000
_________________________________________________________________


In [10]:
# NOTE(review): the embedding layer was created with trainable=False; flipping
# it to True here means the GloVe vectors are fine-tuned during training.
# Confirm this is intentional.
embedding_layer = network_g.layers[0]
embedding_layer.trainable = True

network_g.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

# Train for 5 epochs, tracking loss/accuracy on the validation split.
hist_g = network_g.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=32,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [11]:
# Evaluate on the held-out test split. No `steps` argument: Keras derives the
# number of batches from the array length, so the whole split is scored exactly
# once instead of asking for a fixed 500 batches that the data cannot supply.
test = network_g.evaluate(x_test, y_test, verbose = 1)

