In [None]:
!pip install keras-tuner -q  # for Bayesian optimisation

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/129.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m122.9/129.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import requests, pickle
import re
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import keras_tuner as kt

In [None]:
from tensorflow import keras
from collections import Counter
from tensorflow.keras.metrics import RootMeanSquaredError
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from keras.models import Model, Sequential
from tensorflow.keras.layers import Activation, Dropout, Dense
from keras.layers import Flatten, GlobalMaxPooling1D, Embedding, Conv1D, LSTM, Input
from sklearn.metrics import confusion_matrix, classification_report

## Input
Data loading and classical text preprocessing: lowercasing, tokenisation and label mapping; then splitting into train/test sets and building a vocabulary.

In [None]:
# Read the hotel-review table straight from the course server and preview it.
csv_url = "https://frasca.di.unimi.it/MLDNN/input_data.csv"
df = pd.read_csv(csv_url)
df.head()

Unnamed: 0,Hotel_Address,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Hotel_number_reviews,Reviewer_number_reviews,Review_Score,Review,Review_Type
0,Scarsdale Scarsdale Place Kensington Kensingto...,5/2/2017,8.1,Copthorne Tara Hotel London Kensington,United Kingdom,7105,2,6.7,Expensive room rate that didn t include parki...,Bad_review
1,53 53 59 Kilburn High Road Maida Vale London C...,8/4/2016,7.1,BEST WESTERN Maitrise Hotel Maida Vale,United Kingdom,1877,8,5.8,Bedroom in the basement No windows Very small...,Bad_review
2,Pelai Pelai 28 Ciutat Vella 08002 Barcelona Spain,11/17/2016,8.6,Catalonia Ramblas 4 Sup,United Kingdom,4276,2,6.3,Room ready for a makeover Location,Bad_review
3,3 3 Place du G n ral Koenig 17th arr 75017 Par...,2/4/2016,7.1,Hyatt Regency Paris Etoile,United Kingdom,3973,3,5.8,Firstly the lady at the check in desk was qui...,Bad_review
4,Epping Epping Forest 30 Oak Hill London IG8 9N...,7/27/2016,7.5,Best Western PLUS Epping Forest,United Kingdom,587,7,3.3,Not being able to park my vehicle due to the ...,Bad_review


In [None]:
# SECURITY NOTE: unpickling runs arbitrary code embedded in the payload, so this
# is only acceptable because the URL is a trusted source. Prefer the CSV loaded
# into `df` whenever the pickled copy is not strictly required.
data_link = "https://frasca.di.unimi.it/MLDNN/input_data.pkl"
# Without a timeout a stalled connection would block the notebook indefinitely.
r = requests.get(data_link, timeout=60)
# Fail loudly on HTTP errors instead of trying to unpickle an error page.
r.raise_for_status()
data = pickle.loads(r.content)

  data = pickle.loads(r.content)


In [None]:
# Inspect the column names of the object loaded from the pickle file.
data.columns

Index(['Hotel_Address', 'Review_Date', 'Average_Score', 'Hotel_Name',
       'Reviewer_Nationality', 'Hotel_number_reviews',
       'Reviewer_number_reviews', 'Review_Score', 'Review', 'Review_Type'],
      dtype='object')

In [None]:
# Look at one raw review (row label 17) before any cleaning is applied.
df.loc[17, 'Review']

' We stayed in a room that was at the very end of the hotel this room was very very small from the door to the bed was 3 steps for me and im 5ft 2inch the bed The bed was very uncomfortable and gave both of us back ache there was no mattress topper which I was surprised has the mattress was very old when we took the sheet off im was just so shocked as when you walk in the hotel it looks amazing when you get to the room its a massive shock it was very disappointing The hotel lobby was beautiful it was very nice and welcoming '

In [None]:
def preprocessing(sentence):
  """Lower-case a raw review and split it into alphabetic tokens.

  Everything that is not an ASCII letter acts as a separator, and
  single-letter tokens (e.g. the stray "t" left over from "didn t") are
  dropped wherever they occur, including at the start or end of the text
  and when several of them are adjacent.

  Args:
    sentence: raw review text. Non-string values (such as a missing
      review read as NaN) yield an empty token list.

  Returns:
    List of lower-case tokens, each at least two letters long.
  """
  if not isinstance(sentence, str):
    return []
  # Maximal runs of letters are exactly the tokens obtained by replacing
  # every non-letter with whitespace and then splitting.
  tokens = re.findall(r'[a-z]+', sentence.lower())
  # Filtering by length removes every single-letter token; the previous
  # whitespace-anchored regex missed those at the edges or next to each other.
  return [token for token in tokens if len(token) > 1]


In [None]:
# Tokenise every review; each cell of 'cleaned_text' holds the token list returned by preprocessing().
df['cleaned_text'] = df['Review'].apply(preprocessing)

In [None]:
# Preview the tokenised reviews.
df['cleaned_text']

Unnamed: 0,cleaned_text
0,"[expensive, room, rate, that, didn, include, p..."
1,"[bedroom, in, the, basement, no, windows, very..."
2,"[room, ready, for, makeover, location]"
3,"[firstly, the, lady, at, the, check, in, desk,..."
4,"[not, being, able, to, park, my, vehicle, due,..."
...,...
13767,"[cleanliness, room, temperature, location, fac..."
13768,"[the, cost, of, the, parking, the, location]"
13769,"[the, staff, was, very, helpful]"
13770,"[tout, home, away, from, home]"


In [None]:
# Encode the two review types as integer labels: 0 = bad review, 1 = good review.
label_map = {'Bad_review': 0, 'Good_review': 1}
df['class'] = df['Review_Type'].map(label_map)

In [None]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=101,
)

In [None]:
# Integer class labels of the training split (0 = Bad_review, 1 = Good_review).
train_df['class']  # types in integer format

Unnamed: 0,class
2279,0
11812,1
12085,1
13580,1
1128,0
...,...
5695,0
8006,1
1361,0
1547,0


Additional text preprocessing: the vocabulary is built from the training set only; both sets are then encoded with it and padded to a fixed length.

In [None]:
# Tokenised reviews of the training split; the vocabulary below is built from these only.
train_df['cleaned_text']

Unnamed: 0,cleaned_text
2279,"[wifi, was, bad, customer, service, reception,..."
11812,"[everything, is, ghost, great]"
12085,"[price, is, too, much, place, is, good, room, ..."
13580,"[fantastic, place, with, awesome, staff]"
1128,"[ring, operator, or, housekeeping, no, one, pi..."
...,...
5695,"[they, couldn, give, me, french, bed, when, as..."
8006,"[staff, and, suite, were, great]"
1361,"[the, room, was, tiny, the, bed, was, quite, p..."
1547,"[breakfast, cold]"


Building a vocabulary for embedding

In [None]:
# Tally how often each token occurs across all training reviews.
count = Counter()
for review_tokens in train_df['cleaned_text']:
    count.update(review_tokens)

In [None]:
# Cap the vocabulary at the 10,000 most frequent training tokens.
vocab_sz = 10000
frequent = count.most_common(vocab_sz)

In [None]:
# Give every frequent word a unique id starting at 2; ids 0 and 1 are reserved below.
word2idx = {}
for idx, (word, _) in enumerate(frequent, start=2):
    word2idx[word] = idx
word2idx['<PAD>'] = 0  # filler id used to pad short sequences
word2idx['<UNK>'] = 1  # id shared by all out-of-vocabulary words

In [None]:
# Sanity check: vocabulary size (including the two special tokens) and the top words.
top_words = [word for word, _ in frequent[:10]]
print(f"Vocabulary size +2: {len(word2idx)}")
print("Top 10 frequent:", top_words)

Vocabulary size +2: 10002
Top 10 frequent: ['the', 'and', 'was', 'to', 'room', 'in', 'location', 'not', 'very', 'of']


In [None]:
# Every encoded review is padded or truncated to this many token ids.
max_len = 400

In [None]:
def encode_sequences(token_lists, mapping, maxlen, unk_idx=1):
    """Convert tokenised texts into fixed-length sequences of vocabulary ids.

    Args:
        token_lists: iterable of token lists, one list per text.
        mapping: dict from token to integer id.
        maxlen: number of ids per sequence after padding/truncation.
        unk_idx: id substituted for tokens missing from ``mapping``.
            Defaults to 1, the value that used to be hard-coded here.

    Returns:
        The result of ``pad_sequences``: one row per text, padded and
        truncated at the end so that each row has ``maxlen`` entries.
    """
    seqs = [[mapping.get(token, unk_idx) for token in tokens] for tokens in token_lists]
    # 'post' padding/truncation keeps the beginning of every review intact.
    return pad_sequences(seqs, maxlen=maxlen, padding='post', truncating='post')

# Encode both splits with the vocabulary that was built from the training data.
X_train, X_test = (
    encode_sequences(split['cleaned_text'], word2idx, max_len)
    for split in (train_df, test_df)
)

In [None]:
# Columns available in the training split, including the derived 'cleaned_text' and 'class'.
train_df.columns

Index(['Hotel_Address', 'Review_Date', 'Average_Score', 'Hotel_Name',
       'Reviewer_Nationality', 'Hotel_number_reviews',
       'Reviewer_number_reviews', 'Review_Score', 'Review', 'Review_Type',
       'cleaned_text', 'class'],
      dtype='object')

**CHANGE**: I converted 'Review_Score' to the float32 type, as TensorFlow expects this format and it speeds up training.

In [None]:
# preprocessing regression labels, turning them into float32, for more effective work with tensorflow
y_scores_train = train_df['Review_Score'].astype('float32').values
y_scores_test  = test_df['Review_Score'].astype('float32').values

## MODEL CONFIGURATION, OUTPUT, LOSS
Setting up the model's architecture, losses and output layers; then training the model and tuning its hyperparameters with Bayesian optimisation.

In [None]:
# Two-headed network: a shared Embedding + stacked-LSTM trunk feeding a
# regression head (review score) and a classification head (review type).
lr = 0.001
embedding_dim = 128
vocab_size = len(word2idx)  # number of ids, including <PAD> and <UNK>

# `max_len` is the padding length chosen when the sequences were encoded; it is
# deliberately not redefined here so the input shape cannot drift from the data.
inp = Input(shape=(max_len,))
# mask_zero=True marks id 0 (<PAD>) as padding so the LSTMs skip it. The
# sequences are post-padded, so without a mask the recurrent state would be
# updated on a long run of padding after the last real token.
# The deprecated `input_length` argument is dropped; the Input layer already
# fixes the sequence length.
x = Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True)(inp)
# NOTE(review): the string 'l1' uses the regularizer's default strength, which
# may be strong for an LSTM kernel - consider an explicit, smaller factor.
x = LSTM(128, kernel_regularizer='l1', return_sequences=True)(x)
x = Dropout(0.3)(x)
x = LSTM(64)(x)
x = Dense(64, activation='relu')(x)
x = Dense(32, activation='relu')(x)
x = Dense(16, activation='relu')(x)
# Extra hidden layer used only by the classification head.
x1 = Dense(16, activation='relu')(x)

# Output layers: linear unit for the score, sigmoid unit for the binary class.
reg_res = Dense(1, activation='linear', name='reg_res')(x)
class_res = Dense(1, activation='sigmoid', name='class_res')(x1)

model = Model(inputs=inp, outputs=[reg_res, class_res])
optimizer = Adam(learning_rate=lr)

# One loss per head: MSE for the regression, binary cross-entropy for the class.
model.compile(
    optimizer=optimizer,
    loss={'reg_res': 'mse', 'class_res': 'binary_crossentropy'},
    metrics={'class_res': 'accuracy', 'reg_res': [RootMeanSquaredError(name='rmse')]},
)




In [None]:
# Print the layer-by-layer summary of the two-output model.
model.summary()

**CHANGE**: I added early stopping because the model's accuracy stopped improving after about 20 epochs, so there was no need to wait for all 50 epochs to complete.

In [None]:
epochs = 10
# The patience must be smaller than the number of epochs; with patience equal
# to `epochs` the callback could never stop training early.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)

# One target array per named output; class labels are passed as a float32 array.
train_targets = {
    'reg_res': y_scores_train,
    'class_res': train_df['class'].to_numpy(dtype='float32'),
}
history = model.fit(
    x=X_train,
    y=train_targets,
    epochs=epochs,
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stop],
)

Epoch 1/10
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m224s[0m 1s/step - class_res_accuracy: 0.5032 - class_res_loss: 0.6976 - loss: 33.4372 - reg_res_loss: 15.5088 - reg_res_rmse: 3.7388 - val_class_res_accuracy: 0.5073 - val_class_res_loss: 0.6928 - val_loss: 5.5617 - val_reg_res_loss: 4.4081 - val_reg_res_rmse: 2.1041
Epoch 2/10
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m250s[0m 1s/step - class_res_accuracy: 0.4968 - class_res_loss: 0.6942 - loss: 5.1750 - reg_res_loss: 4.2293 - reg_res_rmse: 2.0565 - val_class_res_accuracy: 0.4927 - val_class_res_loss: 0.6964 - val_loss: 5.2906 - val_reg_res_loss: 4.5239 - val_reg_res_rmse: 2.1250
Epoch 3/10
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 1s/step - class_res_accuracy: 0.5014 - class_res_loss: 0.6946 - loss: 5.0051 - reg_res_loss: 4.2319 - reg_res_rmse: 2.0571 - val_class_res_accuracy: 0.5073 - val_class_res_loss: 0.6929 - val_loss: 5.2052 - val_reg_res_loss: 4.4125 - val_reg_res_

Bayesian optimisation to find the best hyperparameters

In [None]:
def build_model(hp):
  """Build and compile the two-output model for one tuner trial.

  Tuned hyperparameters (names are the keys used with ``hp``):
    * 'lstm_neurons'  - LSTM units, 32 to 256 in steps of 32
    * 'regularizer'   - kernel regularizer of the LSTM, 'l1' or 'l2'
    * 'dropout_rate'  - dropout after the LSTM, 0.1 to 0.6 in steps of 0.1
    * 'learning rate' - Adam learning rate, one of 1e-2, 1e-3, 5e-4, 1e-4

  Args:
    hp: hyperparameter container supplied by the tuner.

  Returns:
    A compiled model with outputs 'reg_res' (MAE loss) and 'class_res'
    (binary cross-entropy loss).
  """
  embedding_dim = 128
  max_len = 400
  vocab_sz = len(word2idx)

  inp = Input(shape=(max_len, ))
  # mask_zero=True makes the LSTM skip the <PAD> id (0) appended by post-padding.
  # The deprecated `input_length` argument is omitted; the Input layer fixes the length.
  x = Embedding(output_dim=embedding_dim, input_dim=vocab_sz, mask_zero=True)(inp)

  # tuning the number of neurons and weight regularizer choice in LSTM
  hp_units = hp.Int('lstm_neurons', min_value=32, max_value=256, step=32)
  hp_regularizer = hp.Choice('regularizer', values=['l1','l2'])
  x = LSTM(hp_units, kernel_regularizer=hp_regularizer)(x)
  # tuning dropout rate
  dp_rates = hp.Float('dropout_rate', min_value=0.1, max_value=0.6, step=0.1)
  x = Dropout(dp_rates)(x)

  reg_res = Dense(1, activation='linear', name='reg_res')(x)
  class_res = Dense(1, activation='sigmoid', name='class_res')(x)

  model = Model(inputs=inp, outputs=[reg_res, class_res])
  # tuning learning rate (the previously unused hard-coded `lr` local was removed)
  lr_choice = hp.Choice('learning rate', values=[1e-2, 1e-3, 5e-4, 1e-4])
  optimizer = Adam(learning_rate=lr_choice)
  model.compile(optimizer=optimizer, loss={'reg_res':'mae', 'class_res':'binary_crossentropy'}, metrics={'class_res':'accuracy','reg_res':'mae'})

  return model


In [None]:
# Bayesian search over build_model's hyperparameters, minimising the total validation loss.
tuner = kt.BayesianOptimization(
    build_model,
    objective='val_loss',
    max_trials=5,
    directory='bayesian',
    project_name='bayes_tuner',
)

In [None]:
# The search gets its own early-stopping callback: each trial runs for at most
# 5 epochs, so the patience has to be smaller than that to have any effect,
# and the search no longer depends on the callback created for the main fit.
tuner_early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True, verbose=1)
tuner.search(
    X_train,
    {'reg_res': y_scores_train, 'class_res': train_df['class']},
    batch_size=64,
    epochs=5,
    validation_split=0.1,
    callbacks=[tuner_early_stop],
)


Trial 1 Complete [00h 03m 17s]
val_loss: 2.5163142681121826

Best val_loss So Far: 2.5163142681121826
Total elapsed time: 00h 03m 17s

Search: Running Trial #2

Value             |Best Value So Far |Hyperparameter
256               |32                |lstm_neurons
l1                |l2                |regularizer
0.4               |0.5               |dropout_rate
0.01              |0.0005            |learning rate

Epoch 1/5
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m281s[0m 2s/step - class_res_accuracy: 0.5082 - class_res_loss: 0.7326 - loss: 11.1937 - reg_res_loss: 2.1790 - reg_res_mae: 2.1790 - val_class_res_accuracy: 0.4927 - val_class_res_loss: 0.6945 - val_loss: 4.1748 - val_reg_res_loss: 1.8583 - val_reg_res_mae: 1.8507
Epoch 2/5
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 2s/step - class_res_accuracy: 0.5073 - class_res_loss: 0.7043 - loss: 4.2074 - reg_res_loss: 1.8718 - reg_res_mae: 1.8718 - val_class_res_accuracy: 0.5073 - val_class_r

In [None]:
# Retrieve the best trial found by the search and report every tuned value.
best_hp = tuner.get_best_hyperparameters()[0]
best_model = tuner.get_best_models()[0]
print("Best units:",   best_hp.get('lstm_neurons'))
# The regularizer is tuned as well, so report it with the other hyperparameters.
print("Best regularizer:", best_hp.get('regularizer'))
print("Best dropout:", best_hp.get('dropout_rate'))
print("Best lr:",      best_hp.get('learning rate'))

## MODEL EVALUATION
Evaluating the model using accuracy for the review_type output and RMSE for the review_score output; plotting how accuracy, RMSE and the losses evolve epoch by epoch.

In [None]:
# Score the trained model on the held-out test set, one target per named output.
test_targets = {
    'reg_res': y_scores_test,
    'class_res': test_df['class'],
}
model.evaluate(X_test, test_targets)

In [None]:
# The model was built with outputs=[reg_res, class_res], so predictions come back in that order.
test_predictions = model.predict(X_test)
pred_score, pred_class = test_predictions

In [None]:
# Regression RMSE
mse = mean_squared_error(y_scores_test ,pred_score)
rmse = np.sqrt(mse)
rmse

In [None]:
# Round the sigmoid outputs to hard 0/1 labels before computing test accuracy.
pred_labels = pred_class.round()
acc = accuracy_score(y_true=test_df['class'], y_pred=pred_labels)
acc

Learning curves for 2 outputs: Review_score and Review_type

In [None]:
# Accuracy and RMSE
train_mse = history.history['reg_res_loss']
train_rmse = np.sqrt(train_mse)
val_mse = history.history['val_reg_res_loss']
val_rmse = np.sqrt(val_mse)

plt.figure(figsize=(10, 4))
plt.plot(train_rmse,  label='train RMSE')
plt.plot(val_rmse,label='val RMSE')
plt.title('RMSE for regression')
plt.xlabel('Epochs'); plt.ylabel('RMSE')
plt.legend(); plt.show()

plt.figure(figsize=(10, 4))
plt.plot(history.history['class_res_accuracy'], label='train accuracy')
plt.plot(history.history['val_class_res_accuracy'],label='val accuracy')
plt.title('Classification accuracy')
plt.xlabel('Epochs'); plt.ylabel('Accuracy')
plt.legend(); plt.show()

For regression, the RMSE reaches an almost constant value after a few epochs; for classification, the accuracy is chaotic.

In [None]:
# Loss curves per head: each entry is (title, y-axis label, [(values, legend label), ...]).
loss_panels = [
    ('Regression Loss (RMSE)', 'RMSE',
     [(train_rmse, 'train RMSE'), (val_rmse, 'val RMSE')]),
    ('Classification Loss', 'BCE',
     [(history.history['class_res_loss'], 'train BCE'),
      (history.history['val_class_res_loss'], 'val  BCE')]),
]

for panel_title, y_label, curves in loss_panels:
    plt.figure(figsize=(10, 5))
    for values, curve_label in curves:
        plt.plot(values, label=curve_label)
    plt.title(panel_title)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()
    plt.show()

So, for classification we see the loss fall somewhat, while for regression it reached its plateau too soon.