# Importing libraries

In [1]:
import numpy as np
import pandas as pd

import re
from nltk.corpus import stopwords

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

import tensorflow as tf


In [2]:
data = pd.read_csv('F:/hotel_review.csv')

In [3]:
data

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5
...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5
20487,great location price view hotel great quick pl...,4
20488,"ok just looks nice modern outside, desk staff ...",2
20489,hotel theft ruined vacation hotel opened sept ...,1


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
 1   Rating  20491 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 320.3+ KB


# Processing Text

In [5]:
stop_words = stopwords.words('english')

In [6]:
def process_text(text):
    text = re.sub(r'\d+', ' ', text)
    text = text.split()
    text = " ".join([word for word in text if word.lower().strip() not in stop_words])
    return text

In [7]:
reviews = data['Review'].apply(process_text)

In [8]:
num_words = 10000

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(reviews)

sequences = tokenizer.texts_to_sequences(reviews)

In [9]:
max_seq_length = np.max(list(map(lambda x: len(x), sequences)))

print("Max sequence length:", max_seq_length)

Max sequence length: 1833


In [10]:
inputs = pad_sequences(sequences, maxlen=max_seq_length, padding='post')

In [11]:
inputs

array([[   8,    1,  174, ...,    0,    0,    0],
       [ 139,  136,  262, ...,    0,    0,    0],
       [   8,    9,   76, ...,    0,    0,    0],
       ...,
       [ 139,  733,    8, ...,    0,    0,    0],
       [   1, 3785, 2479, ...,    0,    0,    0],
       [  27, 1156,  187, ...,    0,    0,    0]])

# Encoding Labels

In [12]:
data['Rating'].value_counts()

5    9054
4    6039
3    2184
2    1793
1    1421
Name: Rating, dtype: int64

In [13]:
#labels = np.array(data['Rating'].apply(lambda x: 1 if x == 5 else 0))
labels=np.array(data['Rating'])

In [14]:
labels

array([4, 2, 3, ..., 2, 1, 2], dtype=int64)

# Splitting

In [15]:
train_inputs, test_inputs, train_labels, test_labels = train_test_split(inputs, labels, train_size=0.7, random_state=100)

# Model-02

In [16]:
embedding_dim = 16

inputs = tf.keras.Input(shape=(max_seq_length,))

embedding = tf.keras.layers.Embedding(
    input_dim=num_words,
    output_dim=embedding_dim,
    input_length=max_seq_length
)(inputs)

gru = tf.keras.layers.Bidirectional(
    tf.keras.layers.GRU(16, return_sequences=True)
)(embedding)

flatten = tf.keras.layers.Flatten()(gru)

outputs = tf.keras.layers.Dense(1, activation='softmax')(flatten)


model = tf.keras.Model(inputs, outputs)

model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1833)]            0         
                                                                 
 embedding (Embedding)       (None, 1833, 16)          160000    
                                                                 
 bidirectional (Bidirectiona  (None, 1833, 32)         3264      
 l)                                                              
                                                                 
 flatten (Flatten)           (None, 58656)             0         
                                                                 
 dense (Dense)               (None, 1)                 58657     
                                                                 
Total params: 221,921
Trainable params: 221,921
Non-trainable params: 0
_______________________________________________________

In [17]:
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(name='auc')
    ]
)

history = model.fit(
    train_inputs,
    train_labels,
    validation_split=0.2,
    batch_size=80,
    epochs=5,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=2,
            restore_best_weights=True
        )
    ]
)

Epoch 1/5
Epoch 2/5
Epoch 3/5


# Results

In [18]:
model.evaluate(test_inputs, test_labels)



[-35155.28125, 0.06636304408311844, 0.0]

## model03-Autokeras

In [21]:
!pip install autokeras

Collecting autokeras
  Using cached autokeras-1.0.19-py3-none-any.whl (162 kB)
Installing collected packages: autokeras
Successfully installed autokeras-1.0.19


In [19]:
import autokeras as ak

# Preparing the data for autokeras

In [20]:
split=round(len(data)*.70)
split

14344

In [55]:
x=data['Review']

In [56]:
y=data['Rating']

In [57]:
train_inputs,test_inputs=x[:split],x[split:]

In [58]:
train_labels,test_labels=y[:split],y[split:]

In [59]:
X_train_ak = np.array(train_inputs)
y_train_ak = np.array(train_labels)
X_test_ak = np.array(test_inputs)
y_test_ak = np.array(train_labels)

The training set is used to fit the models; the validation set is used to estimate prediction error for model selection; the test set is used for assessment of the generalization error of the final chosen model

In [61]:

keras = ak.TextRegressor(overwrite=True, max_trials=3) #TextRegressor with maximum trials of 3 i.e The AutoKeras will create a maximum of 3 prediction models  

keras.fit(X_train_ak, y_train_ak, epochs=10, validation_split=0.2)

Trial 3 Complete [00h 00m 46s]
val_loss: 1.5750421285629272

Best val_loss So Far: 0.6759951114654541
Total elapsed time: 00h 02m 53s
INFO:tensorflow:Oracle triggered exit


INFO:tensorflow:Oracle triggered exit


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: .\text_regressor\best_model\assets


INFO:tensorflow:Assets written to: .\text_regressor\best_model\assets


<keras.callbacks.History at 0x1cc973735b0>

In [62]:
# Show the built models
keras_export = keras.export_model()
keras_export.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None,)]                 0         
                                                                 
 expand_last_dim (ExpandLast  (None, 1)                0         
 Dim)                                                            
                                                                 
 text_vectorization (TextVec  (None, 64)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 64, 128)           640128    
                                                                 
 dropout (Dropout)           (None, 64, 128)           0         
                                                                 
 conv1d (Conv1D)             (None, 62, 32)            12320 

In [63]:
# Predict the test data
from itertools import chain
pred_keras = keras.predict(X_test_ak)
pred_keras = list(chain(*pred_keras))
pred_keras2 = [i if i <= 5 else 5 for i in pred_keras]
pred_keras2 = [i if i >= 1 else 1 for i in pred_keras2]
pred_keras2 = [round(i) for i in pred_keras2]



In [78]:
# Compute the RMSE
#rmse_keras = mean_squared_error(y_test_ak, pred_keras2)**0.5
#print('RMSE: ' + str(rmse_keras))

In [75]:
print('Confusion Matrix')
pd.DataFrame(confusion_matrix(test_labels, pred_keras2), index=[1,2,3,4,5], columns=[1,2,3,4,5])

Confusion Matrix


Unnamed: 0,1,2,3,4,5
1,183,87,34,13,0
2,151,161,110,50,1
3,49,137,252,182,7
4,11,115,408,1015,163
5,4,46,330,1771,867


## conclusion

the accuracy of the sequential model is 60% and for input model(another custom model) is 50% because in the 2nd model we had less amount of layer from that we can say that the sequential model is best for this type of data and if we increase the epoch and batch size we can increase the accuracy 

And the autokeras tells that the this is best layer for our data and our sequential model is more are less same thats why we can get better accuracy compare to 2nd model