In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from keras.optimizers import Adam
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, concatenate,Flatten
from keras.regularizers import l2

In [25]:

train = pd.read_excel('ClassificationDataset-train1.xlsx')
val = pd.read_excel('ClassificationDataset-valid1.xlsx')

In [26]:
train

Unnamed: 0,Negative Review,Positive Review
0,I am so angry that i made this post available ...,Only the park outside of the hotel was beauti...
1,No Negative,No real complaints the hotel was great great ...
2,Rooms are nice but for elderly a bit difficul...,Location was good and staff were ok It is cut...
3,My room was dirty and I was afraid to walk ba...,Great location in nice surroundings the bar a...
4,You When I booked with your company on line yo...,Amazing location and building Romantic setting
...,...,...
6327,Roadworks made it difficult to access hotel b...,Helpful staff excellent breakfast
6328,Restaurant was little cramped,Staff friendly Lovely clean room Good facilities
6329,more comfortable seats in bar,very family friendly staff
6330,No Negative,The rooms was excellent Value for money


In [5]:
val

Unnamed: 0,Negative Review,Positive Review
0,No Negative,Whole experience added to our fantastic adven...
1,No Negative,Refer to comments regarding our stay from June...
2,Construction on streets,I have no idea what genius perks are Location...
3,noisy room was not dark in the night,No Positive
4,The first room they gave us was very small We...,Close to many of the London attractions
...,...,...
2662,No Negative,Really comfortable bed and a very quite room ...
2663,Very very small rooms A bit like sleeping in ...,When you first arrive the main reception and ...
2664,Asked for upgrade twin club got a double and a...,Near queens way tube
2665,Needs a smartening up of its appearance But t...,The photos in booking com give the impression...


In [6]:
train_combined = pd.DataFrame(columns=['Review', 'Label'])
train_combined['Review'] = pd.concat([train['Negative Review'], train['Positive Review']], ignore_index=True)
num_negative_reviews = len(train['Negative Review'])
train_combined['Label'] = [0] * num_negative_reviews + [1] * (len(train_combined) - num_negative_reviews)
train_combined = train_combined.sample(frac=1, random_state=42).reset_index(drop=True)
train_combined

Unnamed: 0,Review,Label
0,Nothing,1
1,Great location beautiful building,1
2,The beds were really comfortable and clean Th...,1
3,We were upgraded to a room with an amazing vi...,1
4,No Negative,0
...,...,...
12659,Location,1
12660,No Negative,0
12661,A significant drawback was the sliding glass ...,0
12662,The room is bit small anyway it s London,0


In [7]:
val_combined = pd.DataFrame(columns=['Review', 'Label'])
val_combined['Review'] = pd.concat([val['Negative Review'], val['Positive Review']], ignore_index=True)
num_negative_reviews = len(val['Negative Review'])
val_combined['Label'] = [0] * num_negative_reviews + [1] * (len(val_combined) - num_negative_reviews)
val_combined = val_combined.sample(frac=1, random_state=42).reset_index(drop=True)
val_combined

Unnamed: 0,Review,Label
0,Nothing,0
1,Near UGround,1
2,location and staff,1
3,We didn t like having to leave,0
4,No access to fresh or window in the room,0
...,...,...
5329,No Positive,1
5330,courteous staff and convenient location,1
5331,The location was close to eating places and p...,1
5332,Location,1


In [8]:
max_words = 10000
max_len = 100
embedding_dim = 200  # Correct the embedding dimension to match GloVe embeddings
num_classes = len(np.unique(train_combined['Label']))


In [9]:
le = LabelEncoder()
le.fit(train_combined['Label'])
train_labels_encoded = le.transform(train_combined['Label'])
val_labels_encoded = le.transform(val_combined['Label'])
train_labels_onehot = np.eye(num_classes)[train_labels_encoded]
val_labels_onehot = np.eye(num_classes)[val_labels_encoded]

In [10]:
le.transform(train_combined['Label']).shape

(12664,)

In [11]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_combined['Review'])
train_sequences = tokenizer.texts_to_sequences(train_combined['Review'])
val_sequences = tokenizer.texts_to_sequences(val_combined['Review'])
train_data_pad = pad_sequences(train_sequences, maxlen=max_len)
val_data_pad = pad_sequences(val_sequences, maxlen=max_len)


In [13]:
embedding_index = {}
with open('glove.twitter.27B.200d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coef = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coef

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < max_words:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [14]:
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_len, weights=[embedding_matrix], trainable=False))
model.add(LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)) 
#model.add(LSTM(64,return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
model.add(LSTM(16,return_sequences=False, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 200)          2000000   
                                                                 
 lstm (LSTM)                 (None, 100, 128)          168448    
                                                                 
 lstm_1 (LSTM)               (None, 16)                9280      
                                                                 
 dense (Dense)               (None, 2)                 34        
                                                                 
Total params: 2,177,762
Trainable params: 177,762
Non-trainable params: 2,000,000
_________________________________________________________________


In [15]:
adam_optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=adam_optimizer, loss='binary_crossentropy', metrics=['accuracy'])


In [16]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [17]:
model.fit(train_data_pad, train_labels_onehot, epochs=50, batch_size=64,
          validation_data=(val_data_pad, val_labels_onehot),
          callbacks=[early_stopping])


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50


<keras.callbacks.History at 0x7fde443a8d90>

In [21]:
val_predictions = model.predict(val_data_pad)
val_predictions_classes = np.argmax(val_predictions, axis=1)
f1_scores = f1_score(val_labels_encoded, val_predictions_classes, average='micro')
f1_scores



0.9326959130108736

In [27]:
model.save('dataset_1.h5')

In [25]:
test_predictions = model.predict(test_data_pad)
test_predictions_classes = np.argmax(test_predictions, axis=1)
test_predictions_classes



array([1])