In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from keras.optimizers import Adam

2023-08-31 19:42:43.781386: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the training and validation splits from their Excel workbooks
# (training file first, then validation).
train, val = (
    pd.read_excel(workbook)
    for workbook in (
        'ClassificationDataset-train2.xlsx',
        'ClassificationDataset-valid2.xlsx',
    )
)

In [3]:
# Discard every row that has a missing value in any column, for both splits.
train = train.dropna()
val = val.dropna()

In [4]:
# Text-pipeline hyper-parameters.
max_words = 10000      # size passed to the tokenizer and the embedding table
max_len = 100          # every sequence is padded/truncated to this many tokens
embedding_dim = 200    # must equal the width of the GloVe vectors loaded later

# Number of distinct labels present in the training split.
num_classes = np.unique(train['ClassLabel']).size
num_classes

5

In [5]:

# Learn the label -> integer mapping from the training labels only, then
# apply that same mapping to both splits.
le = LabelEncoder().fit(train['ClassLabel'])
train_labels_encoded, val_labels_encoded = (
    le.transform(split['ClassLabel']) for split in (train, val)
)

# One-hot encode by selecting rows of an identity matrix: class k -> row k.
train_labels_onehot = np.take(np.eye(num_classes), train_labels_encoded, axis=0)
val_labels_onehot = np.take(np.eye(num_classes), val_labels_encoded, axis=0)


In [6]:
# Cells read from Excel that look numeric come back as int/float rather than
# str, and the Keras Tokenizer lower-cases each entry as a string. Coerce the
# column to str up front so such rows do not break tokenization; rows that
# are already strings are unaffected.
train_texts = train['ReviewText'].astype(str)
val_texts = val['ReviewText'].astype(str)

# The vocabulary is learned from the training split only; the validation
# split is encoded with that same vocabulary.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_texts)
train_sequences = tokenizer.texts_to_sequences(train_texts)
val_sequences = tokenizer.texts_to_sequences(val_texts)

# Bring every sequence to exactly max_len token ids.
train_data_pad = pad_sequences(train_sequences, maxlen=max_len)
val_data_pad = pad_sequences(val_sequences, maxlen=max_len)


In [7]:
# Parse the pre-trained GloVe text file into a {token: vector} lookup.
# Each row is "<token> <v1> ... <v_embedding_dim>" separated by single spaces.
# A bare str.split() splits on *any* Unicode whitespace, so a token that is
# itself a whitespace-like character (or contains a space) would shift a float
# into the token slot and yield a vector of the wrong length. Splitting from
# the right on a literal space, capped at embedding_dim splits, keeps the
# token intact whatever characters it contains.
embedding_index = {}
with open('glove.twitter.27B.200d.txt', encoding='utf-8') as f:
    for line in f:
        fields = line.rstrip().rsplit(' ', embedding_dim)
        if len(fields) != embedding_dim + 1:
            # Malformed or truncated row: skip it rather than store a vector
            # that cannot be copied into the embedding matrix.
            continue
        word = fields[0]
        coef = np.asarray(fields[1:], dtype='float32')
        embedding_index[word] = coef

# Row i of the matrix holds the GloVe vector of the token whose tokenizer
# index is i. Rows for tokens without a pre-trained vector stay all-zero.
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i >= max_words:
        # Index falls outside the embedding table; nothing to fill.
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [8]:
# Frozen pre-trained embeddings feeding two stacked LSTMs and a softmax head.
model = Sequential([
    Embedding(
        max_words,
        embedding_dim,
        input_length=max_len,
        weights=[embedding_matrix],
        trainable=False,
    ),
    # First LSTM emits the full sequence so the next LSTM can consume it.
    LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    # An intermediate LSTM(64, return_sequences=True) layer was tried here and left disabled.
    LSTM(16, return_sequences=False, dropout=0.2, recurrent_dropout=0.2),
    Dense(num_classes, activation='softmax'),
])

In [9]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 200)          2000000   
                                                                 
 lstm (LSTM)                 (None, 100, 128)          168448    
                                                                 
 lstm_1 (LSTM)               (None, 16)                9280      
                                                                 
 dense (Dense)               (None, 5)                 85        
                                                                 
Total params: 2,177,813
Trainable params: 177,813
Non-trainable params: 2,000,000
_________________________________________________________________


In [10]:
# Adam with an explicit learning rate; the loss matches the one-hot targets
# and accuracy is tracked during training.
adam_optimizer = Adam(learning_rate=0.001)
model.compile(
    loss='categorical_crossentropy',
    optimizer=adam_optimizer,
    metrics=['accuracy'],
)


In [11]:
# Stop once validation loss has not improved for 5 epochs and roll the model
# back to the weights of its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
# A ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6)
# callback was tried as well and is currently disabled.


In [12]:
# Train for at most 50 epochs; the validation split is evaluated after each
# epoch and drives the early-stopping callback.
model.fit(
    x=train_data_pad,
    y=train_labels_onehot,
    batch_size=64,
    epochs=50,
    validation_data=(val_data_pad, val_labels_onehot),
    callbacks=[early_stopping],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50


<keras.callbacks.History at 0x7fb584795a20>

In [13]:
# Per-class probabilities for every validation row; the highest-scoring
# column is taken as the predicted class index.
val_predictions = model.predict(val_data_pad)
val_predictions_classes = val_predictions.argmax(axis=1)

# NOTE(review): with single-label multiclass targets, micro-averaged F1 is the
# same number as plain accuracy; consider average='macro' if per-class
# balance matters — confirm which metric is actually wanted.
f1_scores = f1_score(
    y_true=val_labels_encoded,
    y_pred=val_predictions_classes,
    average='micro',
)
f1_scores



0.8143213988343048

In [14]:
# Write the trained model to 'dataset_2.h5' (path is relative to the working directory).
model.save('dataset_2.h5')