In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import nltk
import os
import warnings
from nltk.corpus import stopwords
import keras_tuner as kt
import tensorflow as tf

nltk.download('wordnet')
nltk.download('stopwords')
warnings.filterwarnings('ignore')

path = os.path.join(os.getcwd(), 'DATA/')
path_fn= path + 'rt-polarity.neg'
with open(path_fn, "r",encoding='utf-8', errors='ignore') as f:
    content_fn = f.read()  
texts_neg = content_fn.splitlines()
path_fp = path + 'rt-polarity.pos'
with open(path_fp, "r",encoding='utf-8', errors='ignore') as f:
    content_fp = f.read()  
texts_pos = content_fp.splitlines()
df = pd.DataFrame(texts_neg + texts_pos, columns=['Text'])
df['Target'] = 0
df['Target'][5332:] = 1

stop_words = set(stopwords.words('english'))
df['Text'] = df['Text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    
    st = ""
    for w in w_tokenizer.tokenize(text):
        st = st + lemmatizer.lemmatize(w) + " "
    return st
    
df['Text'] = df['Text'].apply(lemmatize_text)

X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Target'], stratify = df['Target'], random_state=42)

vocab_size = 1000
embedding_dim = 100
max_length = 50
padding_type='post'
trunc_type='post'

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, padding='post', maxlen=max_length)
test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, padding='post', maxlen=max_length)

def model_builder(hp):
    model = keras.Sequential([
        keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
        keras.layers.Bidirectional(keras.layers.LSTM(64)),
        keras.layers.Dense(24, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid')
    ])
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model.compile(loss='binary_crossentropy',
                  optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  metrics=['accuracy'])
    return model

num_epochs = 50
history = model.fit(train_padded, y_train, 
                    epochs=num_epochs, verbose=1, 
                    validation_split=0.1)

tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=50,
                     factor=5,
                     directory='my_dir',
                     project_name='intro_to_kt')

stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)

tuner.search(train_padded, y_train, epochs=50, validation_split=0.2, callbacks=[stop_early])


best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

print(best_hps)

[nltk_data] Downloading package wordnet to /home/sv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Reloading Tuner from my_dir/intro_to_kt/tuner0.json
<keras_tuner.src.engine.hyperparameters.hyperparameters.HyperParameters object at 0x7f9c90ef0cd0>


In [12]:
model = tuner.hypermodel.build(best_hps)
history = model.fit(train_padded, y_train, epochs=50, validation_split=0.2)

val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Best epoch: 3


In [13]:
hypermodel = tuner.hypermodel.build(best_hps)
hypermodel.fit(train_padded, y_train, epochs=best_epoch, validation_split=0.2)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7f9ca2523cd0>

In [15]:
eval_result = hypermodel.evaluate(test_padded, y_test)
print("[test loss, test accuracy]:", eval_result)

[test loss, test accuracy]: [0.6038009524345398, 0.6740435361862183]
