In [None]:
!pip install keras_tuner

In [None]:
import os
import re
import pickle
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from keras.preprocessing import text, sequence
from keras.layers import Input, Dense, Embedding, Flatten, Conv2D, MaxPool2D, Bidirectional, LSTM, GRU, concatenate, GlobalMaxPooling1D, GlobalAveragePooling1D, SpatialDropout1D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.models import Model, Sequential
from keras import backend as K
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from tensorflow.keras.callbacks import EarlyStopping
# Early-stopping callback handed to the hyperparameter search.
# It watches the validation loss (the search call supplies validation_data) and
# uses a patience shorter than the 10-epoch search budget, so it can actually
# cut a stalled trial short; patience=10 on the training loss could never fire
# within 10 epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=3)
from sklearn.preprocessing import LabelEncoder
# keras Tuner
import keras_tuner as kt
# HP
from keras_tuner import Hyperband

from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
import nltk
import string 
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
lb_enc = LabelEncoder()
from keras_tuner import RandomSearch


In [None]:
# Load the three pre-split spreadsheets (train / dev / test) as DataFrames.
data_train, data_dev, data_test = (
    pd.read_excel(xlsx_path)
    for xlsx_path in (
        '/kaggle/input/spam-review-detection/data_train.xlsx',
        '/kaggle/input/spam-review-detection/data_dev.xlsx',
        '/kaggle/input/spam-review-detection/data_test.xlsx',
    )
)

In [None]:
# Target vectors: the 'Label' column of each split, in train / dev / test order.
y_train, y_dev, y_test = (
    split_frame['Label'] for split_frame in (data_train, data_dev, data_test)
)

In [None]:
# Build the vocabulary from the training split only, so dev/test text never
# influences the word index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data_train['transformed_text'])

# Encode every split's 'transformed_text' column as lists of integer word ids
# using that single, train-fitted tokenizer.
text_to_sequence_train, text_to_sequence_dev, text_to_sequence_test = (
    tokenizer.texts_to_sequences(split_frame['transformed_text'])
    for split_frame in (data_train, data_dev, data_test)
)

In [None]:
# Longest encoded training review; every split is padded to this length.
max_length_sequence_train = max(map(len, text_to_sequence_train))

# Left-pad ("pre") the training sequences to that common length, then append a
# trailing axis of size 1 (axis=2).
# NOTE(review): build_model declares its Embedding with
# input_length=max_length_sequence_train, i.e. a 2-D integer input; the extra
# axis looks like a leftover from another architecture - confirm it is wanted.
padded_train = np.expand_dims(
    pad_sequences(
        text_to_sequence_train,
        maxlen=max_length_sequence_train,
        padding="pre",
    ),
    axis=2,
)
padded_train

In [None]:
# Left-pad ("pre") the dev sequences to the training maximum length so they
# match the training input width, then append a trailing axis of size 1.
padded_dev = np.expand_dims(
    pad_sequences(
        text_to_sequence_dev,
        maxlen=max_length_sequence_train,
        padding="pre",
    ),
    axis=2,
)
len(padded_dev)

In [None]:
# Left-pad ("pre") the test sequences to the training maximum length so they
# match the training input width, then append a trailing axis of size 1.
padded_test = np.expand_dims(
    pad_sequences(
        text_to_sequence_test,
        maxlen=max_length_sequence_train,
        padding="pre",
    ),
    axis=2,
)
len(padded_test)

In [None]:
# Embedding input size: one row per word id in the fitted tokenizer plus one
# extra row (original author's note: 18359).
VOC_SIZE = len(tokenizer.word_index) + 1


def build_model(hp):
    """Build and compile a stacked-LSTM binary classifier for KerasTuner.

    Args:
        hp: a `keras_tuner.HyperParameters` instance. Every tunable value below
            is registered on it by name.

    Returns:
        A compiled `Sequential` model: Embedding -> 1 to 5 LSTM layers (each
        followed by Dropout) -> Dense + Dropout -> one sigmoid unit, using
        binary cross-entropy loss, the Adam optimizer and the accuracy metric.
    """
    model = Sequential()
    model.add(Embedding(
        VOC_SIZE,
        hp.Int('embedding_dim', min_value=200, max_value=400, step=50),
        input_length=max_length_sequence_train,
    ))

    # Read the depth exactly once so the loop bound and the return_sequences
    # test below are guaranteed to use the same value and the same 1-5 range.
    num_layers = hp.Int('num_layers', 1, 5)
    for i in range(num_layers):
        # Every LSTM except the last must emit the full sequence so that the
        # next LSTM receives 3-D input; the last one emits only its final state.
        is_last_layer = (i == num_layers - 1)
        model.add(LSTM(
            units=hp.Int('units_' + str(i), min_value=32, max_value=128, step=32),
            activation='tanh',
            recurrent_activation='sigmoid',
            return_sequences=not is_last_layer,
        ))
        model.add(Dropout(hp.Float('dropout_' + str(i), 0.2, 0.5, 0.1)))

    model.add(Dense(
        units=hp.Int('dense_units', 32, 128, 32),
        activation=hp.Choice('dense_activation_0', ['relu', 'tanh', 'sigmoid', 'elu']),
    ))
    model.add(Dropout(hp.Float('dense_dropout', 0.2, 0.5, 0.1)))
    model.add(Dense(units=1, activation='sigmoid'))

    # Training-loop hyperparameters. They are registered here so that they are
    # sampled and stored with each trial and can be read back later with
    # `.get('num_epochs')` / `.get('batch_size')`. Building the model does not
    # apply them: they only take effect where the code that calls `fit`
    # passes them on.
    hp.Int('num_epochs', min_value=10, max_value=100, step=10)
    hp.Int('batch_size', 32, 128, step=32)

    model.compile(
        optimizer=Adam(learning_rate=hp.Float('learning_rate', 0.0001, 0.01, step=0.0001)),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
class _TunedBatchSizeHyperModel(kt.HyperModel):
    """Hypermodel that builds with `build_model` and trains with the trial's batch size."""

    def build(self, hp):
        """Return the compiled model for the hyperparameters in `hp`."""
        return build_model(hp)

    def fit(self, hp, model, *args, **kwargs):
        """Train `model`, forcing the batch size that was sampled for this trial.

        `build_model` registers 'batch_size' on `hp`; reading it back here is
        what makes that hyperparameter influence the trial's score. Any
        `batch_size` passed to `search()` is overridden.
        """
        kwargs['batch_size'] = hp.get('batch_size')
        return model.fit(*args, **kwargs)


# Random search over the build_model search space, ranked by validation accuracy.
# overwrite=True clears any previous results under my_dir/my_project *before*
# the tuner initialises, so a re-run starts from a clean slate instead of
# reloading stale trials (deleting the directory after construction is too late).
tuner_lstm = RandomSearch(
    _TunedBatchSizeHyperModel(),
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=2,
    overwrite=True,
    directory='my_dir',
    project_name='my_project'
)

In [None]:
# Run the hyperparameter search: each trial trains on the training split for
# at most 10 epochs and is scored on the dev split.
# No batch_size is passed here on purpose: a free-standing
# kt.HyperParameters() object is not attached to the tuner, so calling
# hp.Int('batch_size', ...) on it only ever yields its default (32, which is
# also Keras' default batch size) and would wrongly suggest the value is tuned.
tuner_lstm.search(
    padded_train,
    y_train,
    epochs=10,
    validation_data=(padded_dev, y_dev),
    callbacks=[early_stop],
)

In [None]:
# Ask the tuner for its single best hyperparameter set and show the raw
# name -> value mapping.
ranked_hps = tuner_lstm.get_best_hyperparameters(num_trials=1)
best_hps_LSTM = ranked_hps[0]
print(best_hps_LSTM.values)

In [None]:
# Human-readable report of the winning hyperparameter set.
print(f'Embedding dimension: {best_hps_LSTM.get("embedding_dim")}')
print(f'Number of hidden layers: {best_hps_LSTM.get("num_layers")}')

# One group of lines per LSTM layer that is actually part of the best model.
for i in range(best_hps_LSTM.get('num_layers')):
    print(f'Units for layer {i+1}: {best_hps_LSTM.get(f"units_{i}")}')
    print(f'Dropout for layer {i+1}: {best_hps_LSTM.get(f"dropout_{i}")}')
    # A per-layer activation is reported only when the search space defines
    # one; otherwise say so and move on to the next layer.
    if f'activation_{i}' not in best_hps_LSTM:
        print('Activation parameter does not exist.')
        continue
    activation_fn = best_hps_LSTM.get(f'activation_{i}')
    print(f'Activation Function for layer {i+1}: {activation_fn}')

# Dense head and training settings, one per line.
print(
    f'Dense units: {best_hps_LSTM.get("dense_units")}',
    f'Dense activation function: {best_hps_LSTM.get("dense_activation_0")}',
    f'Dense dropout: {best_hps_LSTM.get("dense_dropout")}',
    f'Learning rate: {best_hps_LSTM.get("learning_rate")}',
    f'Batch size: {best_hps_LSTM.get("batch_size")}',
    f'Number of epochs: {best_hps_LSTM.get("num_epochs")}',
    sep='\n',
)

In [None]:
# Fetch the top-ranked model saved by the search (num_models=1 is the default),
# build it against the shape of the padded training array and print its
# layer-by-layer summary.
best_model = tuner_lstm.get_best_models(num_models=1)[0]
best_model.build(padded_train.shape)
best_model.summary()

In [None]:
# Build a new model from the best hyperparameters through the tuner's
# hypermodel; it is trained in the next cell.
best_Model = tuner_lstm.hypermodel.build(best_hps_LSTM)

In [None]:
# Final training run with the best hyperparameters.
# The search trained every trial for a fixed 10 epochs, so the sampled
# 'num_epochs' value was never evaluated. It is therefore used only as an
# upper bound: training stops once the dev loss has not improved for 5 epochs
# and the weights of the best epoch are restored.
final_early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Keep the History object so the learning curves can be inspected afterwards.
history = best_Model.fit(
    padded_train,
    y_train,
    epochs=best_hps_LSTM.get('num_epochs'),
    validation_data=(padded_dev, y_dev),
    batch_size=best_hps_LSTM.get('batch_size'),
    callbacks=[final_early_stop],
)

In [None]:
from sklearn.metrics import classification_report


In [None]:
# Training-split evaluation.
# Run inference once and reuse the result under both names (a second predict
# call on the same data would only repeat the work).
predictions = best_Model.predict(padded_train)
y_pred_train = predictions
loss, accuracy = best_Model.evaluate(padded_train, y_train)

# Turn the sigmoid outputs into 0/1 class labels by rounding.
y_pred_classes_train = np.round(y_pred_train)
f1_macro = f1_score(y_train, y_pred_classes_train, average='macro')
print("F1 Score (macro): ", f1_macro)

In [None]:
# Dev-split evaluation.
# Run inference once and reuse the result under both names (a second predict
# call on the same data would only repeat the work).
predictions = best_Model.predict(padded_dev)
y_pred_dev = predictions
loss, accuracy = best_Model.evaluate(padded_dev, y_dev)

# Turn the sigmoid outputs into 0/1 class labels by rounding.
y_pred_classes_dev = np.round(y_pred_dev)
f1_macro = f1_score(y_dev, y_pred_classes_dev, average='macro')
print("F1 Score (macro): ", f1_macro)

In [None]:
# Test-split evaluation.
# Run inference once and reuse the result under both names (a second predict
# call on the same data would only repeat the work).
predictions = best_Model.predict(padded_test)
y_pred_test = predictions
loss, accuracy = best_Model.evaluate(padded_test, y_test)

# Turn the sigmoid outputs into 0/1 class labels by rounding.
y_pred_classes_test = np.round(y_pred_test)
f1_macro = f1_score(y_test, y_pred_classes_test, average='macro')
print("F1 Score (macro): ", f1_macro)