In [1]:
!pip install --upgrade tensorflow



In [2]:
!pip install -U fasttext



In [3]:
!nvidia-smi

Sat Feb 17 17:08:11 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

##Load the dataset

In [4]:
!wget https://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz
!tar xzf review_polarity.tar.gz

--2024-02-17 17:08:11--  https://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz
Resolving www.cs.cornell.edu (www.cs.cornell.edu)... 132.236.207.36
Connecting to www.cs.cornell.edu (www.cs.cornell.edu)|132.236.207.36|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3127238 (3.0M) [application/x-gzip]
Saving to: ‘review_polarity.tar.gz.1’


2024-02-17 17:08:12 (5.80 MB/s) - ‘review_polarity.tar.gz.1’ saved [3127238/3127238]



In [5]:
import os
from sklearn.datasets import load_files


dataset_path = 'txt_sentoken'
movie_reviews = load_files(container_path = dataset_path, encoding = 'utf-8')


x = movie_reviews.data            #the data
y = movie_reviews.target          #the labels
z = movie_reviews.target_names    #the names of labels

In [6]:
x[:1]

["arnold schwarzenegger has been an icon for action enthusiasts , since the late 80's , but lately his films have been very sloppy and the one-liners are getting worse . \nit's hard seeing arnold as mr . freeze in batman and robin , especially when he says tons of ice jokes , but hey he got 15 million , what's it matter to him ? \nonce again arnold has signed to do another expensive blockbuster , that can't compare with the likes of the terminator series , true lies and even eraser . \nin this so called dark thriller , the devil ( gabriel byrne ) has come upon earth , to impregnate a woman ( robin tunney ) which happens every 1000 years , and basically destroy the world , but apparently god has chosen one man , and that one man is jericho cane ( arnold himself ) . \nwith the help of a trusty sidekick ( kevin pollack ) , they will stop at nothing to let the devil take over the world ! \nparts of this are actually so absurd , that they would fit right in with dogma . \nyes , the film is 

In [7]:
y

array([0, 1, 1, ..., 1, 0, 0])

In [8]:
z

['neg', 'pos']

##Pre-processing

The english stopwords is a package of 179 words that in general, would not help in a sentiment analysis problem. But, since they include terms that are negative, removing them could prove harmful for our case.

e.g. imagine the phrase "I didn't like the film" to end up "like film".

So, the plan is to remove all the stop words that include negative meaning before the preprocessing.

In [9]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

From these words, we will decide which ones to keep because in fact they have a meaningful impact in our sentiment analysis problem, as we stated earlier.

In [10]:
set_stop_words = set(stopwords.words('english'))

to_keep_words = ['not', "don't", "aren't", "couldn't", "didn't", "doesn't", "hadn't", "hasn't" , "shouldn't", "haven't", "wasn't", "weren't",  "isn't", "doesn"]
to_keep_words

['not',
 "don't",
 "aren't",
 "couldn't",
 "didn't",
 "doesn't",
 "hadn't",
 "hasn't",
 "shouldn't",
 "haven't",
 "wasn't",
 "weren't",
 "isn't",
 'doesn']

In [11]:
stopwords_updated = set(stopwords.words('english')) - set(to_keep_words)
print(len(stopwords.words('english')))
print(len(to_keep_words))
print(len(stopwords_updated))

179
14
165


In [12]:
from nltk.stem import WordNetLemmatizer
import re
from tqdm.auto import tqdm
import string
nltk.download('wordnet')

def pre_process_text(text):
    ''' Function to preprocess text.
     input: initial text
     output: processed text
     Performs pre-processing methods:
        1. Combination to a single document.
        2. Convertion to lowercase.
        3. Lemmatization and stop words extraction
        4. Punctuation removal
        5. Number removal
        6. Single characters removal
        7. Converting multiple spaces to single ones
        '''
    lemmatizer = WordNetLemmatizer()
    all_docs = []

    single_char = re.compile(r'\s+[a-z]\s+')                                          #6. Remove single characters
    multiple_space= re.compile(r'\s+')                                                 #7. Replace multiple space with a single one

    stopwords_updated = set(stopwords.words('english')) - set(to_keep_words)
    for document in tqdm(x):

        combined_text = ' '.join(text)            #1.Combine in one single document

        combined_text = combined_text.lower()    #2. Convert to lowercase
        combined_text = [lemmatizer.lemmatize(word) for word in document.split() if word not in stopwords_updated]  # 3.Lemmatize and remove stop words


        combined_text = ' '.join(combined_text)

        combined_text = ''.join([char for char in combined_text if char not in string.punctuation])   #4.remove punctuation
        combined_text = ''.join([char for char in combined_text if not char.isdigit()])     #5.remove numbers

        res = single_char.sub(combined_text, '')
        res2 = multiple_space.sub(combined_text, ' ')
        all_docs.append(combined_text)

    return all_docs

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
processed_text = pre_process_text(x)

processed_text[:1]

  0%|          | 0/2000 [00:00<?, ?it/s]

['arnold schwarzenegger icon action enthusiast  since late s  lately film sloppy oneliner getting worse  hard seeing arnold mr  freeze batman robin  especially say ton ice joke  hey got  million  whats matter  arnold signed another expensive blockbuster  cant compare like terminator series  true lie even eraser  called dark thriller  devil  gabriel byrne  come upon earth  impregnate woman  robin tunney  happens every  year  basically destroy world  apparently god chosen one man  one man jericho cane  arnold   help trusty sidekick  kevin pollack   stop nothing let devil take world  part actually absurd  would fit right dogma  yes  film weak  better blockbuster right  sleepy hollow   make world not enough look like  star film  anyway  definitely doesnt seem like arnold movie  wasnt type film see  sure gave u chuckle well known oneliner  seemed confused character film going  understandable  especially ending changed according source  aside form  still walked  much like past film  im sorry

##Splitting into training set (70%), development set (15%) and test set (15%)

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(processed_text, y, test_size=0.3, random_state=17)
X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=25)

training_text = ' '.join(X_train)                    #Flatten into a single string
development_text = ' '.join(X_dev)
test_text = ' '.join(X_test)

training_words = training_text.split()
development_words = development_text.split()
test_words = test_text.split()

training_vocab = set(training_words)
development_vocab = set(development_words)
test_vocab = set(test_words)

In [15]:
print("Training set size (in documents): ", len(y_train))
print("Development set size (in documents): ", len(y_dev))
print("Test set size (in documents): ", len(y_test))
print("Full size (sanity check): ", len(y_train) + len(y_dev) + len(y_test))
print("---------------------------------")
print("Training vocabulary size (in words): " , len(training_vocab))
print("Development vocabulary size (in words): ", len(development_vocab))
print("Test vocabulary size (in words): ", len(test_vocab))
print("Full vocabulary size (in words): ", len(training_vocab) + len(development_vocab) + len(test_vocab))

Training set size (in documents):  1400
Development set size (in documents):  300
Test set size (in documents):  300
Full size (sanity check):  2000
---------------------------------
Training vocabulary size (in words):  36624
Development vocabulary size (in words):  16948
Test vocabulary size (in words):  16780
Full vocabulary size (in words):  70352


In [16]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

print(STOP_WORDS)
print(len(STOP_WORDS))
extra_to_keep = ["n't", "not", "no"]
for i in range(len(extra_to_keep)):
  to_keep_words.append(extra_to_keep[i])

to_keep_words


{'across', 'everything', 'almost', '‘re', 'wherein', 'our', 'what', 'whereafter', 'whither', 'onto', 'now', 'name', 'itself', 'afterwards', 'yet', 'whenever', 'again', 'am', 'this', 'about', 'herein', 'noone', 'call', 'except', 'became', 'i', 'something', 'somewhere', 'same', 'go', 'therefore', 'ever', 'front', 'wherever', 'third', 'sometimes', 'herself', '‘s', "'re", 'eight', 'empty', '’ve', 'its', 'another', 'without', 'if', 'anyhow', 'less', 'many', 'out', 'hence', 'does', 'why', 'hereupon', 'serious', 'whoever', 'hundred', 'twelve', 'someone', 'fifty', 'show', 'forty', 'seems', 'n‘t', 'everyone', 'they', 'get', 'can', 'sometime', 'who', 'otherwise', 'together', 'n’t', 'it', 'whether', 'you', 'various', 'too', 'enough', 'although', 'becomes', 'doing', 'done', 'us', 'mine', 'part', 'whatever', 'along', 'did', 'any', 'are', 'below', 'on', 'formerly', 'indeed', 'twenty', 'keep', 'whom', 'side', 'themselves', 'beside', 'nor', 'really', 'put', 'sixty', 'eleven', 'is', 'further', 'himself

['not',
 "don't",
 "aren't",
 "couldn't",
 "didn't",
 "doesn't",
 "hadn't",
 "hasn't",
 "shouldn't",
 "haven't",
 "wasn't",
 "weren't",
 "isn't",
 'doesn',
 "n't",
 'not',
 'no']

In [17]:
stop_words_updated = STOP_WORDS - set(to_keep_words)
print(len(stop_words_updated))

323


In [18]:
nlp = spacy.load('en_core_web_sm',disable=["tagger", "parser","ner"])
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x7b6f25fc3a80>

##Use spaCy for sentence splitting & tokenization

In [19]:
def tokenize_sent(x):

  X_tokenized = []
  for idx in tqdm(range(len(x))):
    doc = nlp(x[idx])
    tokens = []
    for sent in doc.sents:
      for tok in sent:
        if '\n' in tok.text or "\t" in tok.text or "--" in tok.text or "*" in tok.text or tok.text.lower() in stop_words_updated or tok.text in string.punctuation or all(x in string.punctuation for x in tok.text):
          continue
        if tok.text.strip():
          tokens.append(tok.text.replace('"',"'").strip().lower())
    X_tokenized.append(tokens)
  return X_tokenized

In [20]:
X_train_tokenized = tokenize_sent(X_train)
X_dev_tokenized = tokenize_sent(X_dev)
X_test_tokenized = tokenize_sent(X_test)

  0%|          | 0/1400 [00:00<?, ?it/s]



  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

In [21]:
import numpy as np

# Get mean and std of sequence length on training set
mean = np.mean([len(x) for x in X_train_tokenized])
std = np.std([len(x) for x in X_train_tokenized])
print("Mean of sequence length on training set:", mean)
print("Standard deviation of sequence length on training set:", std)

Mean of sequence length on training set: 312.97285714285715
Standard deviation of sequence length on training set: 134.5485652324931


In [22]:
print(X_train[0])
print(X_train_tokenized[0])


not many people know james whale  safe bet good chunk seen movie  believe semi biographical god monster  whale would wanted way  insightful  haunting exploration last day frankenstein bride frankenstein director  notable introducing one first complicated gay character hollywood movie  god monster interest biopic whales life track final day life  probably better movie  focus whales  ian mckellen  untraditional deceptive lust heterosexual gardener clayton boone  brendan fraser   begin whale  fairly talented artist  aside director  asking boone  sit    pose portrait   first  boone doesnt realize whale gay grows fascinated old man  discover whales sexual orientation dedicated protective maid  lynn redgrave   frightened  refusing sit guy  come back  storm  disgusted whales  locker room talk   boone return yet another time  whale promise tone aforementioned  locker room talk   find platonic relationship strengthening  meanwhile  jimmy suffering hallucination mental attack result stroke not l

##Tokenize, convert text (sequence of words) to sequence of indexes and PAD the sequences

In [23]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


import math

MAX_WORDS = 100000
MAX_SEQUENCE_LENGTH =  math.ceil(mean + std)
EMBEDDING_DIM = 300              #to be changed

tokenizer = Tokenizer(num_words = MAX_WORDS, oov_token ='UNK')
tokenizer.fit_on_texts([" ".join(x) for x in X_train_tokenized])

word_index = tokenizer.word_index


In [24]:
train_sequences = tokenizer.texts_to_sequences([" ".join(x) for x in X_train_tokenized])
dev_sequences = tokenizer.texts_to_sequences([" ".join(x) for x in X_dev_tokenized])
test_sequences = tokenizer.texts_to_sequences([" ".join(x) for x in X_test_tokenized])

train_pad = pad_sequences(train_sequences, maxlen = MAX_SEQUENCE_LENGTH, padding ='post')
dev_pad = pad_sequences(dev_sequences, maxlen = MAX_SEQUENCE_LENGTH, padding ='post')
test_pad = pad_sequences(test_sequences, maxlen = MAX_SEQUENCE_LENGTH, padding ='post')

In [25]:
print(train_sequences[0])
print("------------------------------")
print(train_pad[0])

[5, 21, 19, 172, 2072, 1378, 1887, 11, 4996, 47, 3, 226, 10828, 12688, 363, 567, 2072, 615, 13, 3289, 1486, 4069, 45, 2294, 2744, 2294, 34, 1888, 4491, 2295, 1015, 7, 110, 3, 363, 567, 343, 9480, 7054, 14, 715, 206, 45, 14, 114, 46, 3, 524, 7054, 1983, 15531, 20585, 12689, 4492, 6522, 7684, 6523, 4493, 4754, 3290, 106, 2072, 736, 670, 975, 859, 34, 1648, 4493, 765, 3906, 2406, 4493, 4, 593, 2072, 1015, 1349, 4755, 71, 26, 950, 7054, 568, 15532, 3050, 7685, 4494, 8470, 7686, 3742, 4495, 765, 48, 17, 1649, 7687, 7054, 6524, 284, 288, 4493, 302, 9, 2072, 976, 705, 1841, 6524, 284, 288, 37, 20586, 151, 20587, 1431, 2034, 6525, 1790, 688, 244, 3424, 5, 72, 442, 15533, 136, 3743, 4070, 1016, 54, 443, 766, 4493, 20588, 42, 1683, 6526, 363, 567, 4754, 15534, 10829, 24, 212, 2, 5663, 6058, 65, 30, 2838, 2073, 3, 4071, 219, 245, 1159, 4496, 30, 3291, 430, 767, 134, 4072, 10830, 569, 1729, 3290, 224, 276, 847, 463, 1002, 737, 832, 1541, 5, 3292, 12690, 431, 4073, 67, 3907, 1487, 1309, 24, 1350, 4

##Download and unzip fasttext binary model for word embeddings

In [27]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
!gzip -d cc.en.300.bin.gz

--2024-02-17 17:12:14--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.226.210.78, 13.226.210.25, 13.226.210.111, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.226.210.78|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4503593528 (4.2G) [application/octet-stream]
Saving to: ‘cc.en.300.bin.gz.1’


2024-02-17 17:12:43 (149 MB/s) - ‘cc.en.300.bin.gz.1’ saved [4503593528/4503593528]



##Create embedding matrix

**Process**: The loop iterates over the word_index dictionary, which contains word-to-index mappings generated by the tokenizer. For each word in the word_index, it checks if the index is within the limit of MAX_WORDS. If so, it retrieves the corresponding word vector from the FastText model using fasttext_model.get_word_vector(word=key) and assigns it to the corresponding row in the embedding_matrix.

In [28]:
import fasttext

print("Loading embeddings model..")
fasttext_model = fasttext.load_model('cc.en.300.bin')
embedding_matrix = np.zeros(shape=((MAX_WORDS + 2), 300))          # +2 because we have reserved indices for padding and out-of-vocabulary tokens

for key, value in word_index.items():
    if value <= MAX_WORDS:
        embedding_matrix[value] = fasttext_model.get_word_vector(word=key)            #create embedding matrix

del fasttext_model      #save memory

Loading embeddings model..




##Create one-hot vectors

In [29]:
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
target_list = z

y_train_1_hot = lb.fit_transform([target_list[x] for x in y_train])
y_dev_1_hot = lb.transform([target_list[x] for x in y_dev])

#y_train_1_hot = np.argmax(y_train_1_hot, axis=1)
#y_dev_1_hot = np.argmax(y_dev_1_hot, axis=1)

#print('y_dev_1_hot:', y_dev_1_hot)                             #prints vertically
print('y_dev_1_hot:', ' '.join(map(str, y_dev_1_hot)))          #prints horizontally


y_dev_1_hot: [0] [1] [1] [0] [0] [1] [0] [1] [1] [1] [0] [1] [0] [1] [0] [1] [0] [1] [0] [0] [0] [1] [0] [1] [1] [1] [0] [0] [0] [0] [0] [1] [0] [0] [1] [1] [1] [0] [1] [1] [1] [0] [1] [1] [1] [1] [0] [0] [1] [0] [0] [0] [0] [1] [1] [1] [0] [0] [0] [0] [1] [1] [1] [1] [0] [0] [0] [0] [0] [0] [1] [0] [1] [1] [0] [1] [1] [0] [1] [0] [0] [0] [1] [0] [1] [1] [1] [0] [1] [1] [0] [0] [0] [0] [0] [1] [0] [1] [1] [0] [1] [0] [0] [1] [1] [1] [0] [0] [0] [0] [1] [0] [1] [0] [1] [0] [0] [1] [0] [1] [0] [0] [0] [0] [0] [1] [0] [0] [0] [0] [0] [0] [1] [0] [1] [0] [0] [0] [0] [0] [0] [0] [0] [1] [1] [0] [1] [0] [1] [1] [1] [1] [1] [0] [0] [0] [0] [1] [0] [1] [0] [1] [0] [1] [0] [1] [0] [1] [1] [1] [0] [0] [0] [1] [0] [0] [1] [1] [1] [1] [1] [1] [1] [0] [0] [1] [0] [0] [0] [1] [1] [1] [1] [1] [1] [1] [1] [1] [0] [1] [1] [0] [0] [0] [0] [1] [0] [1] [0] [1] [0] [1] [0] [0] [1] [1] [1] [1] [0] [0] [0] [0] [0] [1] [1] [1] [0] [1] [1] [1] [0] [0] [0] [0] [1] [0] [1] [0] [1] [1] [0] [0] [1] [0] [0] [0] [0]

In [30]:
import tensorflow as tf

class Metrics(tf.keras.callbacks.Callback):

    def __init__(self, valid_data):
        super(Metrics, self).__init__()
        self.validation_data = valid_data

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        val_predict = np.argmax(self.model.predict(self.validation_data[0]), -1)
        val_targ = self.validation_data[1]
        if len(val_targ.shape) == 2 and val_targ.shape[1] != 1:
            val_targ = np.argmax(val_targ, -1)
        val_targ = tf.cast(val_targ,dtype=tf.float32)
        _val_f1 = f1_score(val_targ, val_predict,average="weighted")
        _val_recall = recall_score(val_targ, val_predict,average="weighted")
        _val_precision = precision_score(val_targ, val_predict,average="weighted")
        logs['val_f1'] = _val_f1
        logs['val_recall'] = _val_recall
        logs['val_precision'] = _val_precision
        print(" — val_f1: %f — val_precision: %f — val_recall: %f" % (_val_f1, _val_precision, _val_recall))
        return

##Self attention class

In [31]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Layer

class SelfAttention(tf.keras.layers.Layer):
  def __init__(self, mlp_layers=0, units=0, dropout_rate=0, return_attention=False, **kwargs):
    super(SelfAttention, self).__init__(**kwargs)
    self.mlp_layers = mlp_layers
    self.mlp_units = units
    self.return_attention = return_attention
    self.dropout_rate = dropout_rate
    self.attention_mlp = self.build_mlp()

  def build_mlp(self):
    mlp = Sequential()
    for i in range(self.mlp_layers):
      mlp.add(Dense(self.mlp_units, activation='relu'))
      mlp.add(Dropout(self.dropout_rate))
    mlp.add(Dense(1))
    return mlp

  def call(self, x, mask=None):
    a = self.attention_mlp(x)
    a = tf.squeeze(a, axis=2)

    if mask is not None:
      mask = tf.keras.backend.cast(mask, tf.keras.backend.floatx())
      a -= 100000.0 * (1.0 - mask)

    a = tf.keras.backend.expand_dims(tf.keras.backend.softmax(a, axis=-1))
    weighted_input = x * a
    result = tf.keras.backend.sum(weighted_input, axis=1)

    if self.return_attention:
      return [result, a]
    return result

In [32]:
y_train = tf.keras.utils.to_categorical(y_train, num_classes=2)
y_dev = tf.keras.utils.to_categorical(y_dev, num_classes=2)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=2)

##RNN model

In [33]:

def build_model(hp):
    GRU_SIZE = hp.Int('gru_size', min_value=100, max_value=500, step=50)
    dropout_rate = hp.Float('dropout_rate', min_value=0.2, max_value=0.5, step=0.05)
    mlp_layers = hp.Int('mlp_layers', min_value=1, max_value=5, step=1)
    mlp_units = hp.Int('mlp_units', min_value=64, max_value=512, step=64)

    model = tf.keras.Sequential()

    model.add(
        tf.keras.layers.Embedding(
            MAX_WORDS+2,
            EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=MAX_SEQUENCE_LENGTH,
            mask_zero=True,
            trainable=False
        )
    )
    model.add(tf.keras.layers.Dropout(dropout_rate))

    model.add(
        tf.keras.layers.Bidirectional(
            tf.keras.layers.GRU(
                GRU_SIZE,
                return_sequences=True,
                recurrent_dropout=dropout_rate
            )
        )
    )
    model.add(tf.keras.layers.Dropout(dropout_rate))

    model.add(SelfAttention(mlp_layers=mlp_layers, units=mlp_units))

    model.add(tf.keras.layers.Dense(1024, activation='relu'))
    model.add(tf.keras.layers.Dropout(dropout_rate))

    model.add(tf.keras.layers.Dense(2, activation='softmax'))

    model.compile(
        loss='categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        metrics=["categorical_accuracy"]
    )

    return model

##Hyperparameter search with Keras Tuner

In [34]:
!pip install keras-tuner



In [37]:
from kerastuner.tuners import RandomSearch

tuner = RandomSearch(
    build_model,
    objective='val_categorical_accuracy',
    max_trials=5,
    executions_per_trial=1,
    directory='keras_tuner_logs',
    project_name='hyperparameter_tuning_gru_self_attention'
)


tuner.search(
    train_pad,
    y_train,
    validation_data=(dev_pad, y_dev),
    batch_size=256,
    epochs=50,
    shuffle=True,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(monitor='val_categorical_accuracy', patience=10, restore_best_weights=True)
    ]
)


best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]




Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
300               |300               |gru_size
0.3               |0.3               |dropout_rate
4                 |4                 |mlp_layers
256               |256               |mlp_units





Epoch 1/50

KeyboardInterrupt: 