In [4]:
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Bidirectional, LSTM, Concatenate, Dropout, Dense, Layer
from tensorflow.keras.models import Model
from keras_tuner import HyperModel, RandomSearch, BayesianOptimization
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tensorboard.plugins.hparams import api as hp
from tensorflow.keras import models, layers
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer




In [5]:
# Base URL of the GitHub folder from which this notebook reads its CSV files.
url_data = 'https://raw.githubusercontent.com/TeodorRusKvi/Tekstanalyse/main/git_NLP_data/'

df = pd.read_csv(url_data + 'new_df.csv')

# Collapse the dotted abbreviations into plain tokens.
# The dots are escaped so they match literal periods only: the unescaped
# pattern 'U.S.' is a regex in which '.' matches any character, so it would
# also rewrite strings such as 'USSR'. The longer pattern comes first so that
# 'U.S.A.' is not partially consumed by the 'U.S.' rule (leaving 'USA.').
df['All_text'] = df['All_text'].replace(
    [r'U\.S\.A\.', r'U\.S\.'],
    ['USA', 'US'],
    regex=True,
)

# Missing entries are filled with 0 and then cast to str, i.e. they become the
# literal text '0'.
# NOTE(review): an empty string may be a better placeholder - confirm that
# nothing downstream relies on the '0' marker before changing it.
df['Processed'] = df['Processed'].fillna(0).astype(str)
df['All_text'] = df['All_text'].fillna(0).astype(str)

# Optional: persist the cleaned frame locally.
# df.to_csv('new_df.csv', index=False)

# Plain Python lists of the two text columns.
all_texts = df['All_text'].to_list()
texts = df['Processed'].to_list()

# The 'Processed' column is the corpus used for tokenization.
corpus = texts

# No `num_words` cap is set, so the whole vocabulary is indexed; '<OOV>' is the
# token used for words not seen during fitting.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(corpus)

sequences = tokenizer.texts_to_sequences(corpus)

word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.\n\nFirst 10 is listen below:')
print(dict(list(word_index.items())[0:10]))

Found 22234 unique tokens.

First 10 is listen below:
{'<OOV>': 1, 'people': 2, 'like': 3, 'work': 4, 'right': 5, 'trump': 6, 'think': 7, 'state': 8, 'government': 9, 'party': 10}


In [6]:
# Read the training features, the training labels and the GloVe embedding
# matrix from their CSV files and convert each DataFrame to a NumPy array.
X_train_LSTM = pd.read_csv(url_data + 'X_train.csv').to_numpy()
y_train_LSTM = pd.read_csv(url_data + 'y_train.csv').to_numpy()
embeddings_GloVe = pd.read_csv(url_data + 'embeddings_glove.csv').to_numpy()


# Hold out 20% of the loaded samples; the fixed random_state makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_LSTM,
    y_train_LSTM,
    test_size=0.2,
    random_state=42,
)

In [7]:
# To load the TensorBoard notebook extension, uncomment the magic below:
# %load_ext tensorboard

from tensorboard import notebook
notebook.list() # Print the TensorBoard instances that are currently known (port, logdir, pid)

Known TensorBoard instances:
  - port 6008: logdir logs/fit (started 2:07:28 ago; pid 1956)
  - port 6006: logdir logs/hparam_tuning (started 4 days, 2:54:41 ago; pid 23884)


In [8]:
# Hyperparameter domains declared with the TensorBoard HParams plugin API
# (`hp` is `tensorboard.plugins.hparams.api`): a name plus either a discrete
# set of values or a closed real interval.
# NOTE(review): confirm where these constants are consumed - they are only
# declared in this cell.
HP_FILTERS = hp.HParam('filters', domain=hp.Discrete([32, 35]))
HP_NUM_UNITS2 = hp.HParam('num_units2', domain=hp.Discrete([64, 128, 152]))
HP_DROPOUT2 = hp.HParam('dropout2', domain=hp.RealInterval(min_value=0.2, max_value=0.6))
HP_LEARNING_RATE = hp.HParam('learning_rate', domain=hp.RealInterval(min_value=0.001, max_value=0.01))
HP_OPTIMIZER = hp.HParam('optimizer', domain=hp.Discrete(['adam', 'sgd', 'rmsprop']))

In [9]:
# %tensorboard --logdir=logs/fit

In [10]:
class BahdanauAttention(Layer):
    """Additive (Bahdanau-style) attention over a sequence of value vectors.

    The score for a (query, value) pair is ``V(tanh(W1(query) + W2(value)))``,
    where ``W1``/``W2`` are Dense layers with ``units`` outputs and ``V`` is a
    Dense layer with a single output. Scores are soft-maxed over the value
    time axis and used to form a weighted sum of ``values``.

    Args:
        units: Width of the two projection layers ``W1`` and ``W2``.
        **kwargs: Standard Keras ``Layer`` keyword arguments (e.g. ``name``).
    """

    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Compute the context vector(s) and attention weights.

        Args:
            query: Either a single query per sample, shape
                ``(batch_size, hidden size)``, or one query per time step,
                shape ``(batch_size, query_len, hidden size)``.
            values: Sequence to attend over, shape
                ``(batch_size, max_len, hidden size)``.

        Returns:
            ``(context_vector, attention_weights)``.
            For a 2-D query: ``(batch_size, hidden size)`` and
            ``(batch_size, max_len, 1)``.
            For a 3-D query: ``(batch_size, query_len, hidden size)`` and
            ``(batch_size, query_len, max_len, 1)``.
        """
        if len(query.shape) == 3:
            # One query per time step. Without this branch the 2-D code path
            # below would broadcast (batch, 1, len, units) against
            # (batch, len, units) into (batch, batch, len, units), and the
            # softmax on axis 1 would normalise across *samples in the batch*.
            # Here every sample only attends over its own time steps.
            # query_per_step shape == (batch_size, query_len, 1, hidden size)
            # values_per_step shape == (batch_size, 1, max_len, hidden size)
            query_per_step = tf.expand_dims(query, 2)
            values_per_step = tf.expand_dims(values, 1)

            # score shape == (batch_size, query_len, max_len, 1)
            score = self.V(tf.nn.tanh(
                self.W1(query_per_step) + self.W2(values_per_step)))

            # Normalise over the value time axis (axis 2).
            attention_weights = tf.nn.softmax(score, axis=2)

            # context_vector shape after sum == (batch_size, query_len, hidden size)
            context_vector = tf.reduce_sum(
                attention_weights * values_per_step, axis=2)

            return context_vector, attention_weights

        # query hidden state shape == (batch_size, hidden size)
        # query_with_time_axis shape == (batch_size, 1, hidden size)
        # values shape == (batch_size, max_len, hidden size)
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_len, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_len, units)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))

        # attention_weights shape == (batch_size, max_len, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

    def get_config(self):
        """Return the layer configuration, including ``units``, for serialization."""
        config = super().get_config()
        config.update({'units': self.units})
        return config

In [16]:
class TextClassifierHyperModel(HyperModel):
    """KerasTuner hypermodel with parallel Conv1D and BiLSTM + attention branches.

    Every block feeds the same frozen embedding output into
    (a) a Conv1D with kernel size 1 followed by global max pooling and
    (b) a bidirectional LSTM whose output sequence is summarised by
    ``BahdanauAttention``. All branch outputs are concatenated, passed through
    dropout and a sigmoid Dense layer with ``num_classes`` units.

    Args:
        input_shape: Length of the integer token sequences fed to the model.
        embeddings_GloVe: 2-D array used as the (non-trainable) embedding
            matrix; its shape gives the vocabulary size and embedding width.
        num_classes: Number of units in the output layer.
        parallel_blocks: Number of (Conv1D, BiLSTM + attention) block pairs.
        include_attention: When True, the attention weights of every block
            are appended to the model outputs.
        attention_units: Width of the attention projection layers.
    """

    def __init__(self, input_shape, embeddings_GloVe, num_classes, parallel_blocks,
                 include_attention=False, attention_units=20):
        # Initialise the HyperModel base class (name / tunable bookkeeping).
        super().__init__()
        self.input_shape = input_shape
        self.embeddings_GloVe = embeddings_GloVe
        self.num_classes = num_classes
        self.parallel_blocks = parallel_blocks
        self.include_attention = include_attention  # Toggle to control attention weights output
        self.attention_units = attention_units

    def build(self, hp):
        """Build and compile one model for the hyperparameters sampled in ``hp``.

        Tuned hyperparameters: ``filters``, ``lstm_units``, ``dropout2``,
        ``optimizer`` and ``learning_rate``.

        Returns:
            A compiled Keras ``Model``. Its outputs are ``[output]``, or
            ``[output] + attention weights`` when ``include_attention`` is set.
        """
        sequence_input = Input(shape=(self.input_shape,), dtype='int32')
        embedded_sequences = Embedding(input_dim=self.embeddings_GloVe.shape[0],
                                       output_dim=self.embeddings_GloVe.shape[1],
                                       weights=[self.embeddings_GloVe],
                                       trainable=False)(sequence_input)

        # A hyperparameter name maps to a single value per trial, so these are
        # registered once instead of on every loop iteration; all blocks share
        # the same values, exactly as before.
        filters = hp.Choice('filters', [32, 35])
        lstm_units = hp.Int('lstm_units', min_value=50, max_value=150, step=20)

        conv_blocks = []
        attention_contexts = []
        attention_weights_list = []  # Store attention weights if needed

        for _ in range(self.parallel_blocks):
            conv = Conv1D(filters=filters, kernel_size=1, activation='relu')(embedded_sequences)
            conv_blocks.append(GlobalMaxPooling1D()(conv))

            # return_state=True additionally yields the final hidden/cell
            # states of the forward and backward LSTMs.
            lstm_sequence, forward_h, _forward_c, backward_h, _backward_c = Bidirectional(
                LSTM(units=lstm_units, return_sequences=True, return_state=True)
            )(embedded_sequences)

            # The attention query is the final hidden state, shape
            # (batch, 2 * lstm_units). Passing the whole 3-D sequence as the
            # query made the score tensor broadcast over the batch axis, so
            # the softmax mixed information between different samples.
            query = Concatenate()([forward_h, backward_h])
            attention_layer = BahdanauAttention(self.attention_units)
            context_vector, attention_weights = attention_layer(query, lstm_sequence)

            # context_vector is already (batch, 2 * lstm_units); no pooling needed.
            attention_contexts.append(context_vector)
            attention_weights_list.append(attention_weights)  # Collect attention weights

        concatenated = Concatenate()(conv_blocks + attention_contexts)
        concatenated = Dropout(hp.Float('dropout2', 0.2, 0.6))(concatenated)
        # NOTE(review): sigmoid + binary_crossentropy scores each of the
        # num_classes outputs independently - presumably the labels are
        # one-hot / multi-hot with num_classes columns; confirm.
        output = Dense(self.num_classes, activation='sigmoid')(concatenated)

        # Decide output based on the include_attention flag
        final_outputs = [output] if not self.include_attention else [output] + attention_weights_list

        model = Model(inputs=sequence_input, outputs=final_outputs)
        optimizer_name = hp.Choice('optimizer', ['Adam'])
        learning_rate = hp.Float('learning_rate', 0.001, 0.01)
        # Look the optimizer class up by name so further choices can be added above.
        optimizer = getattr(tf.keras.optimizers, optimizer_name)(learning_rate=learning_rate)
        # The single loss given here is applied to every model output, so
        # include_attention should stay False when the model is trained.
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
        return model

# Note: keep `include_attention` at its default (False) for the tuner run - the
# model is compiled with a single loss, which is applied to every output, so
# the attention weights must not be part of the outputs during training.

# Hyperparameters and settings
# Sequence length is read from the data (number of columns of X_train) instead
# of being hard-coded, so the Input layer always matches the training matrix.
input_length = X_train.shape[1]
num_classes = 2
parallel_blocks = 2
log_dir = 'logs/fit/' + datetime.now().strftime("%d-%m-%Y %H-%M-%S")

# `embeddings_GloVe` is the embedding matrix loaded in an earlier cell.
hypermodel = TextClassifierHyperModel(input_length, embeddings_GloVe, num_classes, parallel_blocks)

tuner = RandomSearch(
    hypermodel,
    objective='val_accuracy',
    max_trials=20,
    executions_per_trial=1,
    seed=42,  # fixed seed so the sampled trials are reproducible
    directory=log_dir,
    project_name='TextClassification'
)

tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=0,  # No histogram computation, set to 1 or higher to compute histograms every '1' epoch or specified frequency
    update_freq='epoch'  # Write logs once per epoch, not every batch
)


# NOTE(review): the held-out split (X_test, y_test) serves as validation data
# for the search, so the tuned 'val_accuracy' is not an unbiased test score;
# a separate test set would be needed for final evaluation.
tuner.search(X_train, y_train,
             epochs=10,
             validation_data=(X_test, y_test),
             callbacks=[tensorboard_callback])

# Get the best model
best_model = tuner.get_best_models(num_models=1)[0]


Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
32                |32                |filters
150               |150               |lstm_units
0.58149           |0.58149           |dropout2
Adam              |Adam              |optimizer
0.0061101         |0.0061101         |learning_rate

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

KeyboardInterrupt: 