In [37]:
# import optuna
# # import wandb
# import logging
# import sys
# import os
import tensorflow as tf
import json
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, Dense, LSTM, Dropout, Bidirectional, MaxPooling1D, GlobalAveragePooling1D, AdditiveAttention, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import AdditiveAttention, Concatenate, BatchNormalization, Activation, MultiHeadAttention, LayerNormalization, TextVectorization, Masking, Reshape
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, Callback

In [20]:
# Base URL of the raw data files in the project's GitHub repository.
url_data = 'https://raw.githubusercontent.com/TeodorRusKvi/Tekstanalyse/main/git_NLP_data/'

# Labels: read 'y_data.csv' into a DataFrame, then convert the whole frame
# to a NumPy array.
y_df = pd.read_csv(url_data + 'y_data.csv')
y = y_df.to_numpy()

# GloVe embeddings: read 'embeddings_glove.csv' and convert the whole frame
# to a NumPy array in one step.
# NOTE(review): read_csv treats the first row as a header by default - confirm
# the file was written with a header row, otherwise one embedding row is lost.
embeddings_GloVe = pd.read_csv(url_data + 'embeddings_glove.csv').to_numpy()

In [21]:
# Load the text data from 'new_df.csv'.
X_df = pd.read_csv(url_data + 'new_df.csv')

# The 'All_text' column as a pandas Series.
X_1 = X_df['All_text']

# The same column as a flat NumPy array.
X = X_1.to_numpy().flatten()

In [22]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit a Keras Tokenizer on the raw texts, using '<OOV>' as the
# out-of-vocabulary token. (Other tunable arguments: num_words.)
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(X_1)

# Word -> integer index mapping built by the tokenizer.
word_index = tokenizer.word_index

print(f'Found {len(word_index)} unique tokens.\n\nFirst 10 is listen below:')
print(dict(list(word_index.items())[:10]))

Found 34985 unique tokens.

First 10 is listen below:
{'<OOV>': 1, 'the': 2, 'to': 3, 'of': 4, 'and': 5, 'a': 6, 'in': 7, '0': 8, 'is': 9, 'that': 10}


## Setting hyperparameters

In [23]:
# --- Pre-processing hyperparameters for the networks ---
max_len = 100                 # sequence length
trunc_type = "post"
padding_type = "post"
vocab_size = len(word_index)  # number of entries in the tokenizer's word index
embedding_dim = 100           # this is fixed

# --- Training hyperparameters ---
EPOCHS = 20
BATCH_SIZE = 32
num_classes = 1

## Create datasets

In [24]:
# Build a tf.data.Dataset of (text, label) pairs.
dataset = tf.data.Dataset.from_tensor_slices((X_1, y))

# Shuffle once with a fixed seed.
# reshuffle_each_iteration=False keeps the element order identical on every
# pass over the dataset; the take/skip split below relies on that to keep the
# three subsets disjoint. The explicit seed makes the split reproducible
# across kernel restarts (previously every fresh run produced a different split).
SPLIT_SEED = 42
dataset = dataset.shuffle(
    buffer_size=len(X_1),
    seed=SPLIT_SEED,
    reshuffle_each_iteration=False,
)

# Split sizes: 70 % train, 15 % validation, the remainder is the test set.
n_samples = len(X_1)
train_size = int(0.7 * n_samples)
val_size = int(0.15 * n_samples)
test_size = n_samples - train_size - val_size

# Layout of the shuffled stream: [ train | test | validation ].
train_dataset = dataset.take(train_size)
holdout_dataset = dataset.skip(train_size)        # everything that is not training data
test_dataset = holdout_dataset.take(test_size)    # first `test_size` hold-out elements
val_dataset = holdout_dataset.skip(test_size)     # remaining `val_size` hold-out elements

In [25]:
# Integer-sequence vectorizer: each text becomes `max_len` token ids.
int_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=max_len,
)

# Multi-hot vectorizer: each text becomes one fixed-width 0/1 vector.
# pad_to_max_tokens=True pins the output width to `max_tokens` (= vocab_size)
# even when the adapted vocabulary turns out smaller. Without it the width
# follows the adapted vocabulary size, which is not guaranteed to equal the
# Keras Tokenizer's count that `vocab_size` comes from, and the model's
# Input(shape=(vocab_size,)) would then reject the data.
hot_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode='multi_hot',
    pad_to_max_tokens=True,
)

# Text-only view of the dataset (labels dropped) used to learn the vocabulary.
# NOTE(review): this adapts on all samples, including the validation and test
# splits; adapt on the training texts only if vocabulary leakage matters.
all_texts = dataset.map(lambda text, label: text)

int_vectorization.adapt(all_texts)
hot_vectorization.adapt(all_texts)

In [26]:
def vectorize_text(text, label, vectorize_layer):
    """Apply ``vectorize_layer`` to ``text`` and pass ``label`` through.

    Args:
        text: Input handed to ``vectorize_layer``.
        label: Value returned unchanged alongside the vectorized text.
        vectorize_layer: Callable applied to ``text``.

    Returns:
        A ``(vectorized_text, label)`` tuple.
    """
    return vectorize_layer(text), label

# Encode the raw text of every split with the multi-hot vectorizer; labels
# pass through unchanged.
# NOTE: the split names are rebound to their encoded versions, so this cell
# must run exactly once after the split cell.
def _encode_multi_hot(text, label):
    """Map one (text, label) element to (multi-hot vector, label)."""
    return vectorize_text(text, label, hot_vectorization)

train_dataset = train_dataset.map(_encode_multi_hot)
val_dataset = val_dataset.map(_encode_multi_hot)
test_dataset = test_dataset.map(_encode_multi_hot)

## Batch the data

In [27]:
# Reuse the BATCH_SIZE constant from the hyperparameter cell instead of
# repeating the literal, so the batch size is configured in one place.
batch_size = BATCH_SIZE

# Batch each split and prefetch so input preparation can overlap with training.
train_dataset = train_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [30]:
class ModelConfig:
    """Plain container for the settings passed to the model-building functions.

    Attributes:
        max_len: Value supplied as ``max_len``.
        num_classes: Value supplied as ``num_classes``.
        embeddings_GloVe: Object supplied as ``embeddings_GloVe``; stored by
            reference, not copied.
    """

    def __init__(self, max_len, num_classes, embeddings_GloVe):
        self.max_len, self.num_classes, self.embeddings_GloVe = (
            max_len,
            num_classes,
            embeddings_GloVe,
        )


# Bundle the notebook-level settings into one configuration object.
config = ModelConfig(
    max_len=max_len,
    num_classes=num_classes,
    embeddings_GloVe=embeddings_GloVe,
)

In [60]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Conv1D, MaxPooling1D, Flatten, Dense, concatenate
from tensorflow.keras.models import Model

def CNN_LSTM_parallel(params, config):
    """Build and compile the CNN classifier that consumes multi-hot text vectors.

    Despite the name, only the CNN branch is active in this version. The LSTM
    branch on GloVe embeddings, the branch concatenation and the multi-head
    attention block existed only as commented-out experiments and are not part
    of the returned model.

    Args:
        params: Mapping of hyperparameters. Reads ``params['conv_filters']``
            (number of Conv1D filters) and ``params['learning_rate']`` (Adam
            learning rate).
        config: Accepted for interface compatibility; not read by this function.

    Returns:
        A compiled Keras ``Model`` with one input of shape ``(vocab_size,)``
        and a 2-unit softmax output.

    Notes:
        Reads the module-level ``vocab_size`` for the input width.
        NOTE(review): the output has 2 units and the loss is
        'categorical_crossentropy' - confirm the labels are one-hot encoded
        with two columns.
    """
    # Input: one multi-hot vector per text. A trailing channel axis is added
    # because Conv1D expects (steps, channels).
    to_channels = Reshape((vocab_size, 1))
    input_text_for_cnn = Input(shape=(vocab_size,))
    features = to_channels(input_text_for_cnn)

    # CNN branch: kernel size 1, so each vocabulary position is transformed
    # independently; then batch-normalise and flatten for the classifier head.
    features = Conv1D(params['conv_filters'], 1, activation='relu')(features)
    features = BatchNormalization()(features)
    features = Flatten()(features)

    # Output layer: 2-unit softmax.
    output = Dense(2, activation='softmax')(features)

    # Assemble and compile.
    model = Model(inputs=[input_text_for_cnn], outputs=output)
    model.compile(
        optimizer=Adam(params['learning_rate']),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [32]:
# Read the saved trial record, then pull out its hyperparameter dictionary.
with open('best_trial_length_100.json', 'r') as f:
    trial = json.load(f)

params = trial['params']

In [52]:
# Display the hyperparameters loaded from the trial file.
params

{'lstm_units': 140,
 'dense_2_units': 150,
 'dropout_rate': 0.42803898610506674,
 'learning_rate': 0.000922823163674921,
 'conv_filters': 52}

In [61]:
# Build the compiled model from the loaded hyperparameters.
model = CNN_LSTM_parallel(params, config)

In [62]:
# Train the model (it is already compiled inside CNN_LSTM_parallel).
# The epoch count comes from the EPOCHS constant in the hyperparameter cell,
# and the returned History object is kept so the per-epoch metrics can be
# inspected or plotted later.
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCHS,
    verbose=1,
)

Epoch 1/20