In [1]:
# import optuna
# # import wandb
# import logging
# import sys
# import os
import tensorflow as tf
import json
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, Dense, LSTM, Dropout, Bidirectional, MaxPooling1D, GlobalAveragePooling1D, AdditiveAttention, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import AdditiveAttention, Concatenate, BatchNormalization, Activation, MultiHeadAttention, LayerNormalization, TextVectorization, Masking, Reshape
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, Callback
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences




In [2]:
url_data = 'https://raw.githubusercontent.com/TeodorRusKvi/Tekstanalyse/main/git_NLP_data/'

# Last inn 'y_train_LSTM' fra en CSV-fil
y_df = pd.read_csv(url_data+'y_liberal.csv')
# Konverter hele DataFrame til et NumPy array

# Last inn 'y_train_LSTM' fra en CSV-fil
embeddings_GloVe = pd.read_csv(url_data+'embeddings_glove.csv')
# Konverter hele DataFrame til et NumPy array
embeddings_GloVe = embeddings_GloVe.to_numpy()

# Last inn 'X_train_LSTM' fra en CSV-fil
X_df = pd.read_csv(url_data+'new_df.csv')

X_df['y_liberal'] = y_df

In [3]:
# Assuming your DataFrame is named 'df' and the column you want to check is named 'column_name'
empty_rows = X_df['without_stopwords'].isnull()  # Check for empty values in the column

X_df = X_df.dropna(subset=['without_stopwords'])

len(X_df)

12804

In [4]:
# Use lambda function to find the length of each row in the column
X_df['length'] = X_df['without_stopwords'].apply(lambda x: len(x))

# Print the DataFrame with the added 'length' column
print(X_df['length'].describe())

count    12804.000000
mean       171.845205
std        651.529329
min          4.000000
25%         42.000000
50%         60.000000
75%         93.000000
max      17928.000000
Name: length, dtype: float64


In [17]:
# Konverter kolonnen til et NumPy array
X = X_df['without_stopwords'].astype(str).to_numpy().flatten()
y = X_df['y_liberal'].to_numpy()

tokenizer = Tokenizer(oov_token='<OOV>') # Hyperparameters = num_words=vocab_size, oov_token=oov_tok
tokenizer.fit_on_texts(X)

#Creating a word index of the words from the tokenizer 
word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.\n\nFirst 10 is listen below:')
print(dict(list(word_index.items())[0:10]))

Found 22340 unique tokens.

First 10 is listen below:
{'<OOV>': 1, 'nan': 2, 'people': 3, 'usa': 4, 'like': 5, 'work': 6, 'right': 7, 'trump': 8, 'think': 9, 'state': 10}


## Setting hyperparameters

In [6]:
# Defining pre-processing hyperparameters for the networks
max_len = 150
trunc_type = "post"
padding_type = "post"
vocab_size = len(word_index)
embedding_dim = 100
EPOCHS=20
BATCH_SIZE = 32
num_classes = 1

## Create datasets

In [28]:
y_list = np.array([[element] for element in y])
y_list

array([[1],
       [1],
       [1],
       ...,
       [0],
       [0],
       [0]], dtype=int64)

In [22]:
# Encoding the sequences from the reddit posts
X = tokenizer.texts_to_sequences(X)
# Padding the sequences to keep the lengths uniform
X = pad_sequences(X, maxlen=max_len, padding=padding_type, truncating=trunc_type)
# print('Shape of data tensor:', X_tensorflow.shape)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_list, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)

## Batch the data

In [8]:
class ModelConfig:
    def __init__(self, max_len, num_classes, embeddings_GloVe):
        self.max_len = max_len
        self.num_classes = num_classes
        self.embeddings_GloVe = embeddings_GloVe


# Create a configuration object
config = ModelConfig(max_len=max_len, num_classes=num_classes, embeddings_GloVe=embeddings_GloVe)

In [16]:
y_train

array([0, 1, 1, ..., 0, 0, 1], dtype=int64)

In [29]:
def CNN_LSTM_sequential(params, config):
    input_layer = Input(shape=(config.max_len,), dtype='int32')
    
    # Use config object for fixed parameters such as embeddings
    embedding = Embedding(input_dim=config.embeddings_GloVe.shape[0],
                          output_dim=config.embeddings_GloVe.shape[1],
                          weights=[config.embeddings_GloVe],
                          trainable=False)(input_layer)
    
    # Use params dictionary for dynamic hyperparameters
    dropout = Dropout(params['dropout_rate'])(embedding)

    conv = Conv1D(filters=params['conv_filters'], kernel_size=1, activation='relu')(dropout)
    conv = BatchNormalization()(conv)

    lstm = Bidirectional(LSTM(params['lstm_units'], return_sequences=True, dropout=0.006, recurrent_dropout=0.1))(conv)
    lstm = LayerNormalization()(lstm)
    
    num_heads = 8
    attention_layer = MultiHeadAttention(num_heads=num_heads, key_dim=config.embeddings_GloVe.shape[1] // num_heads, dropout=0.1)
    attention_output = attention_layer(query=lstm, key=lstm, value=lstm)
    attention_output = LayerNormalization()(attention_output)

    dense = Dense(params['dense_2_units'], activation='relu')(attention_output)
    dense = BatchNormalization()(dense)
    output = Dense(config.num_classes, activation='sigmoid')(dense)
    
    model = Model(inputs=input_layer, outputs=output)
    model.compile(loss='binary_crossentropy', optimizer=Adam(params['learning_rate']), metrics=['accuracy'])
    
    return model

In [11]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Conv1D, MaxPooling1D, Flatten, Dense, concatenate
from tensorflow.keras.models import Model

def simple_CNN_model(params, config):
    
    # Define input
    input_text = Input(shape=(max_len,), dtype=tf.int32)
    
    # Embedding Layer
    embedding_layer = Embedding(input_dim=config.embeddings_GloVe.shape[0],
                                output_dim=config.embeddings_GloVe.shape[1],
                                weights=[embeddings_GloVe],
                                trainable=False)
    
    embedded_seq = embedding_layer(input_text)

    # # Masking Layer
    # masked_input = Masking(mask_value=0.0)(embedded_seq)
    
    # # CNN Branch
    # reshaped_input = Reshape((max_len, embeddings_GloVe.shape[1]))(masked_input)
    
    # CNN Branch
    # reshaped_input = Reshape((max_len, embeddings_GloVe.shape[1]))(embedded_seq)
    dropout_cnn = Dropout(params['dropout_rate'])(embedded_seq)
    batch = BatchNormalization()(dropout_cnn)
    cnn = Conv1D(params['conv_filters'], 3, activation='relu')(batch)
    dropout_cnn = Dropout(0.2)(cnn)
    cnn = BatchNormalization()(dropout_cnn)
    cnn = Flatten()(cnn)

    # Dense Layer
    dense = Dense(params['dense_2_units'], activation='relu')(cnn)
    dropout_final = Dropout(0.2)(dense)

    # Output layer
    output = Dense(num_classes, activation='sigmoid')(dropout_final)  # Adjust the number of classes as needed

    # Define optimizer with the specified learning rate
    optimizer = Adam(learning_rate=params['learning_rate'])
    # Build the model
    model = Model(inputs=input_text, outputs=output)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [12]:
with open(r'C:\Users\bugat\GIT\Tekstanalyse\git_NLP_Notebooks\best_trial_length_100.json', 'r') as f:
    trial = json.load(f)
    params = trial['params']

In [13]:
params

{'lstm_units': 140,
 'dense_2_units': 150,
 'dropout_rate': 0.42803898610506674,
 'learning_rate': 0.000922823163674921,
 'conv_filters': 52}

In [30]:
model = CNN_LSTM_sequential(params, config)

In [31]:
# Compile and train the model
model.fit(X_train,y_train, 
          validation_data=(X_val, y_val), 
          epochs=20,
          batch_size= 32, 
          verbose=1)

ValueError: Failed to find data adapter that can handle input: <class 'numpy.ndarray'>, (<class 'list'> containing values of types {'(<class \'list\'> containing values of types {"<class \'numpy.int64\'>"})'})