In [1]:
import optuna
# import wandb
import logging
import sys
import os
import tensorflow as tf
import json
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, Dense, LSTM, Dropout, Bidirectional, MaxPooling1D, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Concatenate, BatchNormalization, MultiHeadAttention, LayerNormalization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, Callback
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer





In [37]:
url_data = 'https://raw.githubusercontent.com/TeodorRusKvi/Tekstanalyse/main/git_NLP_data/'

# Last inn 'y_train_LSTM' fra en CSV-fil
y_df = pd.read_csv(url_data+'y_liberal.csv')
# Konverter hele DataFrame til et NumPy array
y = y_df.to_numpy()

# Last inn 'y_train_LSTM' fra en CSV-fil
embeddings_GloVe = pd.read_csv(url_data+'embeddings_glove.csv')
# Konverter hele DataFrame til et NumPy array
embeddings_GloVe = embeddings_GloVe.to_numpy()

# Last inn 'X_train_LSTM' fra en CSV-fil
X_df = pd.read_csv(url_data+'new_df.csv')

In [38]:
for col in X_df.columns:
    X_df[col] = X_df[col].astype(str)

In [42]:
X_1= X_df['with_stopwords']

# Konverter kolonnen til et NumPy array
X = X_1.to_numpy().flatten()

In [5]:
all_texts_length = X_df['without_stopwords'].apply(lambda x: len(x.split()))
# Now, let's analyze the distribution of these sequence lengths
all_texts_length.describe()

count    12854.000000
mean        47.516104
std        182.683952
min          1.000000
25%         10.000000
50%         13.000000
75%         23.000000
max       5634.000000
Name: without_stopwords, dtype: float64

In [6]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.85, min_df=0.01, stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(X)

# Get feature names to later identify words by index
feature_names = tfidf_vectorizer.get_feature_names_out()

# Calculate the average TF-IDF score for each word across all documents
avg_scores = np.mean(X_tfidf.toarray(), axis=0)

# Set a threshold, for example, the mean of average scores
threshold = np.mean(avg_scores)

# Filter words that have a score above the threshold
important_words = [feature_names[i] for i in range(len(feature_names)) if avg_scores[i] > threshold]

In [43]:
# Filter corpus to keep only important words
filtered_texts = []
for doc in X_1:
    tokens = doc.split()
    filtered_tokens = [token for token in tokens if token in important_words]
    filtered_texts.append(' '.join(filtered_tokens))

# Tokenize texts
tokenizer = Tokenizer(oov_token='OOV')
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

# Pad sequences to have the same length
max_seq_length = max(len(seq) for seq in sequences)

In [44]:
#Creating a word index of the words from the tokenizer 
word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.\n\nFirst 10 is listen below:')
print(dict(list(word_index.items())[0:10]))

Found 22333 unique tokens.

First 10 is listen below:
{'OOV': 1, 'nan': 2, 'people': 3, 'like': 4, 'work': 5, 'right': 6, 'trump': 7, 'think': 8, 'state': 9, 'government': 10}


In [51]:
# Defining pre-processing hyperparameters for the networks
max_len = 100
trunc_type = "post"
padding_type = "post"
vocab_size = len(word_index)
# This is fixed.
embedding_dim = 100
EPOCHS=20
BATCH_SIZE = 32
num_classes = 1

# Padding the sequences to keep the lengths uniform
X = pad_sequences(sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)
# print('Shape of data tensor:', X_tensorflow.shape)

In [52]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)

In [47]:
class ModelConfig:
    def __init__(self, max_len, num_classes, embeddings_GloVe):
        self.max_len = max_len
        self.num_classes = num_classes
        self.embeddings_GloVe = embeddings_GloVe


# Create a configuration object
config = ModelConfig(max_len=max_len, num_classes=num_classes, embeddings_GloVe=embeddings_GloVe)

In [54]:
def CNN_LSTM_sequential(params, config):
    input_layer = Input(shape=(config.max_len,), dtype='int32')
    
    # Use config object for fixed parameters such as embeddings
    embedding = Embedding(input_dim=config.embeddings_GloVe.shape[0],
                          output_dim=config.embeddings_GloVe.shape[1],
                        #   weights=[config.embeddings_GloVe],
                          trainable=True)(input_layer)
    
    # Use params dictionary for dynamic hyperparameters
    dropout = Dropout(params['dropout_rate'])(embedding)

    conv = Conv1D(filters=params['conv_filters'], kernel_size=1, activation='relu')(dropout)
    conv = BatchNormalization()(conv)

    lstm = Bidirectional(LSTM(params['lstm_units'], return_sequences=True, dropout=0.006, recurrent_dropout=0.1))(conv)
    lstm = LayerNormalization()(lstm)
    
    num_heads = 8
    attention_layer = MultiHeadAttention(num_heads=num_heads, key_dim=config.embeddings_GloVe.shape[1] // num_heads, dropout=0.1)
    attention_output = attention_layer(query=lstm, key=lstm, value=lstm)
    attention_output = LayerNormalization()(attention_output)

    dense = Dense(params['dense_2_units'], activation='relu')(attention_output)
    dense = BatchNormalization()(dense)
    output = Dense(config.num_classes, activation='sigmoid')(dense)
    
    model = Model(inputs=input_layer, outputs=output)
    model.compile(loss='binary_crossentropy', optimizer=Adam(params['learning_rate']), metrics=['accuracy'])
    
    return model

In [13]:
import sys
import logging
import optuna
from tensorflow.keras.callbacks import EarlyStopping
from optuna.integration import KerasPruningCallback
import optuna_dashboard
import os

In [14]:
# # Define the objective function including the logging of each trial's outcome
# def objective(trial):
#     # wandb.init(project="optuna_with_wandb", entity="your_entity_here", reinit=True)
#     params = {
#         'lstm_units': trial.suggest_int('lstm_units', 135, 145, step=1),
#         # 'dense_1_units': trial.suggest_int('dense_1_units', 100, 150, step=5),
#         'dense_2_units': trial.suggest_int('dense_2_units', 140, 160, step=2),
#         'dropout_rate': trial.suggest_float('dropout_rate', 0.3, 0.5),
#         # 'lstm_dropout': trial.suggest_float('lstm_dropout', 0.0, 0.2),
#         # 'lstm_recurrent': trial.suggest_float('lstm_recurrent', 0.0, 0.2),
#         'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True),
#         'conv_filters': trial.suggest_int('conv_filters', 46, 60, step=2)
#     }

#     # # Log hyperparameters to wandb
#     # wandb.config.update(params)

#     # Build and train the model
#     model = CNN_LSTM_sequential(params, config)

#     #Callbacks
#     early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
#     pruning_callback = KerasPruningCallback(trial, 'val_loss')  # Create a pruning callback
#     # Fit the model
#     history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10,
#                         callbacks=[early_stopping, pruning_callback], batch_size=32, verbose=1)

#     # Evaluate the model
#     loss, accuracy = model.evaluate(X_val, y_val, verbose=1)
    
#     # Log final metrics to wandb
#     # wandb.log({"loss": loss, "accuracy": accuracy})

#     # Ensure wandb run is finished after each trial
#     # wandb.finish()
    
#     # trial.report(accuracy, 1)  # Report the accuracy to Optuna
#     return accuracy 


# # Setup Optuna with persistent storage
# storage_url = "sqlite:///db.sqlite3"
# study_name = 'Proto_koer'
# # optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
# study = optuna.create_study(direction='maximize', 
#                             sampler=optuna.samplers.TPESampler(), 
#                             study_name=study_name, 
#                             storage=storage_url, 
#                             pruner=optuna.pruners.MedianPruner(),  # Adding a pruner
#                             load_if_exists=True)

# study.optimize(objective, n_trials=100)
# print(f"Best value: {study.best_value} (params: {study.best_params})")

In [15]:
# # Data to be saved as JSON
# data = {
#     "params": {
#         "lstm_units": 135,
#         "dense_2_units": 160,
#         "dropout_rate": 0.43832284397692234,
#         "learning_rate": 0.0004893640558066397,
#         "conv_filters": 56
#     },
#     "best_accuracy": 0.73934644460678
# }

# # File path for JSON file
# file_path = 'best_trial_tf_idf.json'

# # Saving data as JSON
# with open(file_path, 'w') as file:
#     json.dump(data, file)

In [49]:
# Loading the params from CNN-LSTM from the saved JSON-fil
with open(r'C:\Users\bugat\Prosjekter\Tekstanalyse\git_NLP\Tekstanalyse\git_NLP_Notebooks\best_trial_tf_idf.json', 'r') as f:
    data = json.load(f)
    params = data['params']

In [55]:
best_model = CNN_LSTM_sequential(params, config)

best_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=30, batch_size=32, verbose=1)
# Retrain on the full training data

# Evaluate on the test data
loss_1, accuracy_1 = best_model.evaluate(X_test, y_test, verbose=1)
print(f"Test Accuracy: {accuracy_1}")

Epoch 1/30
  2/282 [..............................] - ETA: 1:16 - loss: 1.4209 - accuracy: 0.4859 

InvalidArgumentError: Graph execution error:

Detected at node model_4/embedding_4/embedding_lookup defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "c:\Users\bugat\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>

  File "c:\Users\bugat\anaconda3\Lib\site-packages\traitlets\config\application.py", line 992, in launch_instance

  File "c:\Users\bugat\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start

  File "c:\Users\bugat\anaconda3\Lib\site-packages\tornado\platform\asyncio.py", line 195, in start

  File "c:\Users\bugat\anaconda3\Lib\asyncio\windows_events.py", line 321, in run_forever

  File "c:\Users\bugat\anaconda3\Lib\asyncio\base_events.py", line 607, in run_forever

  File "c:\Users\bugat\anaconda3\Lib\asyncio\base_events.py", line 1922, in _run_once

  File "c:\Users\bugat\anaconda3\Lib\asyncio\events.py", line 80, in _run

  File "c:\Users\bugat\anaconda3\Lib\site-packages\ipykernel\kernelbase.py", line 534, in dispatch_queue

  File "c:\Users\bugat\anaconda3\Lib\site-packages\ipykernel\kernelbase.py", line 523, in process_one

  File "c:\Users\bugat\anaconda3\Lib\site-packages\ipykernel\kernelbase.py", line 429, in dispatch_shell

  File "c:\Users\bugat\anaconda3\Lib\site-packages\ipykernel\kernelbase.py", line 767, in execute_request

  File "c:\Users\bugat\anaconda3\Lib\site-packages\ipykernel\ipkernel.py", line 429, in do_execute

  File "c:\Users\bugat\anaconda3\Lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell

  File "c:\Users\bugat\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3051, in run_cell

  File "c:\Users\bugat\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3106, in _run_cell

  File "c:\Users\bugat\anaconda3\Lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner

  File "c:\Users\bugat\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3311, in run_cell_async

  File "c:\Users\bugat\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3493, in run_ast_nodes

  File "c:\Users\bugat\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3553, in run_code

  File "C:\Users\bugat\AppData\Local\Temp\ipykernel_38848\2982834495.py", line 3, in <module>

  File "c:\Users\bugat\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "c:\Users\bugat\anaconda3\Lib\site-packages\keras\src\engine\training.py", line 1807, in fit

  File "c:\Users\bugat\anaconda3\Lib\site-packages\keras\src\engine\training.py", line 1401, in train_function

  File "c:\Users\bugat\anaconda3\Lib\site-packages\keras\src\engine\training.py", line 1384, in step_function

  File "c:\Users\bugat\anaconda3\Lib\site-packages\keras\src\engine\training.py", line 1373, in run_step

  File "c:\Users\bugat\anaconda3\Lib\site-packages\keras\src\engine\training.py", line 1150, in train_step

  File "c:\Users\bugat\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "c:\Users\bugat\anaconda3\Lib\site-packages\keras\src\engine\training.py", line 590, in __call__

  File "c:\Users\bugat\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "c:\Users\bugat\anaconda3\Lib\site-packages\keras\src\engine\base_layer.py", line 1149, in __call__

  File "c:\Users\bugat\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 96, in error_handler

  File "c:\Users\bugat\anaconda3\Lib\site-packages\keras\src\engine\functional.py", line 515, in call

  File "c:\Users\bugat\anaconda3\Lib\site-packages\keras\src\engine\functional.py", line 672, in _run_internal_graph

  File "c:\Users\bugat\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "c:\Users\bugat\anaconda3\Lib\site-packages\keras\src\engine\base_layer.py", line 1149, in __call__

  File "c:\Users\bugat\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 96, in error_handler

  File "c:\Users\bugat\anaconda3\Lib\site-packages\keras\src\layers\core\embedding.py", line 272, in call

indices[9,0] = 22322 is not in [0, 22235)
	 [[{{node model_4/embedding_4/embedding_lookup}}]] [Op:__inference_train_function_26503]

In [None]:
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_acc}, Test Loss: {test_loss}')

NameError: name 'model' is not defined

In [None]:
plt.figure(figsize=(9,7))
plt.title('Accuracy score')
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.legend(['accuracy', 'val_accuracy'])
plt.show()
plt.figure(figsize=(9,7))
plt.title('Loss value')
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['loss', 'val_loss'])
plt.show()