In [1]:
# pip install tensorflow==2.15.1 

In [2]:
# Print the installed Keras version for reference.
import keras

keras_version = keras.__version__
print(keras_version)


2.15.0


In [3]:
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import wandb
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Embedding, Conv1D, Dense, LSTM, Dropout, TimeDistributed, Bidirectional, Concatenate, GlobalAveragePooling1D, AdditiveAttention
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from wandb.keras import WandbCallback

In [4]:
# Base URL of the raw GitHub folder that hosts the CSV files read by this notebook.
url_data = 'https://raw.githubusercontent.com/TeodorRusKvi/Tekstanalyse/main/git_NLP_data/'

df = pd.read_csv(url_data + 'new_df.csv')

# Collapse the dotted abbreviations into plain tokens. With regex=True the
# patterns are regular expressions, so every '.' is escaped to match a literal
# period only (an unescaped 'U.S.' would also rewrite strings such as 'USSR').
# The longer pattern is listed first so 'U.S.A.' becomes 'USA' before the
# shorter 'U.S.' rule can consume its prefix and leave 'USA.' behind.
df['All_text'] = df['All_text'].replace([r'U\.S\.A\.', r'U\.S\.'], ['USA', 'US'], regex=True)

# Missing values are filled with 0 and then cast, so they end up as the string '0'.
df['Processed'] = df['Processed'].fillna(0).astype(str)
df['All_text'] = df['All_text'].fillna(0).astype(str)

# df.to_csv('new_df.csv', index=False)

# Both text columns as plain Python lists.
all_texts = df['All_text'].to_list()
texts = df['Processed'].to_list()

# The tokenizer is fitted on the 'Processed' column.
corpus = texts

# num_words could additionally be passed here to cap the vocabulary size.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(corpus)

sequences = tokenizer.texts_to_sequences(corpus)

word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.\n\nFirst 10 is listen below:')
print(dict(list(word_index.items())[0:10]))

Found 22234 unique tokens.

First 10 is listen below:
{'<OOV>': 1, 'people': 2, 'like': 3, 'work': 4, 'right': 5, 'trump': 6, 'think': 7, 'state': 8, 'government': 9, 'party': 10}


In [5]:
# Re-read df from a CSV file on local disk; this replaces the frame loaded earlier.
# NOTE(review): absolute, machine-specific path - this cell only runs where that file exists.
local_csv_path = r'C:\Users\bugat\Prosjekter\Tekstanalyse\git_NLP\Tekstanalyse\git_NLP_data\file_name.csv'
df = pd.read_csv(local_csv_path)

In [6]:
# Number of rows per distinct value in the 'Political Lean' column.
lean_labels = df['Political Lean']
political_lean_counts = lean_labels.value_counts()
political_lean_counts

Political Lean
Liberal         8319
Conservative    4535
Name: count, dtype: int64

In [7]:
# Load 'X_tensorflow.csv' and convert the whole DataFrame to a NumPy array.
X_train_LSTM = pd.read_csv(url_data + 'X_tensorflow.csv').to_numpy()

# Load the labels from 'y_liberal.csv'; the DataFrame is kept and also exposed as a NumPy array.
y_train_df = pd.read_csv(url_data + 'y_liberal.csv')
y_train_LSTM = y_train_df.to_numpy()

# Load the embedding matrix from 'embeddings_glove.csv' and convert it to a NumPy array.
embeddings_GloVe = pd.read_csv(url_data + 'embeddings_glove.csv').to_numpy()

# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_LSTM,
    y_train_LSTM,
    test_size=0.2,
    random_state=42,
)

In [8]:
class TextClassifier_WB:
    """Builder for a Conv1D + bidirectional-LSTM classifier configured from ``wandb.config``.

    Constructor arguments (stored unchanged as attributes of the same name):
        input_shape: sequence length, used as ``shape=(input_shape,)`` of the int32 input.
        embeddings_GloVe: embedding matrix; its ``shape[0]`` and ``shape[1]`` size the
            Embedding layer, which is initialised from it and kept non-trainable.
        num_classes: number of units in the sigmoid output layer.
        parallel_blocks: how many parallel Conv1D/LSTM branches are created.
    """

    def __init__(self, input_shape, embeddings_GloVe, num_classes, parallel_blocks):
        self.input_shape = input_shape
        self.embeddings_GloVe = embeddings_GloVe
        self.num_classes = num_classes
        self.parallel_blocks = parallel_blocks

    def _build_branch(self, embedded, hparams):
        """Create one Conv1D -> TimeDistributed(Dense) -> Bidirectional(LSTM) branch.

        Returns:
            A pair ``(dense_out, lstm_out)``: the time-distributed dense output and
            the bidirectional LSTM output computed from it.
        """
        conv_out = Conv1D(
            filters=hparams.conv_filter_units,
            kernel_size=1,
            activation='relu',
            padding='same',
            strides=1,
        )(embedded)
        dense_out = TimeDistributed(Dense(hparams.dense_units, activation='relu'))(conv_out)
        recurrent = LSTM(
            units=hparams.lstm_units,
            return_sequences=True,
            dropout=hparams.lstm_dropout_rate,
            recurrent_dropout=hparams.lstm_r_dropout_rate,
        )
        lstm_out = Bidirectional(recurrent)(dense_out)
        return dense_out, lstm_out

    def build(self):
        """Assemble and compile the Keras model from the current ``wandb.config``.

        Hyperparameters read from ``wandb.config``: conv_filter_units, dense_units,
        lstm_units, lstm_dropout_rate, lstm_r_dropout_rate, dropout_rate and
        learning_rate.

        Returns:
            A ``Model`` compiled with Adam, binary cross-entropy loss and the
            accuracy metric.
        """
        hparams = wandb.config

        vocab_size = self.embeddings_GloVe.shape[0]
        embedding_dim = self.embeddings_GloVe.shape[1]

        token_ids = Input(shape=(self.input_shape,), dtype='int32')
        embedded = Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            weights=[self.embeddings_GloVe],
            trainable=False,
        )(token_ids)

        # Every branch consumes the same embedded sequence.
        branches = [self._build_branch(embedded, hparams) for _ in range(self.parallel_blocks)]
        conv_outputs = [dense_out for dense_out, _ in branches]
        lstm_outputs = [lstm_out for _, lstm_out in branches]

        # All dense outputs first, then all LSTM outputs, joined by Concatenate.
        merged = Concatenate()(conv_outputs + lstm_outputs)
        # The merged tensor is passed twice in the attention layer's input list.
        attended = AdditiveAttention(use_scale=True)(
            [merged, merged], return_attention_scores=False)
        pooled = GlobalAveragePooling1D()(attended)

        hidden = Dense(units=hparams.dense_units, activation='relu')(pooled)
        hidden = Dropout(hparams.dropout_rate)(hidden)
        prediction = Dense(self.num_classes, activation='sigmoid')(hidden)

        model = Model(inputs=token_ids, outputs=prediction)
        model.compile(
            optimizer=Adam(learning_rate=hparams.learning_rate),
            loss='binary_crossentropy',
            metrics=['accuracy'],
        )
        return model

# Start a Weights & Biases run in the "Beast_model" project.
# NOTE(review): the sweep below is registered under "text_classification" and
# train() opens its own run per trial - confirm this extra run is intended.
wandb.init(project="Beast_model")


def _int_uniform(low, high):
    """Return a sweep parameter spec using the 'int_uniform' distribution with the given min/max."""
    return {'distribution': 'int_uniform', 'min': low, 'max': high}


# Sweep definition: search method, the metric to optimise and the hyperparameter ranges.
sweep_config = {
    'method': 'bayes',  # alternatives: 'grid', 'random'
    'metric': {'name': 'val_accuracy', 'goal': 'maximize'},
    'parameters': {
        'conv_filter_units': {'values': [30, 31, 32]},
        # NOTE(review): dense_lstm_units is not read by TextClassifier_WB.build - confirm it is still needed.
        'dense_lstm_units': _int_uniform(114, 125),
        'lstm_units': _int_uniform(100, 110),
        'dense_units': _int_uniform(142, 152),
        'lstm_dropout_rate': {'min': 0.0, 'max': 0.04},
        'lstm_r_dropout_rate': {'min': 0.01, 'max': 0.06},
        'learning_rate': {
            'distribution': 'log_uniform',
            'min': -9.21,  # log(1e-4)
            'max': -4.61,  # log(1e-2)
        },
        'dropout_rate': {'min': 0.0, 'max': 0.1},
    },
}

# Fixed model settings passed to TextClassifier_WB by train().
input_length = 20
num_classes = 1
parallel_blocks = 2

# Register the sweep and keep its id for the agent.
sweep_id = wandb.sweep(sweep_config, project="text_classification")

# Function to train the model
def train():
    """Run one sweep trial: build the model from the run's config and fit it.

    Passed to ``wandb.agent`` as the trial function. Uses the module-level
    ``input_length``, ``embeddings_GloVe``, ``num_classes``, ``parallel_blocks``
    and the ``X_train``/``y_train``/``X_test``/``y_test`` arrays. Trains for
    5 epochs, validating on ``(X_test, y_test)`` and logging through
    ``WandbCallback``.
    """
    # The run is used as a context manager so it is always finished, even when
    # building or fitting the model raises; previously wandb.finish() was
    # skipped on any exception and the run was left open.
    with wandb.init(reinit=True):
        hypermodel = TextClassifier_WB(input_length, embeddings_GloVe, num_classes, parallel_blocks)
        model = hypermodel.build()

        model.fit(
            X_train,
            y_train,
            validation_data=(X_test, y_test),
            epochs=5,
            callbacks=[WandbCallback()],
        )

# Launch an agent for the sweep; it calls train() for each trial it receives.
# NOTE(review): no `count` is passed - confirm how the agent is expected to stop.
wandb.agent(sweep_id, function=train)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mteodor-ruskvi[0m ([33mteodor_ruskvi[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011277777777609622, max=1.0…



Create sweep with ID: q08mwc1p
Sweep URL: https://wandb.ai/teodor_ruskvi/text_classification/sweeps/q08mwc1p


Exception in thread Exception in thread ChkStopThrIntMsgThr:
:
Traceback (most recent call last):
Traceback (most recent call last):
  File "c:\Users\bugat\anaconda3\Lib\threading.py", line 1045, in _bootstrap_inner
  File "c:\Users\bugat\anaconda3\Lib\threading.py", line 1045, in _bootstrap_inner
[34m[1mwandb[0m: Agent Starting Run: zk78vq6k with config:
    [34m[1mwandb[0m: 	conv_filter_units: 30
self.run()    
self.run()
  File "c:\Users\bugat\anaconda3\Lib\threading.py", line 982, in run
  File "c:\Users\bugat\anaconda3\Lib\threading.py", line 982, in run
[34m[1mwandb[0m: 	dense_lstm_units: 124
[34m[1mwandb[0m: 	dense_units: 143
    self._target(*self._args, **self._kwargs)
  File "c:\Users\bugat\anaconda3\Lib\site-packages\wandb\sdk\wandb_run.py", line 300, in check_internal_messages
    self._target(*self._args, **self._kwargs)
  File "c:\Users\bugat\anaconda3\Lib\site-packages\wandb\sdk\wandb_run.py", line 286, in check_stop_status
    self._loop_check_status(
  File






Epoch 1/5




In [None]:
# Save the model to disk.
# NOTE(review): best_model is not assigned in any cell shown here, and the
# target is an absolute, machine-specific path.
model_save_dir = r'C:\Users\bugat\Prosjekter\Tekstanalyse\git_NLP\Tekstanalyse\models\Beast_model'
best_model.save(model_save_dir)

INFO:tensorflow:Assets written to: C:\Users\bugat\Prosjekter\Tekstanalyse\git_NLP\Tekstanalyse\models\Beast_model\assets


INFO:tensorflow:Assets written to: C:\Users\bugat\Prosjekter\Tekstanalyse\git_NLP\Tekstanalyse\models\Beast_model\assets


In [None]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections) of best_model.
best_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 20)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 20, 100)              2223500   ['input_1[0][0]']             
                                                                                                  
 conv1d (Conv1D)             (None, 20, 31)               3131      ['embedding[0][0]']           
                                                                                                  
 conv1d_1 (Conv1D)           (None, 20, 31)               3131      ['embedding[0][0]']           
                                                                                              

In [None]:
n_splits = 5
# Upper bound on epochs per fold. Kept separate from n_splits: the two were
# previously the same variable, so changing the fold count silently changed
# the training length as well.
n_epochs = 5

KF = KFold(n_splits=n_splits, shuffle=True, random_state=42)  # 5-fold cross-validation

# Per-fold results.
fold_no = 1
loss_per_fold = []
acc_per_fold = []

# Snapshot the weights once so every fold starts from the same state. Without
# this the same model keeps training across folds and is scored on samples it
# already saw in earlier folds, which inflates the reported metrics.
# NOTE(review): optimizer state is not reset between folds, and best_model is
# not assigned in any cell shown here - confirm it has not already been
# trained on these rows.
initial_weights = best_model.get_weights()

# The index arrays are named train_idx/test_idx so they do not overwrite the
# train() function defined for the sweep.
for train_idx, test_idx in KF.split(X_train_LSTM, y_train_LSTM):
    # Restore the starting weights for this fold.
    best_model.set_weights(initial_weights)

    # Fit on the training part of the fold; early stopping monitors the loss
    # on the held-out part.
    best_model.fit(
        X_train_LSTM[train_idx],
        y_train_LSTM[train_idx],
        epochs=n_epochs,
        batch_size=64,
        callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)],
        validation_data=(X_train_LSTM[test_idx], y_train_LSTM[test_idx]),
        verbose=1,  # set to 0 to reduce logs
    )

    # Evaluate on the held-out part of the fold.
    scores = best_model.evaluate(X_train_LSTM[test_idx], y_train_LSTM[test_idx], verbose=0)
    print(f'Score for fold {fold_no}: {best_model.metrics_names[0]} of {scores[0]}; {best_model.metrics_names[1]} of {scores[1]*100}%')
    loss_per_fold.append(scores[0])
    acc_per_fold.append(scores[1] * 100)
    fold_no += 1

# Print average scores
print(f'Average scores for all folds:\n> Loss: {np.mean(loss_per_fold)}; Accuracy: {np.mean(acc_per_fold)}%')