In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout, concatenate





In [24]:
# Load the dataset to see its structure
url_data = 'https://raw.githubusercontent.com/TeodorRusKvi/Tekstanalyse/main/git_NLP_data/'

df = pd.read_csv(url_data + 'new_df.csv')

# Last inn 'X_train_LSTM' fra en CSV-fil
X_df = pd.read_csv(url_data+'X_tensorflow.csv')
# Konverter hele DataFrame til et NumPy array
X = X_df.to_numpy()

X_txt = pd.read_csv(url_data + 'X_text.csv')
X_txt = X_txt.to_numpy()
# Last inn 'y_train_LSTM' fra en CSV-fil
y_df = pd.read_csv(url_data+'y_liberal.csv')
# Konverter hele DataFrame til et NumPy array
y = y_df.to_numpy()

# Last inn 'y_train_LSTM' fra en CSV-fil
embeddings_GloVe = pd.read_csv(url_data+'embeddings_glove.csv')
# Konverter hele DataFrame til et NumPy array
embeddings_GloVe = embeddings_GloVe.to_numpy()

print('Shape of label tensor:', y.shape)
print('Shape of X:', X.shape)
print('Shape of embeddings_GloVe:', embeddings_GloVe.shape)

Shape of label tensor: (12854, 1)
Shape of X: (12854, 20)
Shape of embeddings_GloVe: (22235, 100)


In [3]:
# Defining pre-processing hyperparameters
max_len = 20
trunc_type = "post"
padding_type = "post"
vocab_size = len(embeddings_GloVe)
# This is fixed.
embedding_dim = 100
num_classes = 1

In [4]:
POS_tags = df['POS_Tags'].to_list()

# Initialize the tokenizer
POS_tokenizer = Tokenizer()

# Fit the tokenizer on the POS tags
POS_tokenizer.fit_on_texts(POS_tags)

# Convert POS tags to sequences
POS_sequences = POS_tokenizer.texts_to_sequences(POS_tags)

# Pad the sequences
POS_padded= pad_sequences(POS_sequences, maxlen=max_len, padding='post')

POS_dict = POS_tokenizer.word_index
POS_size = len(POS_dict)
POS_size

17

In [12]:
dep_tags = df['Dependency_Tags'].to_list()

Dep_tokenizer = Tokenizer()

# Fit the tokenizer on the POS tags
Dep_tokenizer.fit_on_texts(dep_tags)

# Convert POS tags to sequences
Dep_sequences = Dep_tokenizer.texts_to_sequences(dep_tags)

# Pad the sequences
Dep_padded= pad_sequences(Dep_sequences, maxlen=max_len, padding='post')

Dep_dict = Dep_tokenizer.word_index
Dep_size = len(Dep_dict)
Dep_size

42

In [None]:
Dep_padded

In [6]:
X_train_text, X_test_text, POS_train, POS_test, Dep_train, Dep_test, y_train, y_test = train_test_split(X, POS_padded, Dep_padded, y, test_size=0.7, random_state=42)
X_val_text, X_test_text, POS_val, POS_test, Dep_val, Dep_test, y_val, y_test = train_test_split(X_test_text, POS_test, Dep_test, y_test, test_size=0.7, random_state=42)

In [23]:
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, Conv1D, MaxPooling1D, Dense, Bidirectional, LSTM, AdditiveAttention, GlobalAveragePooling1D, Dropout, concatenate
from tensorflow.keras.models import Model

# Text pathway
text_input = Input(shape=(max_len,), dtype='int32', name='text_input')
text_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embeddings_GloVe], trainable=False)(text_input)
text_spatial_dropout = SpatialDropout1D(0.2)(text_embedding)
text_conv1d = Conv1D(filters=60, kernel_size=1, activation='relu')(text_spatial_dropout)
text_max_pooling = MaxPooling1D()(text_conv1d)
text_conv_dense = Dense(100, activation='relu')(text_max_pooling)
text_lstm = Bidirectional(LSTM(64, activation='tanh', recurrent_activation='sigmoid', recurrent_dropout=0.1, return_sequences=True))(text_conv_dense)

# POS pathway
pos_input = Input(shape=(max_len,), dtype='int32', name='pos_input')
pos_embedding = Embedding(input_dim=POS_size+1, output_dim=10)(pos_input)
pos_spatial_dropout = SpatialDropout1D(0.2)(pos_embedding)
pos_conv1d = Conv1D(filters=20, kernel_size=1, activation='relu')(pos_spatial_dropout)
pos_max_pooling = MaxPooling1D()(pos_conv1d)
pos_conv_dense = Dense(50, activation='relu')(pos_max_pooling)
pos_lstm = Bidirectional(LSTM(20, activation='tanh', recurrent_activation='sigmoid', recurrent_dropout=0.1, return_sequences=True))(pos_conv_dense)

# Dep pathway
dep_input = Input(shape=(max_len,), dtype='int32', name='dep_input')
dep_embedding = Embedding(input_dim=Dep_size+1, output_dim=10)(dep_input)
dep_spatial_dropout = SpatialDropout1D(0.2)(dep_embedding)
dep_conv1d = Conv1D(filters=20, kernel_size=1, activation='relu')(dep_spatial_dropout)
dep_max_pooling = MaxPooling1D()(dep_conv1d)
dep_conv_dense = Dense(50, activation='relu')(dep_max_pooling)
dep_lstm = Bidirectional(LSTM(20, activation='tanh', recurrent_activation='sigmoid', recurrent_dropout=0.1, return_sequences=True))(dep_conv_dense)

# Combine pathways
combined = concatenate([text_lstm, pos_lstm, dep_lstm])

# Attention
attention_layer = AdditiveAttention(use_scale=False)
attention_output = attention_layer([combined, combined], return_attention_scores=False)
attention_output = GlobalAveragePooling1D()(attention_output)

# Final dense layers
dense_relu = Dense(100, activation='relu')(attention_output)
dropout = Dropout(0.20)(dense_relu)
output_layer = Dense(num_classes, activation='sigmoid')(dropout)

# Create and compile model
model = Model(inputs=[text_input, pos_input, dep_input], outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display model summary
model.summary()


Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 text_input (InputLayer)     [(None, 20)]                 0         []                            
                                                                                                  
 pos_input (InputLayer)      [(None, 20)]                 0         []                            
                                                                                                  
 dep_input (InputLayer)      [(None, 20)]                 0         []                            
                                                                                                  
 embedding_15 (Embedding)    (None, 20, 100)              2223500   ['text_input[0][0]']          
                                                                                            

In [9]:
# model = model_build(vocab_size=vocab_size, max_len=max_len, embedding_dim=embedding_dim, embeddings=embeddings_GloVe, 
                    # POS_size=POS_size, Dep_size=Dep_size, num_classes=num_classes)


In [21]:
EPOCHS=20
BATCH_SIZE = 32


early_stopping_callback = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)


history = model.fit([X_train_text, POS_train, Dep_train], y_train,
                    epochs=EPOCHS,
                    validation_data=([X_val_text, POS_val, Dep_val], y_val),
                    batch_size=BATCH_SIZE,
                    callbacks=[early_stopping_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


In [22]:
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate([X_test_text, POS_test, Dep_test], y_test, batch_size=BATCH_SIZE)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

Test Loss: 0.5533388257026672
Test Accuracy: 0.7315446734428406
