In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

2024-11-24 22:21:12.782805: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-24 22:21:12.791027: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-24 22:21:12.813678: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-24 22:21:12.851009: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-24 22:21:12.861306: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-24 22:21:12.898665: I tensorflow/core/platform/cpu_feature_gu

# Data Processing

This section outlines the steps involved in processing the data before further analysis or modeling.


1. General Data Processing.

2. Convert the following fields to lowercase for uniformity:
   - **Name**
   - **fieldlabel**

   This ensures that comparisons are case-insensitive.

## Grouping Forms

3. Group the forms based on:
   - **Form ID**(customform)
   - **name**


In [2]:

df = pd.read_csv('data/all_data.csv', on_bad_lines='skip')

#handle blocktype(removeit).
indices_to_drop = df[df['elementtype'] == 'block'].index
df = df.drop(indices_to_drop)

#put specialtype in place.
for index, row in df.iterrows():
    if row['elementtype'] == 'special' and pd.isnull(row['fieldtype']):  # Check if 'elementtype' is empty
        df.at[index, 'fieldtype'] = row['specialtype']  # Assign 'specialtype' value to 'elementtype'

df = df.drop(columns=['Kunde','fieldtype','specialtype','elementtype', 'customname','fieldobjectid','fieldparentid','fieldparenttype','blocktype','fieldrelation','language'])
df['fieldlabel'] = df['fieldlabel'].apply(str.lower)
df['name'] = df['name'].apply(str.lower)
data = df.groupby(['customform','name'])['fieldlabel'].apply(list).reset_index()
display(data)

Unnamed: 0,customform,name,fieldlabel
0,770,projektportefølje,"[projekt nr., oprettet den, oprettet af, proje..."
1,775,projektområde,[tekst]
2,876,claims,"[processor, description of claim, complaint ca..."
3,876,kundereklamation,"[vælg kunde, gadenavn, postnr, by, telefonnumm..."
4,876,reklamation,"[vælg kunde, e-mail, kontaktperson, behandles ..."
...,...,...,...
854,2584010,formularkatalog,"[oprettet af, oprettet, blank linje, seneste æ..."
855,2584018,formularkatalog - kategori,[kategori]
856,2584251,formularkatalog - øvrige vurderinger,"[hvad gør formularen god?, vurderet af, vurder..."
857,2584255,formularkatalog - vurderingsskala,[vurdering]


# Prepare Data for Training

## Create a Mapping Between Label and Index

1. **Label to Index Mapping**:  
   Create a mapping between the **field label** and a unique index. This allows us to efficiently handle categorical data during training.

## Create Encoding for "Field Label"

2. **Field Label Encoding**:  
   Encode the **"fieldlabel"** by counting how many times each label appears within a form. This gives us a numeric representation of label frequency in the form.

## Create Embedding for "Form Name"

3. **Form Name Embedding**:  
   Create an embedding for the **"form name"** using Tokenizer This will convert the form name into a dense vector representation.

## Pad the Embedding

4. **Padding the Embedding**:  
   To ensure uniformity in input length, **pad the embeddings** so that all embeddings have the same length. This is necessary for feeding the data into a model.

## Split the Data for Testing

5. **Testing Data Split**:  
   Split the dataset, keeping the last **100 forms** aside for testing purposes. This will allow us to evaluate the performance of the model on unseen data.



In [3]:

# Get all unique labels
all_labels = list(set(label for labels in data['fieldlabel'] for label in labels))

# Map labels to indices
label_to_index = {label: i for i, label in enumerate(all_labels)}
index_to_label = {i: label for label, i in label_to_index.items()}

# Total number of examples
total_samples = len(data)

# Split index (last 100 for testing, adjust for your data size)
split_index = max(0, total_samples - 100)

# Training and test split
train_data = data.iloc[:split_index]  # First part for training
test_data = data.iloc[split_index:]  # Last 100 for testing

# Prepare training data
X_train_names = train_data['name'].tolist()
Y_train_labels = np.zeros((len(train_data), len(all_labels)), dtype=np.float32)

#set one-hot encoding for field labels
for i, labels in enumerate(train_data['fieldlabel']):
    Y_train_labels[i, [label_to_index[label] for label in labels]] = 1

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_names) 
X_train_sequences = tokenizer.texts_to_sequences(X_train_names)
max_name_len = max(len(seq) for seq in X_train_sequences)  
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_name_len, padding='post')


# Prepare test data
X_test_names = test_data['name'].tolist()
Y_test_labels = np.zeros((len(test_data), len(all_labels)), dtype=np.float32)

for i, labels in enumerate(test_data['fieldlabel']):
    for label in labels:
        Y_test_labels[i, label_to_index[label]] = 1

X_test_sequences = tokenizer.texts_to_sequences(X_test_names)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_name_len, padding='post')


## Model Architecture

The model is composed of two main components:

### 1. LSTM (Sequence to Vector)

- **Purpose**: The first part of the model uses an **LSTM** (Long Short-Term Memory) network. This component is designed to process the sequence of embedded form names and convert them into a vector representation.
  
- **Input**: The input to the LSTM is the **embedded form name**, which has been preprocessed and embedded into a vector format.
  
- **Output**: The output of the LSTM is a dense vector that represents the form, capturing the sequence of the form name and its contextual information.

### 2. Multi-output Classifier

- **Purpose**: The second part of the model is a **multi-output classifier**. This classifier predicts the probability of the appearance of each label in a given form.


In [15]:
vocab_size = len(tokenizer.word_index) + 1  # Include padding
embedding_dim = 128  # Dimension of embedding space
optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)

name_input = Input(shape=(max_name_len,))

# Embedding layer with masking enabled
x = Embedding(vocab_size, embedding_dim, mask_zero=True)(name_input)

# LSTM layer for sequence-to-vector encoding
x = LSTM(32, return_sequences=False)(x)

# Output layer for multi-label classification
output = Dense(len(all_labels), activation='sigmoid')(x)

# Build the model
model = Model(name_input, output)

# Compile the model with an optimizer and loss function
model.compile(optimizer= optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_padded, Y_train_labels, epochs=20, batch_size=32)


Epoch 1/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 26ms/step - accuracy: 0.0156 - loss: 0.2738
Epoch 2/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - accuracy: 0.0049 - loss: 0.0369
Epoch 3/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.0132 - loss: 0.0221
Epoch 4/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.0274 - loss: 0.0161
Epoch 5/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.0157 - loss: 0.0139
Epoch 6/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.0146 - loss: 0.0120
Epoch 7/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.0170 - loss: 0.0117
Epoch 8/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.0149 - loss: 0.0120
Epoch 9/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7f49b8527c70>

## Testing

In [16]:
test_loss, test_accuracy = model.evaluate(X_test_padded, Y_test_labels, batch_size=32)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.0000e+00 - loss: 0.0674 
Test Loss: 0.0702538788318634, Test Accuracy: 0.0


## Predictions
print out the labels with more that 40% chance to appeare

In [20]:
# After training, predict using a test book name
form_name = "afvigelse"
name_seq = tokenizer.texts_to_sequences([form_name])
name_padded = pad_sequences(name_seq, maxlen=max_name_len, padding='post')

predictions = model.predict(name_padded)
predicted_labels = (predictions > 0.4).astype(int)  # Apply a threshold to get binary predictions

# Convert indices back to capture names
predicted_labels = [index_to_label[idx] for idx in np.where(predicted_labels[0] == 1)[0]]

print(f"Predicted Captures for '{form_name}': {predicted_labels}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
Predicted Captures for 'afvigelse': ['nummer', 'oprettet', 'ansvarlig', 'oprettet af', 'projekt nr', 'beskrivelse af afvigelsen', 'afvigelsen skete den']
