In [43]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.stem.porter import PorterStemmer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

**Importing Data**

In [128]:

df = pd.read_csv('c:/Users/migle/Desktop/BPR/IPWFormAi/data/all_data.csv', on_bad_lines='skip')

#handle blocktype(removeit).
indices_to_drop = df[df['elementtype'] == 'block'].index
df = df.drop(indices_to_drop)

#put specialtype in place.
for index, row in df.iterrows():
    if row['elementtype'] == 'special' and pd.isnull(row['fieldtype']):  # Check if 'elementtype' is empty
        df.at[index, 'fieldtype'] = row['specialtype']  # Assign 'specialtype' value to 'elementtype'

df = df.drop(columns=['Kunde','fieldtype','customform','specialtype','elementtype', 'customname','fieldobjectid','fieldparentid','fieldparenttype','blocktype','fieldrelation','language'])
display(df)

Unnamed: 0,name,fieldlabel
0,Kundereklamation,Vælg kunde
2,Kundereklamation,Gadenavn
3,Kundereklamation,Postnr
4,Kundereklamation,By
5,Kundereklamation,Telefonnummer
...,...,...
10330,Intern IT Support,"Registrerer du tid, kan du bruge flg. opg.nr: ..."
10331,Azure - Udløb af client secret,Ansvarlig for opsætning
10333,Leverancer,Installationen oprettes på domænet xxxx.ipw.dk
10334,Leverancer,Installationen oprettes på domænet <b>xxxx.ipw...


**Tokenize form names**

In [129]:
form_name = df['name']
form_elements = df['fieldlabel']

In [130]:
grouped = df.groupby('name')['fieldlabel'].apply(list).reset_index()

# Step 1: Fit MultiLabelBinarizer on the entire dataset's field labels
mlb = MultiLabelBinarizer()
y_binary = mlb.fit_transform(grouped['fieldlabel'])

# Step 2: Flatten and tokenize the field labels
flattened_elements = [item for sublist in grouped['fieldlabel'] for item in sublist]
element_tokenizer = Tokenizer()
element_tokenizer.fit_on_texts(flattened_elements)

# Tokenize field labels for each form
element_sequences = [element_tokenizer.texts_to_sequences(elements) for elements in grouped['fieldlabel']]

# Step 3: Tokenize form names
form_tokenizer = Tokenizer()
form_tokenizer.fit_on_texts(grouped['name'])  # Tokenize form names
form_sequences = form_tokenizer.texts_to_sequences(grouped['name'])

# Step 4: Pad the sequences to a consistent length for form names
max_name_length = max(len(seq) for seq in form_sequences)
X_form_names = pad_sequences(form_sequences, maxlen=max_name_length, padding="post")

# Step 5: Find the maximum number of field labels (pad to the same number of labels)
max_field_labels = max(len(elements) for elements in grouped['fieldlabel'])
max_element_length = max(len(seq) for elements in element_sequences for seq in elements)

# Step 6: Pad the sequences of field labels for each form
X_form_elements = []

for elements in element_sequences:
    # Pad each form's field labels to the same number of labels and same sequence length
    padded_elements = pad_sequences(elements, maxlen=max_element_length, padding="post", truncating="post")
    # Ensure each form has the same number of field labels
    padded_elements = np.pad(padded_elements, ((0, max_field_labels - padded_elements.shape[0]), (0, 0)), mode='constant')
    X_form_elements.append(padded_elements)

# Convert to numpy array
X_form_elements = np.array(X_form_elements)

# Check final shapes
print("X_form_names shape:", X_form_names.shape)
print("X_form_elements shape:", X_form_elements.shape)
print(f"y_binary shape: {y_binary.shape}")

X_form_names shape: (837, 9)
X_form_elements shape: (837, 254, 48)
y_binary shape: (837, 4596)


X_form_names shape: (837, 9) --> 837 unique form names, and each form name has been tokenized and padded to a length of 9 tokens.

X_form_elements shape: (837, 254, 48) --> 837 forms, each with up to 254 field labels, and each field label sequence is padded to a length of 48 tokens.

In [131]:

# Flatten to fit the encoder, then reshape back after transformation
X_form_elements_flat = X_form_elements.reshape(-1, X_form_elements.shape[-1])
y_binary = mlb.fit_transform(X_form_elements_flat).reshape(X_form_elements.shape[0], X_form_elements.shape[1], -1)

# Pad X_form_names to match the sequence length of y_train (254 timesteps)
X_form_names_padded = pad_sequences(X_form_names, maxlen=254, padding="post")
X_form_names_padded = X_form_names_padded.squeeze()
# Reshape to ensure the input to the model matches the expected 3D shape


#Data split
X_train, X_test, y_train, y_test = train_test_split(X_form_names_padded, y_binary, test_size=0.2, random_state=42, shuffle=True)

print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")

X_train shape: (669, 254), X_test shape: (168, 254)
y_train shape: (669, 254, 4316), y_test shape: (168, 254, 4316)


**Define Model Architecture - LSTM** 

Firstly, we will use LSTM as it is known for capturing more complex relations in the dataset.

In [132]:
print(X_train.shape)
print(X_test.shape)

print(y_train.shape)
print(y_test.shape)

(669, 254)
(168, 254)
(669, 254, 4316)
(168, 254, 4316)


**Creating and training the RNN model**

Sequence to vector, where vector will then be used as input to Multioutput Classifier

In [133]:
y_train_flat = np.max(y_train, axis=1)  # Shape: (samples, 4316)
y_test_flat = np.max(y_test, axis=1)

In [134]:
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import BatchNormalization

# Define the RNN model with multiple outputs
def create_multi_output_rnn(input_dim, embedding_dim, input_length, output_dim):
    model = Sequential([
        Embedding(input_dim=input_dim, output_dim=embedding_dim, input_length=input_length, mask_zero=True),
        LSTM(16, return_sequences=False),
        BatchNormalization(),
        Dropout(0.4),
        Dense(output_dim, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Define model parameters
input_dim = len(element_tokenizer.word_index) + 1
embedding_dim = 100
input_length = 254
output_dim = y_train.shape[2]  # Dynamically set based on the binarized y_train

# Build the model
multi_output_rnn_model = create_multi_output_rnn(input_dim, embedding_dim, input_length, output_dim)

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)




In [135]:
# Train the model
multi_output_rnn_model.fit(
    X_train, y_train_flat,
    epochs=10,  
    batch_size=32,
    validation_data=(X_test, y_test_flat),
    shuffle=True,
    callbacks=[early_stopping, reduce_lr]
)

Epoch 1/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 55ms/step - accuracy: 0.0146 - loss: 0.6906 - val_accuracy: 1.0000 - val_loss: 0.6821 - learning_rate: 0.0010
Epoch 2/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - accuracy: 0.2310 - loss: 0.6794 - val_accuracy: 1.0000 - val_loss: 0.6680 - learning_rate: 0.0010
Epoch 3/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 47ms/step - accuracy: 0.6014 - loss: 0.6641 - val_accuracy: 1.0000 - val_loss: 0.6467 - learning_rate: 0.0010
Epoch 4/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step - accuracy: 0.7656 - loss: 0.6406 - val_accuracy: 1.0000 - val_loss: 0.6141 - learning_rate: 0.0010
Epoch 5/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - accuracy: 0.8795 - loss: 0.6016 - val_accuracy: 1.0000 - val_loss: 0.5678 - learning_rate: 0.0010
Epoch 6/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 

<keras.src.callbacks.history.History at 0x1e455a58690>

**Predictions**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
import numpy as np

# Preprocess the form name to prepare it as input for the RNN encoder
def preprocess_form_name(form_name):
    # Tokenize and pad the input form name
    tokenized_form_name = form_tokenizer.texts_to_sequences([form_name])
    padded_form_name = pad_sequences(tokenized_form_name, maxlen=254, padding="post")
    return padded_form_name  # Shape: (1, 254)

# Generate feature vectors from the RNN model for each form name
X_form_vectors = np.array([
    multi_output_rnn_model.predict(preprocess_form_name(name)).squeeze()
    for name in df['name']
])

# Reshape `y_train` and `y_test` to be 2D for MultiOutputClassifier
num_samples, num_timesteps, num_labels = y_train.shape
y_train_flat = y_train.reshape(num_samples, num_timesteps * num_labels)
y_test_flat = y_test.reshape(y_test.shape[0], y_test.shape[1] * y_test.shape[2])

# Initialize and train MultiOutputClassifier
multi_output_clf = MultiOutputClassifier(RandomForestClassifier())
multi_output_clf.fit(X_train, y_train_flat)

# Example of predicting on a single input
form_name_input = "kundereklamation"
processed_form_name = preprocess_form_name(form_name_input)
feature_vector = multi_output_rnn_model.predict(processed_form_name).squeeze()

# Make predictions with the MultiOutputClassifier
predicted_elements_binary = multi_output_clf.predict([feature_vector])

# Convert the binary output to readable labels
predicted_labels = mlb.inverse_transform(predicted_elements_binary)

# Display the predicted elements for the input form name
print(f"Predicted elements for form '{form_name_input}':")
for label in predicted_labels[0]:
    print(label)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19

**Checking if everything is working**

In [126]:
# Test preprocessing independently
form_name_input = "kundereklamation"
processed_form_name = preprocess_form_name(form_name_input)
print(f"Processed form name shape: {processed_form_name.shape}")
print(f"Processed form name content: {processed_form_name}")


Processed form name shape: (1, 254)
Processed form name content: [[97  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0]]


In [122]:
# Generate feature vector using the trained RNN encoder
form_feature_vector = multi_output_rnn_model.predict(processed_form_name)
print(f"Feature vector shape: {form_feature_vector.shape}")
print(f"Feature vector content (first 5 values): {form_feature_vector[0][:5]}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Feature vector shape: (1, 4316)
Feature vector content (first 5 values): [0.7750282  0.33031815 0.2771711  0.2946917  0.3008161 ]


In [123]:
# Predict binary presence values for each field label using MultiOutputClassifier
predicted_elements_binary = multi_output_clf.predict(form_feature_vector)
print(f"Predicted binary output shape: {predicted_elements_binary.shape}")
print(f"Predicted binary output content: {predicted_elements_binary[0]}")


Predicted binary output shape: (1, 2)
Predicted binary output content: [1 1]


In [124]:
# Verify field labels
field_labels = mlb.classes_
print(f"Field labels: {field_labels}")
print(f"Number of field labels: {len(field_labels)}")


Field labels: [0 1]
Number of field labels: 2


In [127]:
# Map each label to its binary prediction and display
print(f"Predicted elements for form '{form_name_input}':")
for label, is_present in zip(field_labels, predicted_elements_binary[0]):
    print(f"{label}: {'Present' if is_present == 1 else 'Not Present'}")


Predicted elements for form 'kundereklamation':
0: Present
1: Present
