In [1]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.stem.porter import PorterStemmer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

**Importing Data**

In [2]:

df = pd.read_csv('c:/Users/migle/Desktop/BPR/IPWFormAi/data/all_data.csv', on_bad_lines='skip')

#handle blocktype(removeit).
indices_to_drop = df[df['elementtype'] == 'block'].index
df = df.drop(indices_to_drop)

#put specialtype in place.
for index, row in df.iterrows():
    if row['elementtype'] == 'special' and pd.isnull(row['fieldtype']):  # Check if 'elementtype' is empty
        df.at[index, 'fieldtype'] = row['specialtype']  # Assign 'specialtype' value to 'elementtype'

df = df.drop(columns=['Kunde','fieldtype','customform','specialtype','elementtype', 'customname','fieldobjectid','fieldparentid','fieldparenttype','blocktype','fieldrelation','language'])
display(df)

Unnamed: 0,name,fieldlabel
0,Kundereklamation,Vælg kunde
2,Kundereklamation,Gadenavn
3,Kundereklamation,Postnr
4,Kundereklamation,By
5,Kundereklamation,Telefonnummer
...,...,...
10330,Intern IT Support,"Registrerer du tid, kan du bruge flg. opg.nr: ..."
10331,Azure - Udløb af client secret,Ansvarlig for opsætning
10333,Leverancer,Installationen oprettes på domænet xxxx.ipw.dk
10334,Leverancer,Installationen oprettes på domænet <b>xxxx.ipw...


**Tokenize form names**

In [3]:
form_name = df['name']
form_elements = df['fieldlabel']

In [4]:
grouped = df.groupby('name')['fieldlabel'].apply(list).reset_index()

# Step 1: Fit MultiLabelBinarizer on the entire dataset's field labels
mlb = MultiLabelBinarizer()
y_binary = mlb.fit_transform(grouped['fieldlabel'])

# Step 2: Flatten and tokenize the field labels
flattened_elements = [item for sublist in grouped['fieldlabel'] for item in sublist]
element_tokenizer = Tokenizer()
element_tokenizer.fit_on_texts(flattened_elements)

# Tokenize field labels for each form
element_sequences = [element_tokenizer.texts_to_sequences(elements) for elements in grouped['fieldlabel']]

# Step 3: Tokenize form names
form_tokenizer = Tokenizer()
form_tokenizer.fit_on_texts(grouped['name'])  # Tokenize form names
form_sequences = form_tokenizer.texts_to_sequences(grouped['name'])

# Step 4: Pad the sequences to a consistent length for form names
max_name_length = max(len(seq) for seq in form_sequences)
X_form_names = pad_sequences(form_sequences, maxlen=max_name_length, padding="post")

# Step 5: Find the maximum number of field labels (pad to the same number of labels)
max_field_labels = max(len(elements) for elements in grouped['fieldlabel'])
max_element_length = max(len(seq) for elements in element_sequences for seq in elements)

# Step 6: Pad the sequences of field labels for each form
X_form_elements = []

for elements in element_sequences:
    # Pad each form's field labels to the same number of labels and same sequence length
    padded_elements = pad_sequences(elements, maxlen=max_element_length, padding="post", truncating="post")
    # Ensure each form has the same number of field labels
    padded_elements = np.pad(padded_elements, ((0, max_field_labels - padded_elements.shape[0]), (0, 0)), mode='constant')
    X_form_elements.append(padded_elements)

# Convert to numpy array
X_form_elements = np.array(X_form_elements)

# Check final shapes
print("X_form_names shape:", X_form_names.shape)
print("X_form_elements shape:", X_form_elements.shape)
print(f"y_binary shape: {y_binary.shape}")

X_form_names shape: (837, 9)
X_form_elements shape: (837, 254, 48)
y_binary shape: (837, 4596)


X_form_names shape: (837, 9) --> 837 unique form names, and each form name has been tokenized and padded to a length of 9 tokens.

X_form_elements shape: (837, 254, 48) --> 837 forms, each with up to 254 field labels, and each field label sequence is padded to a length of 48 tokens.

In [5]:

# Flatten to fit the encoder, then reshape back after transformation
X_form_elements_flat = X_form_elements.reshape(-1, X_form_elements.shape[-1])
y_binary = mlb.fit_transform(X_form_elements_flat).reshape(X_form_elements.shape[0], X_form_elements.shape[1], -1)

# Pad X_form_names to match the sequence length of y_train (254 timesteps)
X_form_names_padded = pad_sequences(X_form_names, maxlen=254, padding="post")
X_form_names_padded = X_form_names_padded.squeeze()
# Reshape to ensure the input to the model matches the expected 3D shape


#Data split
X_train, X_test, y_train, y_test = train_test_split(X_form_names_padded, y_binary, test_size=0.2, random_state=42, shuffle=True)

print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")

X_train shape: (669, 254), X_test shape: (168, 254)
y_train shape: (669, 254, 4316), y_test shape: (168, 254, 4316)


**Define Model Architecture - LSTM** 

Firstly, we will use LSTM as it is known for capturing more complex relations in the dataset.

In [6]:
print(X_train.shape)
print(X_test.shape)

print(y_train.shape)
print(y_test.shape)

(669, 254)
(168, 254)
(669, 254, 4316)
(168, 254, 4316)


**Creating and training the RNN model**

Sequence to vector, where vector will then be used as input to Multioutput Classifier

In [7]:
y_train_flat = np.max(y_train, axis=1)  # Shape: (samples, 4316)
y_test_flat = np.max(y_test, axis=1)

print(y_train_flat.shape)
print(y_test_flat.shape)



(669, 4316)
(168, 4316)


PCA was introduced to align the target dimensionality with the RNN output.

In [8]:
from sklearn.decomposition import PCA

pca = PCA(n_components=64)
y_train_pca = pca.fit_transform(y_train_flat)
y_test_pca = pca.transform(y_test_flat)
print(f"Explained variance ratio: {sum(pca.explained_variance_ratio_)}")

Explained variance ratio: 0.6668714629874055


We are using RNN for the purpose to output meaningful vector representations for the input sequences. These embeddings then will be used in Multioutput Classifier to perform binary classification.

In [9]:
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import BatchNormalization

# Define the RNN model with multiple outputs
def create_multi_output_rnn(input_dim, embedding_dim, input_length):
    model = Sequential([
        Embedding(input_dim=input_dim, output_dim=embedding_dim, input_length=input_length, mask_zero=True),
        LSTM(64, return_sequences=False, kernel_regularizer=l2(0.01)),
        Dropout(0.3),
        BatchNormalization()
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Define model parameters
input_dim = len(element_tokenizer.word_index) + 1
embedding_dim = 100
input_length = 254


# Build the model
multi_output_rnn_model = create_multi_output_rnn(input_dim, embedding_dim, input_length)

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-5)




In [10]:
# Train the model
multi_output_rnn_model.fit(
    X_train, y_train_pca,
    epochs=10,  
    batch_size=32,
    validation_data=(X_test, y_test_pca),
    shuffle=True,
    callbacks=[early_stopping, reduce_lr]
)

Epoch 1/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 65ms/step - accuracy: 0.0133 - loss: 1.4156 - val_accuracy: 0.0298 - val_loss: 0.9318 - learning_rate: 0.0010
Epoch 2/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 53ms/step - accuracy: 0.0187 - loss: 0.9700 - val_accuracy: 0.0119 - val_loss: 0.6443 - learning_rate: 0.0010
Epoch 3/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - accuracy: 0.0254 - loss: 0.6766 - val_accuracy: 0.0060 - val_loss: 0.4766 - learning_rate: 0.0010
Epoch 4/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - accuracy: 0.0298 - loss: 0.4864 - val_accuracy: 0.0060 - val_loss: 0.3916 - learning_rate: 0.0010
Epoch 5/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - accuracy: 0.0328 - loss: 0.3941 - val_accuracy: 0.0060 - val_loss: 0.3393 - learning_rate: 0.0010
Epoch 6/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 

<keras.src.callbacks.history.History at 0x28e2577dd10>

**Predictions**

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
import numpy as np

# Preprocess the form name to prepare it as input for the RNN encoder
def preprocess_form_name(form_name):
    # Tokenize and pad the input form name
    tokenized_form_name = form_tokenizer.texts_to_sequences([form_name])
    padded_form_name = pad_sequences(tokenized_form_name, maxlen=254, padding="post")
    return padded_form_name  # Shape: (1, 254)

# Generate feature vectors from the RNN model for each form name
X_form_vectors = np.array([
    multi_output_rnn_model.predict(preprocess_form_name(name)).squeeze()
    for name in df['name']
])

# Check shape of extracted embeddings
print(f"Shape of RNN embeddings: {X_form_vectors.shape}") 



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1

In [13]:
# Extract RNN embeddings only for training and testing splits
# Pass tokenized and padded data directly
X_train_vectors = multi_output_rnn_model.predict(X_train)
X_test_vectors = multi_output_rnn_model.predict(X_test)


# Verify shapes
print(f"X_train_vectors shape: {X_train_vectors.shape}")  # Should match (669, 64)
print(f"X_test_vectors shape: {X_test_vectors.shape}")    # Should match (168, 64)
print(f"y_train_flat shape: {y_train_flat.shape}")       # Should match (669, 4316)
print(f"y_test_flat shape: {y_test_flat.shape}")         # Should match (168, 4316)


[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
X_train_vectors shape: (669, 64)
X_test_vectors shape: (168, 64)
y_train_flat shape: (669, 4316)
y_test_flat shape: (168, 4316)


In [14]:
# Check for columns with only one unique value
constant_labels = [i for i in range(y_train_flat.shape[1]) if len(np.unique(y_train_flat[:, i])) == 1]

print(f"Constant labels (indices): {constant_labels}")
print(f"Number of constant labels: {len(constant_labels)}")

Constant labels (indices): [0, 76, 201, 291, 308, 312, 394, 403, 455, 456, 457, 458, 576, 583, 584, 649, 681, 682, 684, 685, 708, 785, 795, 796, 797, 810, 833, 834, 872, 873, 948, 952, 995, 996, 997, 1016, 1024, 1025, 1028, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1045, 1046, 1047, 1048, 1091, 1093, 1094, 1221, 1226, 1227, 1233, 1235, 1242, 1311, 1314, 1315, 1316, 1317, 1325, 1341, 1342, 1365, 1374, 1376, 1377, 1378, 1380, 1381, 1382, 1393, 1394, 1403, 1407, 1470, 1471, 1472, 1473, 1476, 1477, 1478, 1481, 1482, 1532, 1623, 1698, 1701, 1712, 1713, 1714, 1719, 1728, 1736, 1755, 1797, 1798, 1799, 1815, 1816, 1817, 1830, 1866, 1867, 1882, 1883, 1884, 1885, 1886, 1887, 1903, 1904, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1974, 1976, 2012, 2013, 2023, 2033, 2039, 2040, 2041, 2042, 2043, 2044, 2045, 2046, 2047, 2048, 2049, 2050, 2051, 2052, 2053, 2054, 2055, 2056, 2057, 2058, 2059, 2060, 2061, 2062, 2063, 2064, 2065, 2066, 2067, 2068, 2069, 2070, 2071, 20

The output confirms that there are 701 constant labels in our dataset, where each label has the same value (0 or 1) for all samples in y_train_flat. These labels cannot contribute to the learning process and should be removed from both the training and testing datasets.

In [15]:
# Remove constant labels from y_train_flat and y_test_flat
y_train_flat_filtered = np.delete(y_train_flat, constant_labels, axis=1)
y_test_flat_filtered = np.delete(y_test_flat, constant_labels, axis=1)

# Verify new shapes
print(f"New y_train_flat shape: {y_train_flat_filtered.shape}")  # Should have 4316 - 701 = 3615 labels
print(f"New y_test_flat shape: {y_test_flat_filtered.shape}")    # Same reduction


New y_train_flat shape: (669, 3615)
New y_test_flat shape: (168, 3615)


This approach ensures that your classifier handles only meaningful labels while maintaining compatibility with the original dataset structure

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train MultiOutputClassifier
multi_output_clf = MultiOutputClassifier(RandomForestClassifier(n_estimators=25, class_weight='balanced', random_state=42))
multi_output_clf.fit(X_train_vectors, y_train_flat_filtered)

# Evaluate on test data
test_score = multi_output_clf.score(X_test_vectors, y_test_flat_filtered)
print(f"Test Score: {test_score}")

Test Score: 0.047619047619047616


**Finetuning hyperparamters**

Lets start with RandomizedSearchCV first using LGBM classifier that is fast with large datasets and works efficiently with dense embeddings.

In [18]:
pip install catboost tqdm


Note: you may need to restart the kernel to use updated packages.


In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multioutput import MultiOutputClassifier
from tqdm import tqdm

# Define parameter distribution for RandomizedSearchCV
param_dist = {
    'estimator__n_estimators': [50, 100, 200],
    'estimator__max_depth': [-1, 10, 20],
    'estimator__learning_rate': [0.01, 0.1, 0.2],
    'estimator__num_leaves': [31, 50, 100],
    'estimator__subsample': [0.5, 0.7, 1.0],
    'estimator__colsample_bytree': [0.5, 0.7, 1.0],
}

# Initialize LightGBM model
lgbm_model = MultiOutputClassifier(LGBMClassifier(random_state=42, verbose=-1))

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=lgbm_model,
    param_distributions=param_dist,
    n_iter=50,  # Number of random combinations to try
    scoring='accuracy',
    cv=3,
    verbose=0,  # Turn off default verbosity
    random_state=42,
    n_jobs=-1  # Use all CPU cores for search
)

# Wrap RandomizedSearchCV with tqdm
def tqdm_search_cv(search, X, y):
    with tqdm(total=search.n_iter, desc="RandomizedSearchCV Progress") as pbar:
        search.fit(X, y)
        pbar.update(1)

# Run the search
tqdm_search_cv(random_search, X_train_vectors, y_train_flat_filtered)

# Display best parameters and score
print(f"Best parameters: {random_search.best_params_}")
print(f"Best cross-validated score: {random_search.best_score_}")


RandomizedSearchCV Progress:   0%|          | 0/50 [00:00<?, ?it/s]

**Testing**

In [None]:
def predict_field_labels(form_name):
    # Preprocess the form name
    form_vector = multi_output_rnn_model.predict(preprocess_form_name(form_name)).squeeze().reshape(1, -1)
    
    # Predict binary labels using the classifier
    predicted_labels = multi_output_clf.predict(form_vector)
    
    # Map binary predictions back to field labels
    predicted_fieldlabels = mlb.inverse_transform(predicted_labels)
    return predicted_fieldlabels

# Example usage
form_name = "Kundereklamation"
predicted_fieldlabels = predict_field_labels(form_name)
print(f"Predicted Field Labels: {predicted_fieldlabels}")
