In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

# Data Processing

This section outlines the steps involved in processing the data before further analysis or modeling.


1. General Data Processing.

2. Convert the following fields to lowercase for uniformity:
   - **Name**
   - **fieldlabel**

   This ensures that comparisons are case-insensitive.

## Grouping Forms

3. Group the forms based on:
   - **Form ID**(customform)
   - **name**


In [3]:

df = pd.read_csv('data/all_datav3.csv', delimiter=';', on_bad_lines='skip')
print(df)

#handle blocktype(removeit).
indices_to_drop = df[df['elementtype'] == 'block'].index
df = df.drop(indices_to_drop)

#put specialtype in place.
for index, row in df.iterrows():
    if row['elementtype'] == 'special' and pd.isnull(row['fieldtype']):  # Check if 'elementtype' is empty
        df.at[index, 'fieldtype'] = row['specialtype']  # Assign 'specialtype' value to 'elementtype'

df = df.drop(columns=['Kunde','fieldtype','specialtype','elementtype', 'customname','fieldobjectid','fieldparentid','fieldparenttype','blocktype','fieldrelation','language'])
df['fieldlabel'] = df['fieldlabel'].apply(str.lower)
df['name'] = df['name'].apply(str.lower)
data = df.groupby(['customform','name'])['fieldlabel'].apply(list).reset_index()
display(data)

            Kunde   customname  customform              name  fieldobjectid  \
0     abinventech      form876         876  Kundereklamation            877   
1     abinventech      form876         876  Kundereklamation            878   
2     abinventech      form876         876  Kundereklamation            879   
3     abinventech      form876         876  Kundereklamation            880   
4     abinventech      form876         876  Kundereklamation            881   
...           ...          ...         ...               ...            ...   
9446          ipw  form1683675     1683675   SI - Handleplan        1711069   
9447          ipw  form1683793     1683793  SI - Aktiviteter        1711096   
9448          ipw  form1683793     1683793  SI - Aktiviteter        1711098   
9449          ipw  form1683675     1683675   SI - Handleplan        1711129   
9450          ipw  form1683793     1683793  SI - Aktiviteter        1711192   

      fieldparentid    fieldparenttype elementtype 

Unnamed: 0,customform,name,fieldlabel
0,770,projektportefølje,"[projekt nr., oprettet den, oprettet af, proje..."
1,775,projektområde,[tekst]
2,876,claims,"[processor, description of claim, complaint ca..."
3,876,kundereklamation,"[vælg kunde, gadenavn, postnr, by, telefonnumm..."
4,876,reklamation,"[vælg kunde, e-mail, kontaktperson, behandles ..."
...,...,...,...
753,1683707,strategiske initiativer,"[tekst, mwb, id, periode]"
754,1683712,si - must win battles,[mwb]
755,1683736,si - perioder,[periode]
756,1683793,si - aktiviteter,"[aktivitet, deadline, status, superviserende l..."


In [4]:
#handle blocktype(removeit).
dfVal = pd.read_csv('all_data-validationCSV.csv', delimiter=';', on_bad_lines='skip')
indices_to_drop2 = dfVal[dfVal['elementtype'] == 'block'].index
dfVal = dfVal.drop(indices_to_drop2)

#put specialtype in place.
for index, row in dfVal.iterrows():
    if row['elementtype'] == 'special' and pd.isnull(row['fieldtype']):  # Check if 'elementtype' is empty
        dfVal.at[index, 'fieldtype'] = row['specialtype']  # Assign 'specialtype' value to 'elementtype'

dfVal = dfVal.drop(columns=['Kunde','fieldtype','specialtype','elementtype', 'customname','fieldobjectid','fieldparentid','fieldparenttype','blocktype','fieldrelation','language'])
dfVal['fieldlabel'] = dfVal['fieldlabel'].apply(str.lower)
dfVal['name'] = dfVal['name'].apply(str.lower)
dataVal = dfVal.groupby(['customform','name'])['fieldlabel'].apply(list).reset_index()
display(dataVal)
#data.to_excel('validatedAllData.xlsx', index=False)

Unnamed: 0,customform,name,fieldlabel
0,493061,oprettelse af ipw.dk-site,"[server, manuelt tildeling af server, anmoder,..."
1,695920,budgettal,"[debiteret support, kursusvirksomhed]"
2,829798,lead til salg,"[ansvarlig, lead via, status fra salg, lead op..."
3,854278,normtid pr uge,"[udvikling ikke planlagt, sum]"
4,894000,reference demo,[oprettet af]
...,...,...,...
125,2584010,formularkatalog,"[oprettet af, oprettet, blank linje, seneste ã..."
126,2584018,formularkatalog - kategori,[kategori]
127,2584251,formularkatalog - ã˜vrige vurderinger,"[hvad gã¸r formularen god?, vurderet af, vurde..."
128,2584255,formularkatalog - vurderingsskala,[vurdering]


# Prepare Data for Training

## Create a Mapping Between Label and Index

1. **Label to Index Mapping**:  
   Create a mapping between the **field label** and a unique index. This allows us to efficiently handle categorical data during training.

## Create Encoding for "Field Label"

2. **Field Label Encoding**:  
   Encode the **"fieldlabel"** by counting how many times each label appears within a form. This gives us a numeric representation of label frequency in the form.

## Create Embedding for "Form Name"

3. **Form Name Embedding**:  
   Create an embedding for the **"form name"** using Tokenizer This will convert the form name into a dense vector representation.

## Pad the Embedding

4. **Padding the Embedding**:  
   To ensure uniformity in input length, **pad the embeddings** so that all embeddings have the same length. This is necessary for feeding the data into a model.

## Split the Data for Testing

5. **Testing Data Split**:  
   Split the dataset, keeping the last **100 forms** aside for testing purposes. This will allow us to evaluate the performance of the model on unseen data.



In [5]:

# Get all unique labels
all_labels = list(set(label for labels in data['fieldlabel'] for label in labels))

# Map labels to indices
label_to_index = {label: i for i, label in enumerate(all_labels)}
index_to_label = {i: label for label, i in label_to_index.items()}

# Total number of examples
total_samples = len(data)

# Split index (last 100 for testing, adjust for your data size)
split_index = max(0, total_samples - 100)

# Training and test split
train_data = data.iloc[:split_index]  # First part for training
test_data = data.iloc[split_index:]  # Last 100 for testing

# Prepare training data
X_train_names = train_data['name'].tolist()
Y_train_labels = np.zeros((len(train_data), len(all_labels)), dtype=np.float32)

#set one-hot encoding for field labels
for i, labels in enumerate(train_data['fieldlabel']):
    Y_train_labels[i, [label_to_index[label] for label in labels]] = 1

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_names) 
X_train_sequences = tokenizer.texts_to_sequences(X_train_names)
max_name_len = max(len(seq) for seq in X_train_sequences)  
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_name_len, padding='post')


# Prepare test data
X_test_names = test_data['name'].tolist()
Y_test_labels = np.zeros((len(test_data), len(all_labels)), dtype=np.float32)

for i, labels in enumerate(test_data['fieldlabel']):
    for label in labels:
        Y_test_labels[i, label_to_index[label]] = 1

X_test_sequences = tokenizer.texts_to_sequences(X_test_names)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_name_len, padding='post')


## Model Architecture

The model is composed of two main components:

### 1. LSTM (Sequence to Vector)

- **Purpose**: The first part of the model uses an **LSTM** (Long Short-Term Memory) network. This component is designed to process the sequence of embedded form names and convert them into a vector representation.
  
- **Input**: The input to the LSTM is the **embedded form name**, which has been preprocessed and embedded into a vector format.
  
- **Output**: The output of the LSTM is a dense vector that represents the form, capturing the sequence of the form name and its contextual information.

### 2. Multi-output Classifier

- **Purpose**: The second part of the model is a **multi-output classifier**. This classifier predicts the probability of the appearance of each label in a given form.


In [6]:
from tensorflow.keras.callbacks import EarlyStopping

vocab_size = len(tokenizer.word_index) + 1  # Include padding
embedding_dim = 128  # Dimension of embedding space
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

name_input = Input(shape=(max_name_len,))

# Embedding layer with masking enabled
x = Embedding(vocab_size, embedding_dim, mask_zero=True)(name_input)

# LSTM layer for sequence-to-vector encoding
x = LSTM(32, return_sequences=False)(x)

# Output layer for multi-label classification
output = Dense(len(all_labels), activation='sigmoid')(x)

# Build the model
model = Model(name_input, output)

# Compile the model with an optimizer and loss function
model.compile(optimizer= optimizer, loss='mse', metrics=['accuracy'])

early_stopping = EarlyStopping(
    monitor='loss',
    patience=20,
    restore_best_weights=True
)

# Train the model
model.fit(X_train_padded, Y_train_labels, epochs=2000, batch_size=32, callbacks=[early_stopping])
#model.fit(X_train_padded, Y_train_labels, epochs=500, batch_size=32)


Epoch 1/2000
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.0000e+00 - loss: 0.2173
Epoch 2/2000
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 6.5560e-04 - loss: 0.0075
Epoch 3/2000
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0043 - loss: 0.0024   
Epoch 4/2000
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0018 - loss: 0.0025   
Epoch 5/2000
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0584 - loss: 0.0022
Epoch 6/2000
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 8.3673e-04 - loss: 0.0026
Epoch 7/2000
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0382 - loss: 0.0024   
Epoch 8/2000
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0569 - loss: 0.0022   
Epoch 9/2000
[1

<keras.src.callbacks.history.History at 0x16b11274350>

## Testing

In [7]:
test_loss, test_accuracy = model.evaluate(X_test_padded, Y_test_labels, batch_size=32)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")



[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.0000e+00 - loss: 0.0811 
Test Loss: 0.07724229246377945, Test Accuracy: 0.0


## Predictions
print out the labels with more that 40% chance to appeare

In [8]:
# After training, predict using a test book name
form_name = "Sikkerhed"
name_seq = tokenizer.texts_to_sequences([form_name])
name_padded = pad_sequences(name_seq, maxlen=max_name_len, padding='post')

predictions = model.predict(name_padded)
predicted_labels = (predictions > 0.80).astype(int)  # Apply a threshold to get binary predictions

# Convert indices back to capture names
predicted_labels = [index_to_label[idx] for idx in np.where(predicted_labels[0] == 1)[0]]

print(f"Predicted Captures for '{form_name}': {predicted_labels}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step
Predicted Captures for 'Sikkerhed': ['upload filer', 'er opfølgning nødvendig?', 'type', 'billede 03', 'oprettet af', 'det har jeg gjort', 'årsagsanalyse', 'vælg behandler', 'beskrivelse', 'ansvarlig', 'billeder', 'hvad skal gøres for, at risikoen for en ulykke fjernes?', 'nummer', 'billede 02', 'her og nu tiltag', 'årsag', 'afdeling for hændelsen']


In [9]:
# Convert indices back to capture names
testtest = [index_to_label[idx] for idx in X_test_padded.flatten()]
labeltest = [index_to_label[idx] for idx in Y_test_labels.flatten()]

maxPoints = 0;
overalPoints = 0;
negativePoints = 0;
negativePoints2 = 0;
fieldlabels = [];

# Grab form names of Validation Set
for row in dataVal.index:
    form_name = dataVal.loc[row, "name"]
    name_seq = tokenizer.texts_to_sequences([form_name])
    name_padded = pad_sequences(name_seq, maxlen=max_name_len, padding='post')
    predictions = model.predict(name_padded)
    predicted_labels = (predictions > 0.90).astype(int)  # Apply a threshold to get binary predictions
    # Convert indices back to capture names
    predicted_labels = [index_to_label[idx] for idx in np.where(predicted_labels[0] == 1)[0]]

    fieldlabels = dataVal.loc[row, "fieldlabel"]
    
    for predLabel in predicted_labels:
        if(predLabel in fieldlabels):
            overalPoints += 1;
        else:
            negativePoints += 1;
                
    for label in predicted_labels:
        negativePoints2 += 1;

print(overalPoints)
print(negativePoints)
print(negativePoints2)
precision = overalPoints / (overalPoints + negativePoints)
precision = precision * 100;
print(f"Precision:'{precision}'")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22

In [10]:
def SetupDataForPrediction(data):
    name_seq = tokenizer.texts_to_sequences([data])
    name_padded = pad_sequences(name_seq, maxlen=max_name_len, padding='post')
    return name_padded

In [11]:
!pip install flask
!pip install joblib
!pip install flask-cors

from flask import Flask, request, jsonify
import numpy as np
import joblib  # or pickle if you use pickle
from joblib import dump

dump(model, 'formModel.pkl')
from flask_cors import CORS




[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:


app = Flask(__name__)
CORS(app) 


# Load the model and print it
try:
    modelv2 = joblib.load("formModel.pkl")
    print("Model loaded successfully:", modelv2)
except Exception as e:
    print("Error loading model:", e)

@app.route('/predict', methods=['POST'])
def predict():
    try:
        print("Model loaded successfully:", modelv2)
        # Get input data from the request
        input_data = request.json['data']  # Assuming data comes in JSON format
        print(input_data)
        

        input_data = SetupDataForPrediction(input_data)
        try:
            prediction = modelv2.predict(input_data)
            print("Raw prediction output:", prediction)
        except Exception as e:
            print("Error during prediction:", str(e))
            import traceback
            print("Traceback:", traceback.format_exc())
            return jsonify({'error': 'Prediction failed. Check input data or model.'}), 500
            

        predicted_labels = (prediction > 0.2).astype(int)  # Apply a threshold to get binary predictions

        # Convert indices back to capture names
        predicted_labels = [index_to_label[idx] for idx in np.where(predicted_labels[0] == 1)[0]]
        
        print(f"Predicted Captures for '{form_name}': {predicted_labels}")
        
        # Return prediction as JSON
        return jsonify({'prediction': predicted_labels})
    except Exception as e:
        return jsonify({'error': str(e)})

Model loaded successfully: <Functional name=functional, built=True>


In [None]:
from werkzeug.serving import run_simple
# Use run_simple to run Flask
run_simple('127.0.0.1', 5000, app)

 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
