In [1]:
!pip install numpy
!pip install pandas
!pip install tensorflow





[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

# Data Processing

This section outlines the steps involved in processing the data before further analysis or modeling.


1. General Data Processing.

2. Convert the following fields to lowercase for uniformity:
   - **Name**
   - **fieldlabel**

   This ensures that comparisons are case-insensitive.

## Grouping Forms

3. Group the forms based on:
   - **Form ID**(customform)
   - **name**


In [3]:

#df = pd.read_csv('data/all_data.csv', on_bad_lines='skip')
df = pd.read_csv('all_data.csv', on_bad_lines='skip')

#handle blocktype(removeit).
indices_to_drop = df[df['elementtype'] == 'block'].index
df = df.drop(indices_to_drop)

#put specialtype in place.
for index, row in df.iterrows():
    if row['elementtype'] == 'special' and pd.isnull(row['fieldtype']):  # Check if 'elementtype' is empty
        df.at[index, 'fieldtype'] = row['specialtype']  # Assign 'specialtype' value to 'elementtype'

df = df.drop(columns=['Kunde','fieldtype','specialtype','elementtype', 'customname','fieldobjectid','fieldparentid','fieldparenttype','blocktype','fieldrelation','language'])
df['fieldlabel'] = df['fieldlabel'].apply(str.lower)
df['name'] = df['name'].apply(str.lower)
data = df.groupby(['customform','name'])['fieldlabel'].apply(list).reset_index()
display(data)

Unnamed: 0,customform,name,fieldlabel
0,770,projektportefølje,"[projekt nr., oprettet den, oprettet af, proje..."
1,775,projektområde,[tekst]
2,876,claims,"[processor, description of claim, complaint ca..."
3,876,kundereklamation,"[vælg kunde, gadenavn, postnr, by, telefonnumm..."
4,876,reklamation,"[vælg kunde, e-mail, kontaktperson, behandles ..."
...,...,...,...
854,2584010,formularkatalog,"[oprettet af, oprettet, blank linje, seneste æ..."
855,2584018,formularkatalog - kategori,[kategori]
856,2584251,formularkatalog - øvrige vurderinger,"[hvad gør formularen god?, vurderet af, vurder..."
857,2584255,formularkatalog - vurderingsskala,[vurdering]


# Prepare Data for Training

## Create a Mapping Between Label and Index

1. **Label to Index Mapping**:  
   Create a mapping between the **field label** and a unique index. This allows us to efficiently handle categorical data during training.

## Create Encoding for "Field Label"

2. **Field Label Encoding**:  
   Encode the **"fieldlabel"** by counting how many times each label appears within a form. This gives us a numeric representation of label frequency in the form.

## Create Embedding for "Form Name"

3. **Form Name Embedding**:  
   Create an embedding for the **"form name"** using Tokenizer This will convert the form name into a dense vector representation.

## Pad the Embedding

4. **Padding the Embedding**:  
   To ensure uniformity in input length, **pad the embeddings** so that all embeddings have the same length. This is necessary for feeding the data into a model.

## Split the Data for Testing

5. **Testing Data Split**:  
   Split the dataset, keeping the last **100 forms** aside for testing purposes. This will allow us to evaluate the performance of the model on unseen data.



In [4]:

# Get all unique labels
all_labels = list(set(label for labels in data['fieldlabel'] for label in labels))

# Map labels to indices
label_to_index = {label: i for i, label in enumerate(all_labels)}
index_to_label = {i: label for label, i in label_to_index.items()}

# Total number of examples
total_samples = len(data)

# Split index (last 100 for testing, adjust for your data size)
split_index = max(0, total_samples - 100)

# Training and test split
train_data = data.iloc[:split_index]  # First part for training
test_data = data.iloc[split_index:]  # Last 100 for testing

# Prepare training data
X_train_names = train_data['name'].tolist()
Y_train_labels = np.zeros((len(train_data), len(all_labels)), dtype=np.float32)

#set one-hot encoding for field labels
for i, labels in enumerate(train_data['fieldlabel']):
    Y_train_labels[i, [label_to_index[label] for label in labels]] = 1

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_names) 
X_train_sequences = tokenizer.texts_to_sequences(X_train_names)
max_name_len = max(len(seq) for seq in X_train_sequences)  
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_name_len, padding='post')


# Prepare test data
X_test_names = test_data['name'].tolist()
Y_test_labels = np.zeros((len(test_data), len(all_labels)), dtype=np.float32)

for i, labels in enumerate(test_data['fieldlabel']):
    for label in labels:
        Y_test_labels[i, label_to_index[label]] = 1

X_test_sequences = tokenizer.texts_to_sequences(X_test_names)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_name_len, padding='post')


## Model Architecture

The model is composed of two main components:

### 1. LSTM (Sequence to Vector)

- **Purpose**: The first part of the model uses an **LSTM** (Long Short-Term Memory) network. This component is designed to process the sequence of embedded form names and convert them into a vector representation.
  
- **Input**: The input to the LSTM is the **embedded form name**, which has been preprocessed and embedded into a vector format.
  
- **Output**: The output of the LSTM is a dense vector that represents the form, capturing the sequence of the form name and its contextual information.

### 2. Multi-output Classifier

- **Purpose**: The second part of the model is a **multi-output classifier**. This classifier predicts the probability of the appearance of each label in a given form.


In [5]:
vocab_size = len(tokenizer.word_index) + 1  # Include padding
embedding_dim = 128  # Dimension of embedding space
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

name_input = Input(shape=(max_name_len,))

# Embedding layer with masking enabled
x = Embedding(vocab_size, embedding_dim, mask_zero=True)(name_input)

# LSTM layer for sequence-to-vector encoding
x = LSTM(32, return_sequences=False)(x)

# Output layer for multi-label classification
output = Dense(len(all_labels), activation='sigmoid')(x)

# Build the model
model = Model(name_input, output)

# Compile the model with an optimizer and loss function
model.compile(optimizer= optimizer, loss='mse', metrics=['accuracy'])

# Train the model
model.fit(X_train_padded, Y_train_labels, epochs=500, batch_size=32)


Epoch 1/500
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.0000e+00 - loss: 0.2119
Epoch 2/500
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0057 - loss: 0.0060   
Epoch 3/500
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0022 - loss: 0.0019   
Epoch 4/500
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0068 - loss: 0.0022   
Epoch 5/500
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0279 - loss: 0.0020
Epoch 6/500
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0435 - loss: 0.0021
Epoch 7/500
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0475 - loss: 0.0019   
Epoch 8/500
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0082 - loss: 0.0019   
Epoch 9/500
[1m24/24[0m [3

<keras.src.callbacks.history.History at 0x2c66d9e9750>

## Testing

In [6]:
test_loss, test_accuracy = model.evaluate(X_test_padded, Y_test_labels, batch_size=32)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.0398 - loss: 0.0685      
Test Loss: 0.06896641850471497, Test Accuracy: 0.05000000074505806


## Predictions
print out the labels with more that 40% chance to appeare

In [7]:
# After training, predict using a test book name
form_name = "Sikkerhed"
name_seq = tokenizer.texts_to_sequences([form_name])
name_padded = pad_sequences(name_seq, maxlen=max_name_len, padding='post')

predictions = model.predict(name_padded)
predicted_labels = (predictions > 0.51).astype(int)  # Apply a threshold to get binary predictions

# Convert indices back to capture names
predicted_labels = [index_to_label[idx] for idx in np.where(predicted_labels[0] == 1)[0]]

print(f"Predicted Captures for '{form_name}': {predicted_labels}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 157ms/step
Predicted Captures for 'Sikkerhed': ['kommentar', 'vælg behandler', 'billede 02', 'type', 'det har jeg gjort', 'nummer', 'afdeling for hændelsen', 'er opfølgning nødvendig?', 'ansvarlig', 'her og nu tiltag', 'årsagsanalyse', 'billede 03', 'oprettet af', 'hvad skal gøres for, at risikoen for en ulykke fjernes?', 'upload filer', 'beskrivelse', 'billeder']


In [8]:
def SetupDataForPrediction(data):
    name_seq = tokenizer.texts_to_sequences([data])
    name_padded = pad_sequences(name_seq, maxlen=max_name_len, padding='post')
    return name_padded

In [9]:
!pip install flask
!pip install joblib
!pip install flask-cors

from flask import Flask, request, jsonify
import numpy as np
import joblib  # or pickle if you use pickle
from joblib import dump

dump(model, 'formModel.pkl')
from flask_cors import CORS




[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip






[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip




In [10]:


app = Flask(__name__)
CORS(app) 


# Load the model and print it
try:
    modelv2 = joblib.load("formModel.pkl")
    print("Model loaded successfully:", modelv2)
except Exception as e:
    print("Error loading model:", e)

@app.route('/predict', methods=['POST'])
def predict():
    try:
        print("Model loaded successfully:", modelv2)
        #print("Raw request data:", request.data)
        # Get input data from the request
        input_data = request.json['data']  # Assuming data comes in JSON format
        print(input_data)
        
        # Prepare the data (ensure it's in the correct format for the model)
        #prepared_data = np.array(input_data).reshape(1, -1)  # Adjust as per your model's requirements

        input_data = SetupDataForPrediction(input_data)
        #print(input_data)
        # Predict using the model
        try:
            prediction = modelv2.predict(input_data)
            print("Raw prediction output:", prediction)
        except Exception as e:
            print("Error during prediction:", str(e))
            import traceback
            print("Traceback:", traceback.format_exc())
            return jsonify({'error': 'Prediction failed. Check input data or model.'}), 500
            
        #print("Raw prediction output:", prediction)

        predicted_labels = (prediction > 0.2).astype(int)  # Apply a threshold to get binary predictions

        # Convert indices back to capture names
        predicted_labels = [index_to_label[idx] for idx in np.where(predicted_labels[0] == 1)[0]]
        
        print(f"Predicted Captures for '{form_name}': {predicted_labels}")
        
        # Return prediction as JSON
        return jsonify({'prediction': predicted_labels})
    except Exception as e:
        return jsonify({'error': str(e)})

Model loaded successfully: <Functional name=functional, built=True>


In [None]:
from werkzeug.serving import run_simple
# Use run_simple to run Flask
run_simple('127.0.0.1', 5000, app)

 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [03/Dec/2024 14:53:57] "OPTIONS /predict HTTP/1.1" 200 -


Model loaded successfully: <Functional name=functional, built=True>
Afvigelse
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 157ms/step
Raw prediction output:

127.0.0.1 - - [03/Dec/2024 14:53:58] "POST /predict HTTP/1.1" 200 -


 [[1.0794083e-05 2.7221811e-03 7.4516784e-04 ... 4.4303341e-04
  4.6056928e-04 1.3020888e-04]]
Predicted Captures for 'Sikkerhed': ['hvor blev afvigelsen opdaget?', 'maskiner', 'leverandørnavn', 'risikovurdering (skjules)', 'sagsbehandler', 'break-even beregning', 'objectid', '8d rapporter', 'tilknyt filer', 'blank 15px', 'konklusion', 'opret root cause - udgået', 'deadline', 'vælg behandler', 'funktionær timer', 'konklusion årsagsanalyse', 'procedure', 'hvad kan vi lære heraf?', 'ansvarlig godkender', 'evt aktivitet', 'ansvarlig godkender (medarbejder - udgår)', 'vedhæft dokumentation', 'metode', 'årsagsanalyse vedlagt', 'sagsbehandling 700 kr', 'det har jeg gjort', 'afvigelsen skete den', 'nummer', 'id-nr.:', 'send orientering til', 'afdeling', 'bemærkninger', 'materialer (kr)', 'opfølgningsdato', 'angiv hvorfor korrigerende handling(er) ikke er nødvendigt.', 'bemærkning', 'vælg leverandør (anvendes når kunde er klar)', 'hvor opstod afvigelsen?', 'måling', 'afvigelseskategori', 'afvi