In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.neighbors import NearestNeighbors

In [2]:
#reading the data
data = pd.read_csv('data/all_data.csv', on_bad_lines='skip')
display(data)

Unnamed: 0,Kunde,customname,customform,name,fieldobjectid,fieldparentid,fieldparenttype,elementtype,fieldtype,blocktype,specialtype,fieldrelation,fieldlabel,language
0,abinventech,form876,876,Kundereklamation,877,878.0,customformelement,field,listselect,,,form2488,Vælg kunde,DA
1,abinventech,form876,876,Kundereklamation,878,,customform,block,,layout_fieldset,,,Kundeoplysninger,DA
2,abinventech,form876,876,Kundereklamation,879,878.0,customformelement,field,string,,,,Gadenavn,DA
3,abinventech,form876,876,Kundereklamation,880,878.0,customformelement,field,string,,,,Postnr,DA
4,abinventech,form876,876,Kundereklamation,881,878.0,customformelement,field,string,,,,By,DA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10331,ipw,form1927875,1927875,Azure - Udløb af client secret,2610308,2592044.0,customformelement,special,,,createdby,,Ansvarlig for opsætning,DA
10332,ipw,form1329139,1329139,Leverancer,2612534,,customform,block,,layout_fieldset,,,UDGÅET felter,DA
10333,ipw,form1329139,1329139,Leverancer,2612540,1361554.0,customformelement,special,,,layout_text,,Installationen oprettes på domænet xxxx.ipw.dk,DA
10334,ipw,form1329139,1329139,Leverancer,2612546,1361554.0,customformelement,special,,,layout_text,,Installationen oprettes på domænet <b>xxxx.ipw...,DA


# Data Processing


## Field Type Handling

1. Ensure that the correct field type values are taken from the data source (special).
2. Write these values into their corresponding `fieldtype` columns.

## Grouping Forms

- Group the forms based on their **ID** and **Name**.
- This allows for better organization and easier access to related data.

## Mapping Field Types and Labels

- After grouping, map each **fieldtype** with its corresponding **field label**.
- This mapping ensures that each field is appropriately identified and categorized for further use.


In [3]:
# Remove all elements of type block.(maybe late include it)
indices_to_drop = data[data['elementtype'] == 'block'].index

# Drop those indices
data = data.drop(indices_to_drop)

# If the element is special the actual fieldtype is in specialtype

data['isspecial'] = data['elementtype'].apply(lambda x: 1 if x == 'special' else 0)

for index, row in data.iterrows():
    if row['isspecial'] == 1:
        if pd.notnull(row['fieldtype']):  # Check if 'fieldtype' is already filled
            print(f"Row {index}: 'fieldtype' already has a value before assigning 'specialtype'")
        else:
            # If 'fieldtype' is empty, assign the value from 'specialtype'
            data.at[index, 'fieldtype'] = row['specialtype']

#Drop non-relevant columns.
data = data.drop(columns=['Kunde','isspecial','elementtype', 'customname','fieldobjectid','fieldparentid','fieldparenttype','blocktype','fieldrelation','language','specialtype'])

#Lets group the data by customform (id) and include name and mape the structure.
result = data.groupby(['customform', 'name']).agg({
    'fieldtype': lambda x: list(x),  # Keep all field types as a list
    'fieldlabel': lambda x: list(x)  # keep all field labels as a list
}).reset_index()

display(result)

Unnamed: 0,customform,name,fieldtype,fieldlabel
0,770,Projektportefølje,"[autonum, created, createdby, relation, string...","[Projekt nr., Oprettet den, Oprettet af, Proje..."
1,775,Projektområde,[string],[Tekst]
2,876,Claims,"[userrelation, text, relation, text, createdby...","[Processor, Description of claim, Complaint ca..."
3,876,Kundereklamation,"[listselect, string, string, string, string, s...","[Vælg kunde, Gadenavn, Postnr, By, Telefonnumm..."
4,876,Reklamation,"[listselect, string, string, userrelation, rel...","[Vælg kunde, E-mail, Kontaktperson, Behandles ..."
...,...,...,...,...
854,2584010,Formularkatalog,"[createdby, created, layout_linebreak, changed...","[Oprettet af, Oprettet, Blank linje, Seneste æ..."
855,2584018,Formularkatalog - Kategori,[string],[Kategori]
856,2584251,Formularkatalog - Øvrige vurderinger,"[text, userrelation, relation, text]","[Hvad gør formularen god?, Vurderet af, Vurder..."
857,2584255,Formularkatalog - Vurderingsskala,[integer],[Vurdering]


# Embeddings

In this section, we focus on converting form data into sentence embeddings and storing them.

## Converting Form to Sentence

1. Convert each form into a sentence that includes the following details:
   - **Form Name**
   - **Field Type**
   - **Field Label**
## Creating Embeddings

2. After generating the sentence, create an embedding for each one. This embedding represents the form as a vector for easier comparison and analysis.

3. Store these embeddings in a new column called `['combined_embedding']` for each form.


In [4]:
 #Combine the text into a single list for Word2Vec
all_texts = [result['name'].tolist()] + result['fieldtype'].tolist() + result['fieldlabel'].tolist()

# Flatten the list of lists to a single list of strings
all_texts = [str(item) for sublist in all_texts for item in sublist]

# Tokenize the text into lists of words
tokenized_texts = [text.split() for text in all_texts]

# Train Word2Vec model
model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)

# Create Embeddings
def get_embedding(text):
    # If the input is a list, join it to form a single string
    if isinstance(text, list):
        # Convert all elements to string and filter out None or NaN values
        text = ' '.join(str(item) for item in text if pd.notna(item))
    
    # Handle case where text might still be empty after filtering
    if not text:
        return [0] * model.vector_size

    # Tokenize the text
    words = text.split()
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    
    if len(word_vectors) == 0:  # In case no words are in the model
        return [0] * model.vector_size
    
    # Average the word vectors
    return np.mean(word_vectors, axis=0)

def combine_fields(row):
    # Combine the name with all elements from fieldtype and fieldlabel arrays
    fields = [row['name']] + list(row['fieldtype']) + list(row['fieldlabel'])
    
    # Join all values into a single string, ensuring they're all converted to strings
    return ' '.join(str(item) for item in fields if pd.notna(item))

# Apply the function to create a combined text column
result['combined_text'] = result.apply(combine_fields, axis=1)

# Generate embeddings for the combined text
result['combined_embedding'] = result['combined_text'].apply(get_embedding)
# Display the results
display(result[['name', 'fieldtype', 'fieldlabel','combined_embedding']])

Unnamed: 0,name,fieldtype,fieldlabel,combined_embedding
0,Projektportefølje,"[autonum, created, createdby, relation, string...","[Projekt nr., Oprettet den, Oprettet af, Proje...","[-0.020117719, 0.024822399, 0.008052918, 0.011..."
1,Projektområde,[string],[Tekst],"[0.0011589956, -0.00017239463, 0.0038565947, 0..."
2,Claims,"[userrelation, text, relation, text, createdby...","[Processor, Description of claim, Complaint ca...","[-0.021911759, 0.027647354, 0.006353144, 0.012..."
3,Kundereklamation,"[listselect, string, string, string, string, s...","[Vælg kunde, Gadenavn, Postnr, By, Telefonnumm...","[-0.03126676, 0.038801298, 0.01043382, 0.02038..."
4,Reklamation,"[listselect, string, string, userrelation, rel...","[Vælg kunde, E-mail, Kontaktperson, Behandles ...","[-0.035052627, 0.044346053, 0.01288192, 0.0216..."
...,...,...,...,...
854,Formularkatalog,"[createdby, created, layout_linebreak, changed...","[Oprettet af, Oprettet, Blank linje, Seneste æ...","[-0.045399576, 0.05695349, 0.015131215, 0.0266..."
855,Formularkatalog - Kategori,[string],[Kategori],"[-0.058864407, 0.07338592, 0.019110123, 0.0345..."
856,Formularkatalog - Øvrige vurderinger,"[text, userrelation, relation, text]","[Hvad gør formularen god?, Vurderet af, Vurder...","[-0.037632193, 0.04930217, 0.012702091, 0.0197..."
857,Formularkatalog - Vurderingsskala,[integer],[Vurdering],"[-0.056975443, 0.07267847, 0.01962966, 0.03325..."


# Model Training

we will focus on training a machine learning model using the embeddings created previously.

## Fitting the Combined Embedding to K-Nearest Neighbors (KNN)

1. **Fit the `combined_embedding`** data to a K-Nearest Neighbors (KNN) model. This model will allow us to identify the similarity between different forms based on their embeddings.

## Returning the Nearest Neighbors

3. Once the model is trained, use it to return the **nearest 5 neighbors** for any given form. This will help in identifying similar forms based on their embeddings.



In [6]:
knn = NearestNeighbors(n_neighbors=5)  # We want the 5 closest neighbor
knn.fit(np.vstack(result['combined_embedding'].values))

# Testing the Model

we will test the K-Nearest Neighbors (KNN) model using a new form and perform the necessary steps to analyze the results.

## Data Processing for New Form

1. **Input**: Provide the model with a new form consisting of:
   - **Form Name**
   - **3 Fields** (including field types and labels)

2. **Apply the same data processing** steps as done during training.

## Finding the Nearest Neighbors

3. Use the trained KNN model to find the **5 nearest neighbors** of the new form based on the `combined_embedding`.

## Filtering Common Fields

4. From the output, **filter out the common fields** between the new form and its nearest neighbors. This will help us focus on the unique aspects of the new form.

## Printing Uncommon Labels

5. Finally, print the **uncommon labels** from the nearest neighbors to highlight the differences. This will help us understand what makes the new form unique compared to the others.

In [13]:
# Example form to predict
new_name = "Sikkerhed"
fieldtypes = ['createdby', 'created', 'responsible']
labels = ['Oprettet af', 'Oprettelsesdato', 'Ansvarlig']

combined_text = ' '.join([new_name] + fieldtypes + labels)
cembedding =get_embedding(combined_text)
cembedding = cembedding.reshape(1, -1)


distances, indices = knn.kneighbors(cembedding)

closest_points = result.iloc[indices[0]]  

# Collect all field labels from the nearest neighbors
all_field_labels = [result.iloc[index]['fieldlabel'] for index in indices[0]]

# Flatten the list of field labels and normalize to lowercase
flattened_labels = [label.lower() for labels in all_field_labels for label in labels]

# Count the occurrences of each normalized field label
from collections import Counter
label_counts = Counter(flattened_labels)

# Separate common and uncommon labels
common_labels = [label for label, count in label_counts.items() if count > 3]
uncommon_labels = [label for label, count in label_counts.items() if count == 1]

print("Common Field Labels (case-insensitive):")
for label in common_labels:
    print(label)

print('-----------------------------------------------------')
print("\nUncommon Field Labels (case-insensitive):")
for label in uncommon_labels:
    print(label)

Common Field Labels (case-insensitive):
oprettet
oprettet af
ansvarlig
nummer
evt. relateret afvigelse
følgende udføres
deadline
vælg ansvarlig
hvad er udført
udført dato
verificeret dato
kommentar
evt. relateret reklamation
kundens fejlbeskrivelse
blank 15px
beskrivelse af afvigelsen
blank 20
send information til
vælg kunde
gadenavn
postnr
by
telefonnummer
e-mail
kontaktperson
kundens reference
behandles af
handling der førte til reklamationen
valgt løsning
omkostning, kr
årsag til reklamationen
varenummer
varenavn
vedhæft filer
navn
reklamation opstået hos
antal
enheder
batchnr
fejlkode
fejlkategori
korrigerende handling(er)
vælg behandler
bemærkning
det har jeg gjort
afvigelsen skete den
korrigerende handling
relateret til audit nr.
risikovurdering
er problemet tilstrækkeligt løst?
bemærkninger
-----------------------------------------------------

Uncommon Field Labels (case-insensitive):
løn prod. (kr)
tegnestuen timer
løn tegnestuen (kr)
godkendes af
godkendelse
titel
relateret k