In [51]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import make_pipeline


**Step 1:** Load the data and reduce it to the form names (`name`) and field labels (`fieldlabel`)

In [3]:

# Load the raw export and reduce it to the two columns used below ('name', 'fieldlabel').
# NOTE(review): this is an absolute, machine-specific path; it is kept in a single
# constant so it can be repointed in one place.
DATA_PATH = 'c:/Users/migle/Desktop/BPR/IPWFormAi/data/all_data.csv'

# Malformed CSV lines are skipped instead of aborting the load.
df = pd.read_csv(DATA_PATH, on_bad_lines='skip')

# Handle block type: remove every row whose elementtype is 'block'.
df = df.drop(index=df.index[df['elementtype'] == 'block'])

# Put special type in place: for 'special' elements with a missing 'fieldtype',
# copy the value of 'specialtype' into 'fieldtype' (vectorized, no row loop).
needs_special_type = (df['elementtype'] == 'special') & df['fieldtype'].isna()
df['fieldtype'] = df['fieldtype'].mask(needs_special_type, df['specialtype'])

# NOTE(review): 'fieldtype' is dropped on the next statement, so the fill above does
# not influence the frame used downstream — confirm whether the column should be kept.
df = df.drop(columns=['Kunde','fieldtype','customform','specialtype','elementtype', 'customname','fieldobjectid','fieldparentid','fieldparenttype','blocktype','fieldrelation','language'])
display(df)

Unnamed: 0,name,fieldlabel
0,Kundereklamation,Vælg kunde
2,Kundereklamation,Gadenavn
3,Kundereklamation,Postnr
4,Kundereklamation,By
5,Kundereklamation,Telefonnummer
...,...,...
10330,Intern IT Support,"Registrerer du tid, kan du bruge flg. opg.nr: ..."
10331,Azure - Udløb af client secret,Ansvarlig for opsætning
10333,Leverancer,Installationen oprettes på domænet xxxx.ipw.dk
10334,Leverancer,Installationen oprettes på domænet <b>xxxx.ipw...


**Step 2:** As an additional preprocessing step, collect all field labels that belong to the same form name, so that each form has its labels together in one row

In [19]:

# Collect, for every form name, the list of field labels that appear under it.
labels_per_form = df.groupby('name')['fieldlabel'].apply(list)
grouped = labels_per_form.reset_index()

# Spread each list over its own columns; forms with fewer labels are padded
# with missing values by the DataFrame constructor.
expanded = pd.DataFrame(grouped['fieldlabel'].tolist())

# Human-readable headers for the spread-out label columns (1-based).
label_headers = [f'fieldlabel {i+1}' for i in range(expanded.shape[1])]

# Put the form name in front of its labels and apply the headers.
result = pd.concat([grouped['name'], expanded], axis=1)
result.columns = ['name', *label_headers]

display(result)

Unnamed: 0,name,fieldlabel 1,fieldlabel 2,fieldlabel 3,fieldlabel 4,fieldlabel 5,fieldlabel 6,fieldlabel 7,fieldlabel 8,fieldlabel 9,...,fieldlabel 245,fieldlabel 246,fieldlabel 247,fieldlabel 248,fieldlabel 249,fieldlabel 250,fieldlabel 251,fieldlabel 252,fieldlabel 253,fieldlabel 254
0,*_270224_OBT - Alarmering,Emne,Link,Læst og forstået,Ikke forstået,Instruktør,,,,,...,,,,,,,,,,
1,*_270224_OBT - Andet,Emne,Link,Ikke forstået,Læst og forstået,Instruktør,,,,,...,,,,,,,,,,
2,*_270224_OBT - Beredskab,Link,Emne,Instruktør,Ikke forstået,Læst og forstået,,,,,...,,,,,,,,,,
3,*_270224_OBT - Brandbekæmpelse,Link,Emne,Instruktør,Ikke forstået,Læst og forstået,,,,,...,,,,,,,,,,
4,*_270224_OBT - Førstehjælp og udstyr,Link,Emne,Instruktør,Ikke forstået,Læst og forstået,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
832,amnj-kundereklamation,Kunde navn,Gadenavn,Post nr.,By,Tlf. nr.,E-mail,Reklamations nr.,Kontaktperson,Vare nr.,...,,,,,,,,,,
833,amnj-leverandørreklamation,Reklamations nr.,Leverandørnavn,Gadenavn,Post nr.,By,Tlf. nr.,E-mail,Kontaktperson,Vare nr.,...,,,,,,,,,,
834,decimaltester,Input 1,Input 2,Output,Audit inspection score,,,,,,...,,,,,,,,,,
835,nj test create,opret,tjekboks,,,,,,,,...,,,,,,,,,,


In [23]:
# Build the multi-label target: one row per form name, one column per field label.
# crosstab counts how often each label occurs on each form; clip(upper=1) turns the
# counts into 0/1 indicators (1 = the label occurs on that form at least once).
y = pd.crosstab(df['name'], df['fieldlabel']).clip(upper=1)

# Resetting index to get 'Form name' as a column again
y = y.reset_index()

# Step 3: Define X (Form names)
X = y['name']  # Form names as input

# Step 4: Remove 'Form name' from y after defining X
y = y.drop(columns=['name'])  # Only the field labels remain in y



In [33]:
# Keep only the field-label columns that take more than one distinct value, so
# that 'stratify' can be used when splitting the data.
unique_classes_per_column = y.nunique()
varies_across_forms = unique_classes_per_column.gt(1).to_numpy()
columns_to_keep = unique_classes_per_column.index[varies_across_forms]
y_filtered = y.loc[:, columns_to_keep]


In [34]:
# Sanity check: inputs and targets must have the same number of rows.
print(X.shape, y_filtered.shape, sep="\n")

(837,)
(837, 4596)


In [36]:
# Give X and y_filtered the same fresh 0..n-1 index so their rows line up.
X, y_filtered = (part.reset_index(drop=True) for part in (X, y_filtered))

**Step 3:** Splitting our data into train and test.

In our case, stratified splitting (`stratify`) does not work, probably because the labels are imbalanced and many classes do not have enough samples. Without stratification it is harder to ensure that the train and test sets keep the same label proportions as the original dataset.

In [53]:
# Hold out 20% of the forms for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_filtered,
    test_size=0.2,
    random_state=42,
)

We use a random forest classifier to capture more complex relationships in the data, and a Bag of Words representation because our input data is text (form names).

In [54]:
# Define the multi-output classifier using BoW.
# CountVectorizer turns each form name into token counts; MultiOutputClassifier
# wraps the random forest so that one forest is fitted per field-label column.
# random_state pins the forests so the baseline metrics can be reproduced on
# a re-run (the tuned model further down is seeded the same way).
model = make_pipeline(
    CountVectorizer(),
    MultiOutputClassifier(RandomForestClassifier(random_state=42)),
)


When trying to fit the data into the model, it complains about the number of classes. Therefore, we are performing additional steps to ensure the correct number of unique classes across the dataset. 

In [55]:
# Find the label columns that hold a single distinct value within the training split.
distinct_values_in_train = y_train.nunique()
single_class_labels = y_train.columns[distinct_values_in_train.eq(1).to_numpy()]
print(f"Columns with only one class in y_train: {single_class_labels}")


Columns with only one class in y_train: Index(['&#128992; % REMARK', '1. Date completing self ass.', '1. Produkt Smag',
       '1. produkt udseende', '10. Number of permanent employees?',
       '10. Produkt Smag', '10. Produkt Udseende',
       '11. Number of temporary employees?', '11. Produkt Smag',
       '11. Produkt udseende',
       ...
       'Ændret d.', 'Ændringer gennemført', 'Økolog', 'Økologisk',
       'Økonomi ansl.', 'Økonomisk konsekvens',
       'Ønsker I yderligere undervisning',
       'Ønsker I, at deltage i en Erfa-gruppe',
       'Øvrige ressourcer fra Leverancen', 'Útfygt hevur'],
      dtype='object', name='fieldlabel', length=882)


In [79]:
# Drop the single-class label columns from both splits so train and test
# targets keep identical columns.
y_train_filtered, y_test_filtered = (
    split.drop(columns=single_class_labels) for split in (y_train, y_test)
)


**Step 4:** Fitting the data to the model and checking accuracy

In [57]:
# Train on the training split, then predict both splits (test first, then train)
# so the two can be compared for overfitting.
model.fit(X_train, y_train_filtered)
y_pred, y_pred_train = (model.predict(form_names) for form_names in (X_test, X_train))

In [58]:
# Accuracy on the held-out split and on the training split.
# NOTE(review): for multi-label targets scikit-learn documents accuracy_score as
# subset accuracy (every label of a form must match) — confirm this is the intended metric.
accuracy, accuracy1 = (
    accuracy_score(truth, predicted)
    for truth, predicted in ((y_test_filtered, y_pred), (y_train_filtered, y_pred_train))
)

print(f"Accuracy of test set: {accuracy:.2f}")
print(f"Accuracy of train set: {accuracy1:.2f}")

Accuracy of test set: 0.15
Accuracy of train set: 0.95


In [59]:

# Micro-averaged precision for the test and train splits.
precision_test = precision_score(y_test_filtered, y_pred, average='micro')
precision_train = precision_score(y_train_filtered, y_pred_train, average='micro')

# Micro-averaged recall for the test and train splits.
recall_test = recall_score(y_test_filtered, y_pred, average='micro')
recall_train = recall_score(y_train_filtered, y_pred_train, average='micro')

# Report the test split first, then the train split.
print(f"Precision of test set: {precision_test:.2f}")
print(f"Recall of test set: {recall_test:.2f}")
print(f"Precision of train set: {precision_train:.2f}")
print(f"Recall of train set: {recall_train:.2f}")


Precision of test set: 0.06
Recall of test set: 0.12
Precision of train set: 1.00
Recall of train set: 0.99


**Training set performance**

- Accuracy 0.95
- Precision 1 and recall 0.99 indicates that model identifies nearly all true positives

**Test set performance**

- Accuracy 0.15 - struggles to generalize to unseen data
- Precision 0.06 and recall 0.12 means that model is not correctly identifying true positives and is misclassifying most of the unseen data
- Difference between train and test set metrics suggests overfitting, where model memorized the training data but fails to adjust to the new examples.



**Step 5:** Finetuning the hyperparameters

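Using `class_weight='balanced'` to automatically adjust the class weights, which should improve the overall recall and precision for imbalanced classes. The number of trees, the maximum tree depth and the number of features per split are also set explicitly.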

In [None]:
# Tuned per-label estimator: 100 trees of depth <= 10, sqrt feature sampling,
# balanced class weights, and a fixed seed.
balanced_forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    max_features='sqrt',
    class_weight='balanced',
    random_state=42,
)

# Same Bag-of-Words pipeline as before, now wrapping the tuned forest.
model = make_pipeline(CountVectorizer(), MultiOutputClassifier(balanced_forest))

In [None]:
# Fit the tuned pipeline on the training data and predict the held-out forms
# (fit returns the fitted pipeline, so the calls can be chained).
y_pred = model.fit(X_train, y_train_filtered).predict(X_test)

# Predictions on the training forms, used to measure the train/test gap.
y_pred_train = model.predict(X_train)

In [None]:
# Accuracy of the tuned model, test split first and then train split.
accuracy_test = accuracy_score(y_test_filtered, y_pred)
print(f"Accuracy of test set: {accuracy_test:.2f}")

accuracy_train = accuracy_score(y_train_filtered, y_pred_train)
print(f"Accuracy of train set: {accuracy_train:.2f}")

In [None]:
# Evaluate the model - Precision and Recall
# Macro average: every field label contributes equally, however rarely it occurs.
# zero_division=0 scores labels without any predicted/true positives as 0 without warnings.
precision_test = precision_score(y_test_filtered, y_pred, average='macro', zero_division=0)
recall_test = recall_score(y_test_filtered, y_pred, average='macro', zero_division=0)

precision_train = precision_score(y_train_filtered, y_pred_train, average='macro', zero_division=0)
recall_train = recall_score(y_train_filtered, y_pred_train, average='macro', zero_division=0)

print(f"Precision of test set: {precision_test:.2f}")
print(f"Recall of test set: {recall_test:.2f}")
print(f"Precision of train set: {precision_train:.2f}")
print(f"Recall of train set: {recall_train:.2f}")

# The baseline model was scored with average='micro'. Report the micro-averaged
# scores here as well, so baseline and tuned model can be compared on the same metric.
precision_test_micro = precision_score(y_test_filtered, y_pred, average='micro', zero_division=0)
recall_test_micro = recall_score(y_test_filtered, y_pred, average='micro', zero_division=0)

precision_train_micro = precision_score(y_train_filtered, y_pred_train, average='micro', zero_division=0)
recall_train_micro = recall_score(y_train_filtered, y_pred_train, average='micro', zero_division=0)

print(f"Micro precision of test set: {precision_test_micro:.2f}")
print(f"Micro recall of test set: {recall_test_micro:.2f}")
print(f"Micro precision of train set: {precision_train_micro:.2f}")
print(f"Micro recall of train set: {recall_train_micro:.2f}")

**Test set results**

Accuracy of test set: 0.17

Precision of test set: 0.01

Recall of test set: 0.01

**Train set results**

Accuracy of train set: 0.62

Precision of train set: 0.64

Recall of train set: 0.60

 **Summary from finetuning**

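- The finetuning was intended to improve precision and recall; note that the values above are macro-averaged, while the baseline values were micro-averaged, so the two sets of numbers are not directly comparable (the baseline train precision/recall of 1.00/0.99 are numerically higher than the tuned 0.64/0.60)
- The class weighting is meant to help the model handle imbalances, but it's still not perfect
- Precision 0.64 on the train data tells us that, averaged over the field labels, about 64% of the predicted labels are correct, i.e. the model still produces a noticeable share of false positives
- Recall 0.6 tells us that the model correctly identified about 60% of the actual positive field labels (about 40% are false negatives)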

### **Summary of Experiment 1:**

- Used Bag of Words in a multi-output classifier combined with a Random Forest classifier to handle complex relationships in the dataset
- Balancing class weights and constraining the Random Forest hyperparameters reduced the gap between train and test scores (train accuracy dropped from 0.95 to 0.62), i.e. the model memorizes the training data less
- Despite the model performing reasonably on the train data after finetuning, it still fails on the test data in terms of all performance metrics