In [173]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import make_pipeline


In [174]:
data = pd.read_csv("c:/Users/migle/Desktop/BPR/IPWFormAi/data/preprocessed_data.csv")
df = pd.DataFrame(data)
display(df)

Unnamed: 0,customform,name,fieldtype,fieldlabel,total_elements
0,876,Kundereklamation,listselect,Vælg kunde,30.0
1,876,Kundereklamation,string,Gadenavn,37.0
2,876,Kundereklamation,string,Postnr,246.0
3,876,Kundereklamation,string,By,91.0
4,876,Kundereklamation,string,Telefonnummer,206.0
...,...,...,...,...,...
514,37240,[Leverandør-/kundekort] Certifikat / Kontrakt ...,date,Gældende til,2.0
515,37758,[Systemtabel] Dokumenttype,string,Dokumenttype,16.0
516,37218,Leverandør-/kundekort,combo,Leverandør status,24.0
517,33652,[Systemtabel] Evaluering - Bradley kurve,businessunit,Forretningsenhed,4.0


In [175]:

# Group by customform and collect field labels
grouped_field_labels = df.groupby('customform')['fieldlabel'].apply(list)

# Display all custom forms and their corresponding field labels
print(grouped_field_labels)


customform
876      [Vælg kunde, Gadenavn, Postnr, By, Telefonnumm...
1089     [Oprettet, Oprettet af, Fejlkode, Vælg behandl...
1098     [Fejlkategori, Fejlkode, Tekst, Fejlkode - kat...
1220     [Oprettet af, Oprettet, Prioritet - UDGÅET, Be...
1376     [Oprettet af, Oprettelsesdato, Auditdato, Lede...
1401     [Oprettet, Oprettet af, Nummer, Evt. relateret...
1554                                               [Tekst]
1560                                               [Tekst]
1562                                        [Tekst, Tekst]
2198     [Kunde navn, E-mail, Reklamations nr., Kontakt...
2216                                               [Tekst]
2322                                      [Sags nr., Dato]
2455                                        [Fejlkategori]
2488     [Kundenummer, Kundenavn, Adresse 1, Adresse 2,...
3055                                               [Tekst]
3067     [Reklamations nr., Gadenavn, E-mail, Vare nr.,...
3124     [Afvigelses rapport nr., Løbe nr., V

Bag of words with the preprocessed dataset

The input of the model is the form NAME, which should help us to identify the FIELDLABEL which should be associated with a given name of a form.

In [176]:

y = df['fieldlabel']
X = df['name']


In [177]:

def optimize_logistic_regression(X, y):

    vectorizer = CountVectorizer()
    X_bow = vectorizer.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)

    classifier = LogisticRegression(solver='liblinear')

    # Set the parameters for GridSearchCV
    param_grid = {
        'penalty': ['l1', 'l2'],  # Regularization types
        'C': np.logspace(-3, 3, 7),  # Regularization strength
        'max_iter': [20, 50, 100], # Maximum number of iterations
    }

    # Create the GridSearchCV object
    grid_search = GridSearchCV(estimator=classifier,
                               param_grid=param_grid,
                               scoring='accuracy',  
                               cv=5,  # Number of cross-validation folds
                               verbose=1,
                               n_jobs=-1)  # Use all available cores

    # Fit the model to find the best parameters
    grid_search.fit(X_train, y_train)

    # Get the best parameters and best score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best Cross-Validation Score: {best_score}")

    best_classifier = grid_search.best_estimator_
    
    y_pred_test = best_classifier.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred_test)

    print(f"Test Accuracy: {test_accuracy}")
    
    y_pred_train = best_classifier.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_pred_train)

    print(f"Train Accuracy: {train_accuracy}")

    return grid_search.best_estimator_


In [178]:
best_classifier = optimize_logistic_regression(X, y)

Fitting 5 folds for each of 42 candidates, totalling 210 fits




Best Parameters: {'C': 1.0, 'max_iter': 20, 'penalty': 'l1'}
Best Cross-Validation Score: 0.04337349397590362
Test Accuracy: 0.009615384615384616
Train Accuracy: 0.0819277108433735




Multi-output classifier using the bag of words as input to the classifier

In [181]:
df['fieldlabel'] = df['fieldlabel'].str.strip()

In [191]:
y = df.pivot_table(index='name', columns='fieldlabel', aggfunc='size', fill_value=0).astype(int)

X = df['name']

print("Shape of X:", X.shape)  # Should match the number of unique names
print("Shape of y:", y.shape)   # Should match the number of unique names as well

display(y)

Shape of X: (519,)
Shape of y: (64, 334)


fieldlabel,1. Forhindrer eller reducerer den styrende foranstaltning risikoen til et acceptabelt niveau?,2. Er denne foranstaltning nødvendig (der er ikke andre effektive foranstaltninger),"3. Foranstaltningen er en PRP, som skal minimere sandsynligheden og er nødt til at blive overvåget jævnligt?","4. Er foranstaltningen en PRP, der har direkte indflydelse på den væsentlige risikofaktor?",Adresse,Adresse 1,Adresse 2,Afd. nr.,Afdeling,Afdeling for hændelsen,...,Årsag,Årsag myndighedssanktioner,Årsag til fejlens opståen,Årsag til reklamation,Årsag til reklamationen,Årsagsanalyse,Ændring af dokumentation,Økonomi +,Økonomi -,Økonomi ansl.
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afvigelse,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Afvigelse - Fejlkategori,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Afvigelse - Fejlkode,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Afvigelse - opdaget,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Afvigelse - opstår,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
[Systemtabel] Rammer og vilkår - Interne/eksterne forhold og interessenter,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
amnj-afvigelsesblanket,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
amnj-fejltype,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
amnj-kundereklamation,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


As we were unable to fit the data in the model, there were some issues that arose with the data. In the code snippet above, we can see that not all form names contain a fieldlabel of such, which was not visible when viewing if there are any null/nan values in the dataset. 

In [183]:
# Filter the dataset to show rows where 'fieldlabel' is null/NaN
missing_fieldlabels = df[df['fieldlabel'].isnull()]

# Display the relevant columns: 'customform', 'name', 'fieldlabel'
print(missing_fieldlabels[['customform', 'name', 'fieldlabel']])

Empty DataFrame
Columns: [customform, name, fieldlabel]
Index: []


**The problem is that the model cannot be build yet before further data preprocessing. Theres 519 form names, but only 64 "fieldlabels". Wheres we would like to use the 334, but thats something to figure out for the next time too. Also, it seems to have an issue with special characters and spaces in the dataset that was not encoded or considered in general. This has huge consequences with the results of previous models and ability to proceed with current task.**

In [190]:

#Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)

ValueError: Found input variables with inconsistent numbers of samples: [519, 64]

In [116]:
# Create the pipeline
model = make_pipeline(CountVectorizer(), MultiOutputClassifier(LogisticRegression()))

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Check predictions
pred_df = pd.DataFrame(y_pred, columns=y.columns[1:])  # Skip the first column which is the name
print("Predictions on test set:")
print(pred_df)

#Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

TypeError: '<' not supported between instances of 'str' and 'int'