In [3]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from keras.models import Sequential   

In [4]:
df = pd.read_csv('data/all_data.csv', on_bad_lines='skip')
#handle blocktype(removeit).
indices_to_drop = df[df['elementtype'] == 'block'].index
df = df.drop(indices_to_drop)

#put specialtype in place.
for index, row in df.iterrows():
    if row['elementtype'] == 'special' and pd.isnull(row['fieldtype']):  # Check if 'elementtype' is empty
        df.at[index, 'fieldtype'] = row['specialtype']  # Assign 'specialtype' value to 'elementtype'

df = df.drop(columns=['Kunde','fieldtype','customform','specialtype','elementtype', 'customname','fieldobjectid','fieldparentid','fieldparenttype','blocktype','fieldrelation','language'])
display(df)

Unnamed: 0,name,fieldlabel
0,Kundereklamation,Vælg kunde
2,Kundereklamation,Gadenavn
3,Kundereklamation,Postnr
4,Kundereklamation,By
5,Kundereklamation,Telefonnummer
...,...,...
10330,Intern IT Support,"Registrerer du tid, kan du bruge flg. opg.nr: ..."
10331,Azure - Udløb af client secret,Ansvarlig for opsætning
10333,Leverancer,Installationen oprettes på domænet xxxx.ipw.dk
10334,Leverancer,Installationen oprettes på domænet <b>xxxx.ipw...


In [None]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras import backend as K

# Sample DataFrame (use your actual DataFrame here)
# Assuming you have a DataFrame 'df' with 'name' and 'fieldlabel' columns

# Step 1: Group field labels by name
df_grouped = df.groupby('name')['fieldlabel'].apply(list).reset_index()

# Step 2: Tokenize names (char-level tokenization)
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(df_grouped['name'])
sequences = tokenizer.texts_to_sequences(df_grouped['name'])

# Step 3: Pad sequences to ensure equal length
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Step 4: One-Hot Encode field labels using MultiLabelBinarizer
mlb = MultiLabelBinarizer()
encoded_labels = mlb.fit_transform(df_grouped['fieldlabel'])  # Multi-hot encoding
num_classes = encoded_labels.shape[1]  # Number of possible field labels

# Step 5: Build the model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_length))
model.add(Conv1D(256, 5, activation='relu'))  # Increase the number of filters
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(128, 3, activation='relu'))  # Adding an additional Conv1D layer
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation='relu',kernel_regularizer=regularizers.l2(0.01)))  # Add Dense layer
model.add(Dropout(0.5))  # Add Dropout to reduce overfitting
model.add(Dense(num_classes, activation='sigmoid'))  # Sigmoid for multi-label
# Sigmoid for multi-label classification

optimizer = Adam(learning_rate=0.002) 
# Step 6: Compile the model for multi-label classification

def f1_score(y_true, y_pred):
    # Cast both y_true and y_pred to float32
    y_true = K.cast(y_true, 'float32')
    y_pred = K.cast(y_pred, 'float32')

    def recall(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        return true_positives / (possible_positives + K.epsilon())

    def precision(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        return true_positives / (predicted_positives + K.epsilon())

    precision_value = precision(y_true, y_pred)
    recall_value = recall(y_true, y_pred)
    return 2 * ((precision_value * recall_value) / (precision_value + recall_value + K.epsilon()))


model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy', f1_score])

# Step 7: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, encoded_labels, test_size=0.2, random_state=42)

# Step 8: Train the model
model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test))

# Step 9: Evaluate the model on the test set
evaluation = model.evaluate(X_test, y_test)

# Print evaluation results
print(f'Test Loss: {evaluation[0]}, Test Accuracy: {evaluation[1]}')


Epoch 1/30




[1m 9/21[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m0s[0m 51ms/step - accuracy: 0.0026 - f1_score: 0.0039 - loss: 2.5728

In [20]:
def predict_name(name, model, tokenizer, encoder, max_length, threshold=0.5):
    # Tokenize and pad the input name
    sequence = tokenizer.texts_to_sequences([name])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post')
    
    # Predict using the trained mod0.018430382013320923el
    prediction = model.predict(padded_sequence)
    
    # Get the predicted class labels (apply a threshold for multi-label)
    predicted_labels = (prediction > threshold).astype(int)
    
    # Check if the prediction is all zeros (no class predicted)
    if not predicted_labels.any():
        return "No class predicted with the given threshold"
    
    # Decode the predicted labels back to the original field labels
    decoded_labels = encoder.inverse_transform(predicted_labels)
    
    return decoded_labels

# Example usage
predicted_output = predict_name("Kundereklamation", model, tokenizer, encoder, max_length)
print(f"Predicted fields for 'risiko': {predicted_output}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step
Predicted fields for 'risiko': No class predicted with the given threshold
