In [20]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score as sk_f1_score
from tensorflow.keras.models import clone_model

import pandas as pd
import tensorflow as tf
from tensorflow.keras import backend as K

In [21]:
# Path of the labelled RIASEC CSV, relative to the notebook's working directory.
FILE_PATH = 'data/riasec/labeled-data.csv'
df = pd.read_csv(FILE_PATH)

# Feature columns are named <letter><n>: items 1..8 for each letter of "RIASEC",
# giving 6 * 8 = 48 column names (R1..R8, I1..I8, ...).
feature_columns = [
    f'{letter}{item}'
    for letter in 'RIASEC'
    for item in range(1, 9)
]
x = df[feature_columns]

# One probability column per letter; each is binarised into a 0/1 target
# that is 1 when the probability is at least 0.2.
label_columns = [
    'R_Prob',
    'I_Prob',
    'A_Prob',
    'S_Prob',
    'E_Prob',
    'C_Prob',
]
y = df[label_columns].ge(0.2).astype(int)

In [None]:
# Fixed 80/20 hold-out split; random_state pins the shuffle so reruns match.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the (rows, columns) shape of every partition, one per line.
print(
    f"Data training shape\t: {x_train.shape}",
    f"Data testing shape\t: {x_test.shape}",
    f"Label training shape\t: {y_train.shape}",
    f"Label testing shape\t: {y_test.shape}",
    sep="\n",
)

Data training shape	: (108611, 48)
Data testing shape	: (27153, 48)
Label training shape	: (108611, 6)
Label testing shape	: (27153, 6)


In [None]:
def f1_score(y_true, y_pred):
    """Mean of per-label F1 scores for multi-label 0/1 predictions.

    Predictions are rounded to hard decisions, then true positives, false
    positives and false negatives are summed over axis 0 (one count per
    label column). Precision, recall and F1 are computed per label, and
    the mean over labels is returned.

    Args:
        y_true: Ground-truth binary indicators, one column per label.
        y_pred: Predicted scores in [0, 1] with the same shape as ``y_true``.

    Returns:
        Scalar tensor: the unweighted mean of the per-label F1 values.

    Note:
        NOTE(review): when passed to ``compile(metrics=[...])`` Keras is
        expected to evaluate this per batch and average the results, so the
        value shown during training approximates, but is not identical to,
        an F1 computed over the whole dataset -- confirm if that matters.
    """
    # Cast both inputs before any arithmetic. The previous version cast the
    # *product*, so integer labels multiplied by float predictions could fail
    # with a dtype mismatch when the function was called directly.
    y_true = K.cast(y_true, 'float32')
    y_pred_classes = K.round(K.cast(y_pred, 'float32'))

    tp = K.sum(y_true * y_pred_classes, axis=0)
    fp = K.sum((1 - y_true) * y_pred_classes, axis=0)
    fn = K.sum(y_true * (1 - y_pred_classes), axis=0)

    # K.epsilon() keeps every division defined when a label has no
    # positives (or no predicted positives) in the evaluated rows.
    precision = tp / (tp + fp + K.epsilon())
    recall = tp / (tp + fn + K.epsilon())
    f1 = 2 * (precision * recall) / (precision + recall + K.epsilon())
    return K.mean(f1)

In [None]:
def experiment_with_different_splits(x, y, model, lr, f1_score):
    """Train a fresh clone of ``model`` per learning rate and score each one.

    Despite the function name, the train/test split is the same for every
    run (80/20 with ``random_state=42``); the quantity being varied is the
    Adam learning rate.

    Args:
        x: Feature table handed to ``train_test_split``.
        y: Binary label table aligned row-for-row with ``x``.
        model: Keras model used as a template. It is cloned for every
            learning rate and is never compiled or fitted itself.
        lr: Iterable of learning rates to try.
        f1_score: Metric callable passed to ``compile`` for training-time
            reporting. The returned scores do not use it.

    Returns:
        dict mapping each learning rate to the weighted-average F1
        (scikit-learn) of the thresholded predictions on the held-out split.

    Note:
        NOTE(review): every rate is scored on the same held-out split, so
        picking the best rate from these numbers leaves no untouched test
        set for a final estimate.
    """
    # The split is deterministic (fixed random_state), so compute it once
    # instead of repeating identical work for every learning rate.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    results = {}
    for rate in lr:
        # Clone the template so each run is trained independently of the
        # others and the caller's model is left untouched.
        cloned_model = clone_model(model)
        cloned_model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=rate),
            loss='binary_crossentropy',
            metrics=[f1_score],
        )
        cloned_model.fit(x_train, y_train, epochs=100, batch_size=64, verbose=0)

        # Turn sigmoid outputs into hard 0/1 labels before scoring.
        y_pred = cloned_model.predict(x_test)
        y_pred_classes = (y_pred > 0.5).astype(int)
        results[rate] = sk_f1_score(y_test, y_pred_classes, average='weighted')
    return results

In [24]:
# Feed-forward multi-label classifier: 48 inputs -> 64 -> 32 -> 6 sigmoid outputs.
# Both hidden layers carry an L2 kernel penalty and are followed by dropout.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(48,)),
    tf.keras.layers.Dense(
        64,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.l2(0.01),
        name='dense_layer_1',
    ),
    tf.keras.layers.Dropout(0.5, name='dropout_layer_1'),
    tf.keras.layers.Dense(
        32,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.l2(0.01),
        name='dense_layer_2',
    ),
    tf.keras.layers.Dropout(0.3, name='dropout_layer_2'),
    # One independent sigmoid unit per label.
    tf.keras.layers.Dense(
        6,
        activation='sigmoid',
        name='output_layer',
    ),
])

In [28]:
# Despite its name, this list is passed as the `lr` argument below:
# the values are the learning rates to sweep, not split ratios.
split_ratios = [1e-6, 1e-5, 1e-4, 1e-3]
results = experiment_with_different_splits(
    x=x,
    y=y,
    model=model,
    lr=split_ratios,
    f1_score=f1_score,
)
print(results)

{1e-06: 0.8000203395459697, 1e-05: 0.932527103554112, 0.0001: 0.9864148015894287, 0.001: 0.9725614581307646}
