Table of contents:
1. Settings
    1. data settings
    2. specify hyperparameter grid
2. Data preparation
3. Hyperparameter tuning
    1. CF
    2. CF+SR
4. Experiment
    1. CF
    2. CF+SR
5. Results
6. Write results to .txt file

In [33]:
import pandas as pd
import numpy as np
from myFunctions import read_data, prep
from myFunctions import metrics_sparse, create_initial_matrix, df_to_piv, piv_to_R, matrix_factorization, hyperpara_tuning, spearman, kendall, sim
from myFunctions import load_SM, calc_SM
from myFunctions import print_results, results_to_txt
from datetime import datetime
import random as rnd
import warnings
# Suppress every warning for the rest of the session. This keeps cell output compact,
# but also hides any warning raised by the libraries and helpers used below.
warnings.filterwarnings("ignore")

# Seed NumPy's global RNG and the stdlib `random` module so repeated runs are reproducible.
np.random.seed(0)
rnd.seed(0)

# 1. Settings

1.1 Data settings: dataset selection and train/validation/test split.

In [34]:
#Select input data: set the flag of the dataset to use to True.
toy_data = False    #contents make no sense - just to test code
dataset_1 = False   #not publicly available
dataset_2 = False   #not publicly available
dataset_3 = True

# The flags travel together as one positional list; the same list is later passed to
# load_SM / calc_SM. Presumably exactly one flag should be True -- verify in read_data.
datasets = [toy_data,dataset_1,dataset_2,dataset_3]
# read_data returns the data frame plus three covariate collections
# (going by the names: all / numeric / categorical personal covariates -- confirm in myFunctions).
df, cov_array_personal, cov_array_personal_num, cov_array_personal_cat = read_data(datasets)

In [35]:
#Train/val/test split:
# test_size: presumably the fraction of observations held out as test set -- confirm in prep.
test_size = 0.25
out_of_time = 1         #1 for out of time split. 0 otherwise
one_in_train_val = 1    #1 if each case needs at least 1 observation in train_val set. 0 otherwise

1.2 Hyperparameter settings: choose whether to tune, and specify the search grid.

In [36]:
#Tune hyperparameters. True: run full grid search (takes multiple hours). False: use optimal hyperparameters for dataset_3.
# Each flag guards one hyperpara_tuning call further down; when a flag is False the
# preset *_optimal_* values are used unchanged.
tune_hyperpara_CF = False     #if True: takes multiple hours
tune_hyperpara_CF_SR = False  #if True: takes multiple hours

In [37]:
#Similarity matrix: load or calculate. Calculating might take >1hr, depending on dataset used.
# Selects between the load_SM and calc_SM branches in the hyperparameter-tuning section.
load_similarity_matrix = True #True: load matrix. False: calculate and save matrix.

In [38]:
#Define hyperparameter grid:
# These candidate lists are passed to hyperpara_tuning when a tune_hyperpara_* flag is True,
# and are also handed to results_to_txt. The names match the matrix_factorization keywords
# lambda1 / beta / alpha / K / steps.
# Presumably: lambda = regularisation weight, beta = weight of the SR term, alpha = learning rate,
# L = number of latent factors, steps = iterations -- confirm in myFunctions.
lambda_array = [0.01,0.02,0.05]
beta_array = [0,0.05,0.1,0.2,0.5]
alpha_array = [0.0001,0.001]
L_array = [5,10,15]
steps_array = [500,1000,2000,3000]

In [39]:
# Default hyperparameters, used whenever the corresponding tuning flag is False.

# Optimal hyperparameters for CF on dataset_3
lambda_optimal_cf = 0.05
beta_optimal_cf = 0
alpha_optimal_cf = 0.001
L_optimal_cf = 10
steps_optimal_cf = 1000

# Optimal hyperparameters for CF_SR on dataset_3
lambda_optimal_cf_sr = 0.02
beta_optimal_cf_sr = 0.1
alpha_optimal_cf_sr = 0.001
L_optimal_cf_sr = 5
steps_optimal_cf_sr = 3000

# 2. Data preparation

In [40]:
#start time experiment:
# Captured before data preparation; later passed to results_to_txt.
start_time_experiment = datetime.now()

In [41]:
#From eventlog, select target columns
# NOTE(review): these three names are not referenced by any other visible cell --
# presumably leftovers or read elsewhere; verify before relying on or removing them.
values_log = 'objective'
index_log = ['case:concept:name']
columns_log = ['concept:name']

In [42]:
# Split the data into train+val / train / val / test frames.
# test_size comes from the settings cell instead of a repeated literal, so changing the
# setting really changes the split and stays in sync with what results_to_txt records.
# NOTE: this rebinds `df`; re-running only this cell feeds the already-prepared frame back into prep.
df, df_train_val, df_train, df_val, df_test = prep(df,test_size=test_size,out_of_time=out_of_time,one_in_train_val=one_in_train_val)

In [43]:
# Convert each split frame to its pivoted form; the index and columns of `piv` are later
# read as the employee and job labels.
piv, piv_train_val,piv_train,piv_val,piv_test = df_to_piv(df,df_train_val,df_train,df_val,df_test)

In [44]:
# Turn every pivot table into a ratings matrix R*. R_train feeds tuning and factorization,
# R_val feeds tuning, and R_test is used for the final evaluation.
R,R_train_val,R_train,R_val,R_test = piv_to_R(piv, piv_train_val,piv_train,piv_val,piv_test)

In [45]:
#Read users and item indices (to later reconstruct the ratings matrix)
empl = piv.index.values.tolist()    # row labels of the pivot table
jobs = piv.columns.values.tolist()  # column labels of the pivot table

#Details on dataset:
# Note: for a pandas DataFrame, `.size` is the number of cells (rows x columns), not the row count.
print(f'the size of the dataframe is: {df.size}')
print(f'amount of employees: {len(empl)}')
print(f'amount of jobs: {len(jobs)}')

the size of the dataframe is: 53600
amount of employees: 1238
amount of jobs: 209


In [46]:
#The vector on which similarities between employees are calculated
# (bare expression: displayed as the cell output; also passed to calc_SM and results_to_txt)
cov_array_personal

['V06', 'V08']

# 3. Hyperparameter tuning

In [47]:
# Dimensions of the ratings matrix, needed to initialise the factor matrices:
#   N = number of rows (employees), M = number of columns (jobs)
N, M = len(R), len(R[0])

In [48]:
# Obtain the similarity matrix: load a stored one when load_similarity_matrix is set,
# otherwise calculate (and save) it from the personal covariates.
similarity_matrix = (
    load_SM(datasets)
    if load_similarity_matrix
    else calc_SM(
        datasets,
        df,
        N,
        cov_array_personal,
        cov_array_personal_cat,
        cov_array_personal_num,
    )
)

In [49]:
# Grid search for plain CF (sr=0). Only runs when requested; the result then replaces
# the preset *_optimal_cf values.
if tune_hyperpara_CF:
    (lambda_optimal_cf,
     beta_optimal_cf,
     alpha_optimal_cf,
     L_optimal_cf,
     steps_optimal_cf) = hyperpara_tuning(
        R_train=R_train,
        R_val=R_val,
        sr=0,
        objective=1,
        sm=similarity_matrix,
        lambda_array=lambda_array,
        beta_array=beta_array,
        alpha_array=alpha_array,
        L_array=L_array,
        steps_array=steps_array,
    )

In [50]:
# Grid search for CF+SR (sr=1). Only runs when requested; the result then replaces
# the preset *_optimal_cf_sr values.
if tune_hyperpara_CF_SR:
    (lambda_optimal_cf_sr,
     beta_optimal_cf_sr,
     alpha_optimal_cf_sr,
     L_optimal_cf_sr,
     steps_optimal_cf_sr) = hyperpara_tuning(
        R_train=R_train,
        R_val=R_val,
        sr=1,
        objective=1,
        sm=similarity_matrix,
        lambda_array=lambda_array,
        beta_array=beta_array,
        alpha_array=alpha_array,
        L_array=L_array,
        steps_array=steps_array,
    )

In [51]:
# Report the hyperparameters the experiment cells will use.
# alpha uses the general format because a grid value such as 0.0001 would be shown as
# "0.000" with three fixed decimals; L and steps are counts, so they print without decimals.
hyperpara_report = "lambda: {:.3f} beta: {:.3f} alpha: {:g} L: {:.0f} steps: {:.0f}"

print("optimal hyperparameters for CF:")
print(hyperpara_report.format(lambda_optimal_cf, beta_optimal_cf, alpha_optimal_cf, L_optimal_cf, steps_optimal_cf))

print("optimal hyperparameters for CF_SR:")
print(hyperpara_report.format(lambda_optimal_cf_sr, beta_optimal_cf_sr, alpha_optimal_cf_sr, L_optimal_cf_sr, steps_optimal_cf_sr))

optimal hyperparameters for CF:
lambda: 0.050 beta: 0.000 alpha: 0.001 L: 10.000 steps: 1000.000
optimal hyperparameters for CF_SR:
lambda: 0.020 beta: 0.100 alpha: 0.001 L: 5.000 steps: 3000.000


# 4. Experiment

In [52]:
# Starting factor matrices for each method, sized by N, M and that method's own L value.
# They are passed as P and Q to matrix_factorization below.
P_ini_cf, Q_ini_cf = create_initial_matrix(N, M, L_optimal_cf)
P_ini_cf_sr, Q_ini_cf_sr = create_initial_matrix(N, M, L_optimal_cf_sr)

4.1 Method 1: CF

In [53]:
#runtime for dataset_3: +-10 minutes
# Method 1 (CF): factorise the training matrix with the SR option switched off,
# so no similarity matrix is supplied.
nP_cf, nQ_cf = matrix_factorization(
    R=R_train,
    P=P_ini_cf,
    Q=Q_ini_cf,
    K=L_optimal_cf,
    steps=steps_optimal_cf,
    alpha=alpha_optimal_cf,
    lambda1=lambda_optimal_cf,
    beta=beta_optimal_cf,
    SR=False,
    similarity_matrix=None)
# Predicted ratings matrix: product of the learned factors, P . Q^T.
nR_cf = np.dot(nP_cf, nQ_cf.T)

  0%|          | 0/1000 [00:00<?, ?it/s]

4.2 Method 2: CF with SR

In [54]:
#runtime for dataset_3: +-45 minutes
# Method 2 (CF+SR): same factorization as the CF run, but with the SR option enabled and
# the similarity matrix supplied. All arguments are passed by keyword, matching the CF call,
# so the two invocations can be compared line by line.
nP_cf_sr, nQ_cf_sr = matrix_factorization(
    R=R_train,
    P=P_ini_cf_sr,
    Q=Q_ini_cf_sr,
    K=L_optimal_cf_sr,
    steps=steps_optimal_cf_sr,
    alpha=alpha_optimal_cf_sr,
    lambda1=lambda_optimal_cf_sr,
    beta=beta_optimal_cf_sr,
    SR=True,
    similarity_matrix=similarity_matrix)
# Predicted ratings matrix: product of the learned factors, P . Q^T.
nR_cf_sr = np.dot(nP_cf_sr, nQ_cf_sr.T)

  0%|          | 0/3000 [00:00<?, ?it/s]

# 5. Results

In [55]:
# Score both models and two naive baselines against the held-out test ratings.
print('\n** pred CF  ** ')
print_results(nR_cf, R_test)

print('\n** pred CF+SR ** ')
print_results(nR_cf_sr, R_test)

# Baseline: uniformly random predictions in [0, 1).
print('\n** pred random ** ')
R_rand = np.random.rand(N, M)
print_results(R_rand, R_test)

# Baseline: the constant 0.5 everywhere. The recorded run shows nan for both rank
# correlations on this constant prediction.
print('\n** pred 0.5 ** ')
R_05 = np.full((N, M), 0.5)
print_results(R_05, R_test)



** pred CF  ** 
mae: 0.1542
rmse: 0.2831
spearman: 0.2318
kendall: 0.2112

** pred CF+SR ** 
mae: 0.1548
rmse: 0.2175
spearman: 0.5106
kendall: 0.4807

** pred random ** 
mae: 0.3088
rmse: 0.3711
spearman: -0.1166
kendall: -0.1099

** pred 0.5 ** 
mae: 0.1865
rmse: 0.2302
spearman: nan
kendall: nan


(0.1865, 0.2302, nan, nan)

# 6. Write results to .txt file

In [56]:
#Run cell to write a summary of results to Results/results_YYYY-mm-dd HH-MM-SS.txt
# NOTE(review): the CF hyperparameters are passed as (lambda, beta, L, alpha, steps) while the
# CF_SR ones are passed as (lambda, beta, alpha, L, steps) -- confirm against the signature of
# results_to_txt that this difference is intended.
results_to_txt(
    nR_cf, nR_cf_sr, R_test, start_time_experiment,
    toy_data, dataset_1, dataset_2, dataset_3,
    cov_array_personal,
    test_size, out_of_time, one_in_train_val,
    lambda_array, beta_array, alpha_array, L_array, steps_array,
    lambda_optimal_cf, beta_optimal_cf, L_optimal_cf, alpha_optimal_cf, steps_optimal_cf,
    lambda_optimal_cf_sr, beta_optimal_cf_sr, alpha_optimal_cf_sr, L_optimal_cf_sr, steps_optimal_cf_sr,
)