# Neural Network pointwise
This model is created based on the following [post](https://www.tensorflow.org/tutorials/load_data/csv#mixed_data_types)

In [27]:
import pandas as pd
import numpy as np
from src import configuration as config
from src.pipeline.evaluation.evaluation_utils import custom_train_test_split
import tensorflow as tf
import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs
import tensorflow_datasets as tfds
from tensorflow.keras import layers

In [28]:
# load the data
df = config.load_traindata_for_pointwise()
df = df.drop(columns=['cv_score'])
# use custom train test split
X_train, X_test, y_train, y_test = custom_train_test_split(df, factors=["dataset", "model", "tuning", "scoring"], target="rank")
df = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)
print(df.dtypes)
df.head()

dataset      int64
model       object
tuning      object
scoring     object
encoder     object
rank       float64
dtype: object


Unnamed: 0,dataset,model,tuning,scoring,encoder,rank
0,1114,KNC,no,F1,BE,21.0
1,1114,KNC,no,F1,BUCV10RGLMME,19.0
2,1114,KNC,no,F1,BUCV10TE,26.0
3,1114,KNC,no,F1,BUCV2RGLMME,12.0
4,1114,KNC,no,F1,BUCV2TE,28.0


In [29]:
def create_encoder_rankings(df):
    # Group the DataFrame by 'dataset', 'model', 'tuning', and 'scoring' columns
    grouped_df = df.groupby(['dataset', 'model', 'tuning', 'scoring'])
    
    # Create a new DataFrame to store the results
    new_df = pd.DataFrame(columns=['dataset', 'model', 'tuning', 'scoring', 'encoder_rankings'])
    
    for group_keys, group_data in grouped_df:
        dataset, model, tuning, scoring = group_keys
        encoder_rankings = group_data.sort_values('rank', ascending=False)['encoder'].tolist()
        rankings = group_data.sort_values('rank', ascending=False)['rank'].tolist()
        new_row = {'dataset': dataset, 'model': model, 'tuning': tuning, 'scoring': scoring,
                   'encoder_rankings': [encoder_rankings], 'ranking': [rankings]}
        new_df = pd.concat([new_df, pd.DataFrame([new_row])], ignore_index=True)
    
    return new_df

df_listwise = create_encoder_rankings(df)
df_listwise.head()

Unnamed: 0,dataset,model,tuning,scoring,encoder_rankings,ranking
0,3,DTC,full,ACC,"[[DE, CBE, PBTE01, BE, OE, ME01E, ME10E, ME1E,...","[[4.0, 3.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,..."
1,3,DTC,full,F1,"[[CBE, DE, PBTE01, BE, OE, ME01E, ME10E, ME1E,...","[[4.0, 3.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,..."
2,3,DTC,model,AUC,"[[DE, CBE, PBTE01, CV2TE, CV2RGLMME, CV5RGLMME...","[[25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, 18..."
3,3,DTC,model,F1,"[[CBE, DE, PBTE01, CV2TE, CV2RGLMME, CV5RGLMME...","[[25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, 18..."
4,3,KNC,model,AUC,"[[DE, CBE, CE, SE, OE, BE, OHE, MHE, PBTE01, C...","[[29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22..."


In [30]:
features = df.copy()
features['dataset'] = features['dataset'].astype(int)
labels = features.pop('rank')
features = features[['dataset', 'model', 'tuning', 'scoring']]
print(f"Datatypes: \n{features.dtypes}\n")

inputs = {}

for name, column in features.items():
  dtype = column.dtype
  if dtype == object:
    dtype = tf.string
  else:
    dtype = tf.float32

  inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=dtype)

inputs

Datatypes: 
dataset     int32
model      object
tuning     object
scoring    object
dtype: object



{'dataset': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'dataset')>,
 'model': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'model')>,
 'tuning': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'tuning')>,
 'scoring': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'scoring')>}

In [31]:
# The first step in your preprocessing logic is to concatenate the numeric inputs together, and run them through a normalization layer:
numeric_inputs = {name:input for name,input in inputs.items()
                  if input.dtype==tf.float32}
print(numeric_inputs)
x = layers.Concatenate()(list(numeric_inputs.values()))
norm = layers.Normalization()
norm.adapt(np.array(features[numeric_inputs.keys()]))
all_numeric_inputs = norm(x)

all_numeric_inputs

{'dataset': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'dataset')>}


<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'normalization_2')>

In [32]:
# For the string inputs use the tf.keras.layers.StringLookup function to map from strings to integer indices in a vocabulary. Next, use tf.keras.layers.CategoryEncoding to convert the indexes into float32 data appropriate for the model.
preprocessed_inputs = [all_numeric_inputs]
for name, input in inputs.items():
  if input.dtype == tf.float32:
    continue

  lookup = layers.StringLookup(vocabulary=np.unique(features[name]))
  one_hot = layers.CategoryEncoding(num_tokens=lookup.vocabulary_size())

  x = lookup(input)
  x = one_hot(x)
  preprocessed_inputs.append(x)

In [33]:
preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs)

preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat)

tf.keras.utils.plot_model(model = preprocessing , rankdir="LR", dpi=72, show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [34]:
tmp_features_dict = {name: np.array(value) 
                         for name, value in features.items()}

# Slice out the first training example and pass it to this preprocessing model, you see the numeric features and string one-hots all concatenated together:
features_dict = {name:values[:1] for name, values in tmp_features_dict.items()}
preprocessing(features_dict)

<tf.Tensor: shape=(1, 15), dtype=float32, numpy=
array([[-0.8266589,  0.       ,  0.       ,  1.       ,  0.       ,
         0.       ,  0.       ,  0.       ,  0.       ,  0.       ,
         1.       ,  0.       ,  0.       ,  0.       ,  1.       ]],
      dtype=float32)>

In [35]:
def pointwise_model(preprocessing_head, inputs):
  body = tf.keras.Sequential([
    layers.Dense(64),
    layers.Dense(1)
  ])

  preprocessed_inputs = preprocessing_head(inputs)
  result = body(preprocessed_inputs)
  model = tf.keras.Model(inputs, result)

  model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                optimizer=tf.keras.optimizers.Adam())
  return model

pointwise_model = pointwise_model(preprocessing, inputs)

In [36]:
pointwise_model.fit(x=tmp_features_dict, y=labels, epochs=10)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x24975ff78d0>

In [39]:
from src.pipeline.evaluation import evaluation_utils as er

factors = ["dataset", "model", "tuning", "scoring"]
new_index = "encoder"
# Test the model with the df_test data
body = tf.keras.Sequential([
    layers.Dense(64),
    layers.Dense(1)
  ])

preprocessed_inputs = preprocessing(df_test)
result = body(preprocessed_inputs)
y_pred = pointwise_model.predict(result)

df_pred = pd.concat([X_test, y_test, y_pred], axis=1)
# ---- convert to rankings and evaluate
rankings_test = er.get_rankings(
    df_pred,
    factors=factors,
    new_index=new_index,
    target="cv_score"
)
rankings_pred = er.get_rankings(
    df_pred,
    factors=factors,
    new_index=new_index,
    target="cv_score_pred"
)
score = er.average_spearman(
    rankings_test,
    rankings_pred
)

ValueError: Layer "model_4" expects 4 input(s), but it received 1 input tensors. Inputs received: [      dataset  model tuning scoring       encoder  rank
0       43098    KNC   full     AUC            BE  25.0
1       43098    KNC   full     AUC  BUCV10RGLMME  19.0
2       43098    KNC   full     AUC      BUCV10TE  21.0
3       43098    KNC   full     AUC   BUCV2RGLMME  11.0
4       43098    KNC   full     AUC       BUCV2TE   0.0
...       ...    ...    ...     ...           ...   ...
9060    43897  LGBMC     no     ACC        PBTE01   0.0
9061    43897  LGBMC     no     ACC        RGLMME   0.0
9062    43897  LGBMC     no     ACC            SE   0.0
9063    43897  LGBMC     no     ACC            TE   0.0
9064    43897  LGBMC     no     ACC          WOEE   0.0

[9065 rows x 6 columns]]

In [None]:
# Since the preprocessing is part of the model, you can save the model and reload it somewhere else and get identical results:
pointwise_model.save('pointwise_model')

INFO:tensorflow:Assets written to: pointwise_model\assets


INFO:tensorflow:Assets written to: pointwise_model\assets


In [22]:
reloaded = tf.keras.models.load_model('pointwise_model')

ValueError: Exception encountered when calling layer "concatenate_2" (type Concatenate).

A merge layer should be called on a list of inputs. Received: inputs=Tensor("Placeholder:0", shape=(None, 1), dtype=float32) (not a list of tensors)

Call arguments received by layer "concatenate_2" (type Concatenate):
  • inputs=tf.Tensor(shape=(None, 1), dtype=float32)