In [13]:
import pandas as pd
import numpy as np
from src import configuration as config
import tensorflow as tf
import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs
import tensorflow_datasets as tfds
from tensorflow.keras import layers

In [14]:
# load the data
df = config.load_traindata_for_pointwise()
df = df.drop(columns=['cv_score'])
print(df.dtypes)
df.head()

dataset      int64
model       object
tuning      object
scoring     object
encoder     object
rank       float64
dtype: object


Unnamed: 0,dataset,model,tuning,scoring,encoder,rank
0,1169,KNC,model,ACC,BUCV2RGLMME,16.0
1,1169,KNC,model,ACC,BUCV2TE,14.0
2,1169,KNC,model,ACC,CBE,22.0
3,1169,KNC,model,ACC,CE,23.0
4,1169,KNC,model,ACC,CV10RGLMME,7.0


In [15]:
def create_encoder_rankings(df):
    # Group the DataFrame by 'dataset', 'model', 'tuning', and 'scoring' columns
    grouped_df = df.groupby(['dataset', 'model', 'tuning', 'scoring'])
    
    # Create a new DataFrame to store the results
    new_df = pd.DataFrame(columns=['dataset', 'model', 'tuning', 'scoring', 'encoder_rankings'])
    
    for group_keys, group_data in grouped_df:
        dataset, model, tuning, scoring = group_keys
        encoder_rankings = group_data.sort_values('rank', ascending=False)['encoder'].tolist()
        rankings = group_data.sort_values('rank', ascending=False)['rank'].tolist()
        new_row = {'dataset': dataset, 'model': model, 'tuning': tuning, 'scoring': scoring,
                   'encoder_rankings': [encoder_rankings], 'ranking': [rankings]}
        new_df = pd.concat([new_df, pd.DataFrame([new_row])], ignore_index=True)
    
    return new_df

df_listwise = create_encoder_rankings(df)
df_listwise.head()

Unnamed: 0,dataset,model,tuning,scoring,encoder_rankings,ranking
0,3,DTC,full,ACC,"[[DE, CBE, PBTE01, BE, OE, ME01E, ME10E, ME1E,...","[[4.0, 3.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,..."
1,3,DTC,full,AUC,"[[DE, CBE, PBTE01, BE, OE, ME01E, ME10E, ME1E,...","[[4.0, 3.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,..."
2,3,DTC,full,F1,"[[CBE, DE, PBTE01, BE, OE, ME01E, ME10E, ME1E,...","[[4.0, 3.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,..."
3,3,DTC,model,AUC,"[[DE, CBE, PBTE01, CV2TE, CV2RGLMME, CV5RGLMME...","[[25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, 18..."
4,3,DTC,model,F1,"[[CBE, DE, PBTE01, CV2TE, CV2RGLMME, CV5RGLMME...","[[25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, 18..."


In [16]:
features = df.copy()
features['dataset'] = features['dataset'].astype(int)
labels = features.pop('rank')
features = features[['dataset', 'model', 'tuning', 'scoring']]
print(f"Datatypes: \n{features.dtypes}\n")

inputs = {}

for name, column in features.items():
  dtype = column.dtype
  if dtype == object:
    dtype = tf.string
  else:
    dtype = tf.float32

  inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=dtype)

inputs

Datatypes: 
dataset     int32
model      object
tuning     object
scoring    object
dtype: object



{'dataset': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'dataset')>,
 'model': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'model')>,
 'tuning': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'tuning')>,
 'scoring': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'scoring')>}

In [17]:
# The first step in your preprocessing logic is to concatenate the numeric inputs together, and run them through a normalization layer:
numeric_inputs = {name:input for name,input in inputs.items()
                  if input.dtype==tf.float32}
print(numeric_inputs)
x = layers.Concatenate()(list(numeric_inputs.values()))
norm = layers.Normalization()
norm.adapt(np.array(features[numeric_inputs.keys()]))
all_numeric_inputs = norm(x)

all_numeric_inputs

{'dataset': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'dataset')>}


<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'normalization_1')>

In [18]:
# For the string inputs use the tf.keras.layers.StringLookup function to map from strings to integer indices in a vocabulary. Next, use tf.keras.layers.CategoryEncoding to convert the indexes into float32 data appropriate for the model.
preprocessed_inputs = [all_numeric_inputs]
for name, input in inputs.items():
  if input.dtype == tf.float32:
    continue

  lookup = layers.StringLookup(vocabulary=np.unique(features[name]))
  one_hot = layers.CategoryEncoding(num_tokens=lookup.vocabulary_size())

  x = lookup(input)
  x = one_hot(x)
  preprocessed_inputs.append(x)

In [19]:
preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs)

titanic_preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat)

tf.keras.utils.plot_model(model = titanic_preprocessing , rankdir="LR", dpi=72, show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [20]:
tmp_features_dict = {name: np.array(value) 
                         for name, value in features.items()}

# Slice out the first training example and pass it to this preprocessing model, you see the numeric features and string one-hots all concatenated together:
features_dict = {name:values[:1] for name, values in tmp_features_dict.items()}
titanic_preprocessing(features_dict)

<tf.Tensor: shape=(1, 15), dtype=float32, numpy=
array([[-0.8206538,  0.       ,  0.       ,  1.       ,  0.       ,
         0.       ,  0.       ,  0.       ,  0.       ,  1.       ,
         0.       ,  0.       ,  1.       ,  0.       ,  0.       ]],
      dtype=float32)>

In [26]:

body = tf.keras.Sequential([
  layers.Dense(64),
  layers.Dense(1)
])

preprocessed_inputs = titanic_preprocessing(inputs)
result = body(preprocessed_inputs)
result
  

<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'sequential_4')>

In [23]:
titanic_model.fit(x=tmp_features_dict, y=labels, epochs=1)

NotImplementedError: in user code:

    File "c:\Users\Marco\Workspace\phase-2\venv\Lib\site-packages\keras\src\engine\training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\Marco\Workspace\phase-2\venv\Lib\site-packages\keras\src\engine\training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Marco\Workspace\phase-2\venv\Lib\site-packages\keras\src\engine\training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\Marco\Workspace\phase-2\venv\Lib\site-packages\tensorflow_recommenders\models\base.py", line 68, in train_step
        loss = self.compute_loss(inputs, training=True)
    File "c:\Users\Marco\Workspace\phase-2\venv\Lib\site-packages\tensorflow_recommenders\models\base.py", line 61, in compute_loss
        raise NotImplementedError(

    NotImplementedError: Implementers must implement the `compute_loss` method.
